
Wikipedia:GPT lister of discussion summaries from interwikis


The following Python 3 program is based on artificial intelligence. It walks through the interwikis of a given Wikipedia article (write the article's name directly into the program; it is a "bare" program with no user interface) and prints a table with an overview and summary of all discussion threads on all interwikis of that article.

Because the program calls a GPT-series model through the API, you must have purchased the right to use the model this way on the OpenAI website (https://openai.com/api/).
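As a safer alternative to hardcoding the key into the source file, it can be taken from an environment variable. A minimal sketch (it assumes the key has been exported as OPENAI_API_KEY, which the openai client also reads by default):

import os
from openai import OpenAI

# assumption: the key was exported beforehand, e.g. export OPENAI_API_KEY=sk-...
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# OpenAI() with no argument picks up OPENAI_API_KEY on its own as well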

import requests
import re
import csv
import json
from openai import OpenAI
from collections import defaultdict

client = OpenAI(api_key=my_api_key)   # replace my_api_key with a key from OpenAI - it can be purchased on their website

"""
Program shrne obsah diskusních stránek článku a jeho interwiki   
"""

# inputs
article_title = "John Woo"  # name of the input article, in quotes
language = "cs"  # code of the Wikipedia where the article is located
outlang = "Czech"  # language in which the summaries are requested

def get_interwiki(title, language='en', include_article_language=True):
    """Creates a list of interwikis for a given article"""
    url = f"https://{language}.wikipedia.org/w/api.php?action=query&prop=langlinks&format=json&titles={title}&lllimit=500"
    
    response = requests.get(url)
    data = response.json()
    interwiki = []
    if include_article_language:
        interwiki = [{'language': language, 'article': title}]
    page_id = list(data['query']['pages'].keys())[0]
        
    if 'langlinks' in data['query']['pages'][page_id]:
        langlinks = data['query']['pages'][page_id]['langlinks']
        for link in langlinks:
            interwiki.append({"language": link['lang'], "article": link['*']})
            
    return interwiki
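
# Illustrative shape of the result (the values are assumed, not fetched):
# get_interwiki("John Woo", language="cs")
# -> [{'language': 'cs', 'article': 'John Woo'},
#     {'language': 'en', 'article': 'John Woo'}, ...]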


def get_discussion_page_source(title, language):
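    """Returns the raw wikitext of the article's Talk: page, or an empty string on failure"""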
    url = f"https://{language}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": f"Talk:{title}",
        "rvslots": "*",
        "rvprop": "content",
        "format": "json"
    }

    response = requests.get(url, params=params)

    if response.status_code != 200:
        return ""

    data = response.json()
    pages = data.get("query", {}).get("pages", {})
    for page_id, page_data in pages.items():
        if "revisions" in page_data:
            return page_data["revisions"][0]["slots"]["main"]["*"]
    return ""


def split_chapters(wikitext):
    """
    Splits the wikitext of a Wikipedia page into == level 2 == chapters and returns them as a dictionary
    """
    # Splitting text into lines for easier processing
    lines = wikitext.split('\n')
    
    # Regex pattern to identify headings
    heading_pattern = re.compile(r'^(=+)\s*(.*?)\s*(=+)')

    # Dictionary to store chapter information
    chapters = defaultdict(lambda: {"Text": "", "Length": 0, "Heading": ""})
    
    # Initial variables to keep track of current chapter and its content
    current_chapter = 0
    chapter_text = []
    chapter_heading = ""

    for line in lines:
        heading_match = heading_pattern.match(line)
        if heading_match:
            # If we reach a new heading, save the previous chapter's information
            if chapter_text or chapter_heading:
                chapters[current_chapter]["Text"] = "\n".join(chapter_text)
                chapters[current_chapter]["Length"] = len(chapters[current_chapter]["Text"])
                chapters[current_chapter]["Heading"] = chapter_heading

            # Determine the chapter number by heading level
            level = len(heading_match.group(1))
            if level == 2:
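                # advance the counter only when the current chapter already holds saved text,
                # so that any lead text before the first == heading == stays in chapter 0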
                current_chapter += 1 if chapters[current_chapter]["Text"] else 0
                chapter_heading = heading_match.group(2).strip()
                chapter_text = [line]
            elif level > 2:
                chapter_text.append(line)

        else:
            # Add line to current chapter text
            chapter_text.append(line)

    # Save the last chapter's information
    if chapter_text or chapter_heading:
        chapters[current_chapter]["Text"] = "\n".join(chapter_text)
        chapters[current_chapter]["Length"] = len(chapters[current_chapter]["Text"])
        chapters[current_chapter]["Heading"] = chapter_heading

    return dict(chapters)
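
# Illustrative shape of the result (assumed values):
# {0: {"Text": "== Thread ==\n...", "Length": 42, "Heading": "Thread"},
#  1: {"Text": "...", "Length": 17, "Heading": "..."}}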


def shrn_diskusi(chapters, outlang="Czech"):
    """
    Input: the dictionary from split_chapters and outlang = the language in which to write. Produces a list
    of dictionaries with "Item" (thread number, starting from zero), "Heading" (chapter heading),
    "Translated_heading" (the same in outlang), "Summary" (the summary) and "Start_date" (the date the
    discussion was started)
    """
    outlist = []
    json_pattern = re.compile(r'{.*?}', re.DOTALL)  # first {...} block in the model output
    for i in range(len(chapters)):
        ch = chapters[i]
        tx = ch['Text'][:3000].replace('"', "'").replace('{', "").replace('}', "")
        hd = ch['Heading'].replace('"', "'").replace('[', "").replace(']', "")
        summary = chapter_summary(tx, hd, outlang)  # call GPT to translate and summarize
        
        json_match = json_pattern.search(summary)

        if json_match:
            json_substring = json_match.group()
            try:
                response = json.loads(json_substring)   # Parse the JSON substring to a Python dictionary
            except json.JSONDecodeError as e:
                print(str(i) + "*** Error: nenaparsovano ***" + json_substring + "\n")
                response = {}
        else:
            print(str(i) + "*** Error: nevznikl JSON ***" + summary + "\n")
            response = {}
        
        response['Item'] = i
        response['Heading_orig'] = ch['Heading']
        response['Text_orig'] = ch['Text']
        response['Summary_output'] = summary
        if len(str(ch['Heading']).strip()) == 0:
            response['Translated_heading'] = "N/A"
            
        outlist.append(response)
        
    return outlist    
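
# Illustrative shape of one item of the result (assumed values):
# {"Translated_heading": "...", "Start_date": "...", "Summary": "...",
#  "Item": 0, "Heading_orig": "...", "Text_orig": "...", "Summary_output": "..."}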


def chapter_summary(wikitext, heading, outlang):
    """From the text of a discussion thread (wikitext) and its heading (heading) produces a JSON in the
    outlang language with the items "Translated_heading" (the chapter heading in outlang), "Start_date"
    and "Summary" (the summary of the thread)"""
    head = heading
    if head == "":
        head = "N/A"
    prompt = f"The goal is to process a Wikipedia discussion: translate its heading into the {outlang} language \
        and summarize the discussion in the same {outlang} language. The original heading is {head}. \
        You will create a JSON (within curly brackets) with three items: \
          - 'Translated_heading', which is the heading in {outlang}, \
          - 'Start_date', which is the earliest date of a signature in the discussion (in {outlang}), and \
          - 'Summary', which is the summary of the content of the discussion, written in {outlang}. \
        If the content of the discussion is only templates and/or tables and/or bot edits, 'Translated_heading' \
        is 'N/A' and 'Summary' is empty. \
        Do not use square brackets, curly brackets or apostrophes in the output except for the purpose of JSON. \
        The discussion is here: \
        {wikitext}"
  
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",  # levnější než gpt-4o
        max_tokens=4096,
        messages=[
            {
                "role": "system",
                "content": "You are a JSON generator. Your output can be only a JSON (within curly brackets {}). Otherwise write nothing."
            },
            {
                "role": "user",
                "content": prompt
            },
        ]
    )
    msgtext = response.choices[0].message.content
    return msgtext
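
# The model is expected to reply with a bare JSON object, e.g. (illustrative):
# {"Translated_heading": "...", "Start_date": "3 May 2021", "Summary": "..."}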


def iw_summaries(article_title, language="cs", outlang="Czech"):
    """Returns, in the outlang language, summaries of the discussions on all interwikis of the article
    article_title from the language Wikipedia"""
    iws = get_interwiki(title=article_title, language=language)
    outlist = []
    for iw in iws:
        disc = get_discussion_page_source(title=iw["article"], language=iw["language"])
        if len(disc) > 30:
            diskuse = split_chapters(disc)  # reuse the wikitext fetched above
            data = shrn_diskusi(diskuse, outlang=outlang)
            for item in data:
                item["Language"] = iw["language"]
                if item["Heading_orig"] == "":
                    item["Article_name"] = "[[:"+iw["language"]+":Talk:"+iw["article"]+"]]"
                else:    
                    item["Article_name"] = "[[:"+iw["language"]+":Talk:"+iw["article"]+"#"+item["Heading_orig"]+"]]"
            outlist += data
    return outlist    


def dictlist_to_wikitable(dict_list, drop_columns=None):
    """
    Converts a list of dictionaries to a Wikimedia wiki table markup.
    
    Args:
        dict_list (list): A list of dictionaries, all with the same structure.
        drop_columns: list of columns to drop from the output table
        
    Returns:
        str: The Wikimedia wiki table markup.
    """
    drop_columns = drop_columns or []  # tolerate None as the default
    if not dict_list:
        return ""  # nothing to report

    # Get the keys from the first dictionary
    keys = list(dict_list[0].keys())
    keys = [key for key in keys if key not in drop_columns]  # drop those not reported
    
    # Start the table markup
    table = "{| class=\"wikitable sortable mw-collapsible mw-collapsed\"\n"
    
    # Add the header row
    table += "! " + " !! ".join(keys) + "\n"
    
    # Add the data rows
    for d in dict_list:
        row = "|-\n| "
        for k in keys:
            try:
                value = str(d[k])
            except KeyError:
                value = ""  # dictionaries may differ in structure
            row += value + " || "
                
        table += row[:-4] + "\n"  # Remove the last " || "
    
    # Close the table markup
    table += "|}"
    
    return table
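
# Illustrative output for two reported columns (assumed values):
# {| class="wikitable sortable mw-collapsible mw-collapsed"
# ! Translated_heading !! Summary
# |-
# | ... || ...
# |}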


diskuse = iw_summaries(article_title, language=language, outlang=outlang)
print(dictlist_to_wikitable(diskuse, drop_columns=["Heading_orig", "Text_orig", "Summary_output"]))