Wikipedie:GPT vypisovač informací a referencí z interwiki
Vzhled
Následující program v Pythonu 3 je založený na umělé inteligenci. Zadáte článek české Wikipedie, který má interwiki (jeho jméno zapište přímo do programu, jde o "holý" program bez uživatelského rozhraní). Program vypíše seznam informací, které v interwiki nalezl, a případné reference. Vyžaduje přístup k placeným API firmy OpenAI, což je potřeba si zakoupit na jejich webu.
Ukázky výstupu jsou na Diskuse:Acidifikace a Diskuse:Agama australská.
import requests
import re
import pprint
import csv
from openai import OpenAI
from io import StringIO
from collections import defaultdict
# NOTE: `my_api_key` is a placeholder that is not defined anywhere in this file;
# replace it with your own OpenAI API key (it can be purchased on the OpenAI website).
client = OpenAI(api_key=my_api_key)
# The string below is the program description (kept in Czech). In short: the program
# finds the n longest interwiki versions of the given article, lists the facts found
# in them, rates their importance with respect to the article's topic, and gives the
# supporting references (if any) and the chapter they occur in. The output is a
# wikitable that can be pasted into a sandbox or onto the article's talk page.
"""
Program najde n nejdelších interwiki zadaného článku. Z nich vypíše informace,
zhodnotí je podle důležitosti vzhledem k tématu článku,
uvede k nim příslušné zdroje, pokud tam jsou, a kapitolu, kde se vyskytují.
Výpis je v podobě wikitabulky, kterou si můžete vložit na pískoviště nebo
na diskusní stránku analyzovaného článku.
"""
# Inputs
article_title = "Jan Čep" # title of the input article, in quotes
language = "cs" # language code of the Wikipedia the article is on
n = 3 # number of the largest interwiki articles to take into account
output_unsourced_facts = False # whether to output unsourced facts as well (True) or not (False)
output_all_columns = False # whether to output all columns (True) or only the most important ones (False)
def get_wikipedia_source(article_title, language="cs"):
    """Fetch the wikitext source of a Wikipedia article through the MediaWiki API.

    Args:
        article_title: Title of the article to fetch.
        language: Language code used as the subdomain of wikipedia.org.

    Returns:
        The "*" content of the "main" slot of the first revision in the
        response, or an empty string if that content is absent.

    Raises:
        Exception: If the HTTP status is not 200, the response contains no
            pages, or the page has no revisions.
        requests.exceptions.RequestException: On network failures, including
            the request timing out.
    """
    endpoint = f"https://{language}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": article_title,
        "rvprop": "content",
        "rvslots": "main",
    }
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(endpoint, params=params, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Error fetching data from Wikipedia API: {response.status_code}")

    pages = response.json().get("query", {}).get("pages", {})
    if not pages:
        raise Exception("No pages found or an error occurred.")

    # "pages" is keyed by a dynamic page ID; only one title was requested,
    # so the first entry is the one we want.
    page = next(iter(pages.values()))
    revisions = page.get("revisions", [])
    if not revisions:
        raise Exception("No revisions found for this page.")
    return revisions[0].get("slots", {}).get("main", {}).get("*", "")
def split_chapters(wikitext):
    """Split the wikitext of an article into chapters at ``== level 2 ==`` headings.

    Text before the first level-2 heading forms chapter 0 with an empty
    heading; if that text is empty, the first headed chapter takes number 0
    instead. Deeper headings (``===`` and more) stay inside the text of the
    chapter they belong to. Each chapter's text starts with its own heading
    line.

    Args:
        wikitext: Source text of the article.

    Returns:
        A dict keyed by consecutive integers starting at 0. Each value is a
        dict with the keys "Text" (chapter source), "Length" (number of
        characters of "Text") and "Heading" (heading text without the ``=``
        markers).
    """
    # No trailing "$": headings followed by whitespace or a comment still match.
    heading_pattern = re.compile(r'^(=+)\s*(.*?)\s*(=+)')

    # Each section is (heading, list of lines); the first one is the lead.
    sections = [("", [])]
    for line in wikitext.split('\n'):
        match = heading_pattern.match(line)
        if match and len(match.group(1)) == 2:
            # An empty lead is not a chapter of its own: drop it so that the
            # first headed chapter gets number 0.
            if not "\n".join(sections[-1][1]):
                sections.pop()
            sections.append((match.group(2).strip(), [line]))
        else:
            # Everything else, including sub-headings and stray lines that
            # merely look like a level-1 heading, is ordinary chapter text.
            sections[-1][1].append(line)

    chapters = {}
    for number, (heading, body) in enumerate(sections):
        text = "\n".join(body)
        chapters[number] = {"Text": text, "Length": len(text), "Heading": heading}
    return chapters
def unapostropher(retezec):
    """Strip whitespace and remove one pair of enclosing apostrophes, if present.

    Args:
        retezec: The string to clean up.

    Returns:
        The stripped string; when it both starts and ends with an apostrophe,
        one apostrophe is removed from each end.
    """
    s = retezec.strip()
    # The length check keeps a lone "'" intact: it is not an enclosing pair.
    if len(s) >= 2 and s[0] == "'" and s[-1] == "'":
        return s[1:-1]
    return s
def _parse_fact_table(table_text, heading):
    """Parse a tab-separated model answer into a dict of fact records keyed 0..k-1."""
    valid_importance = {"Very important", "Important", "Useful", "Unimportant"}
    facts = {}
    # ``table_text`` may be None when the model returned no content.
    for row in csv.reader(StringIO(table_text or ""), delimiter='\t'):
        if len(row) < 4:
            continue  # Ignore malformed (too short or blank) lines
        importance = unapostropher(row[3]).lower().capitalize()
        if importance not in valid_importance:
            print("Error Found in " + heading)
            continue
        facts[len(facts)] = {
            "Fact": unapostropher(row[0]),
            "Fact_cs": unapostropher(row[1]),
            "Reference": unapostropher(row[2]),
            "Importance": importance
        }
    return facts


def reftemplate(chapters, article, language):
    """Extract facts and their references from every chapter of an article.

    For each chapter the text is sent to the chat model through the
    module-level ``client`` and the tab-separated answer is parsed. Rows with
    fewer than four columns are skipped; rows whose importance is not one of
    the four expected values are reported with a printed message and skipped.

    Args:
        chapters: Dict of chapters; each value needs the keys "Text" and
            "Heading".
        article: Title of the article, used as its theme in the prompt.
        language: Language of the article, used in the prompt.

    Returns:
        The same ``chapters`` dict. Every chapter is given a new "Facts" key
        holding a dict that maps 0..k-1 to records with the keys "Fact",
        "Fact_cs", "Reference" and "Importance".
    """
    prompt = """Your task is to extract facts and their references from a chapter of a Wikipedia article.
Try to extract as many facts as possible. If the chapter has no fluent text (it contains tables, pictures,
lists, external links...), you will print nothing.
You will create a tab delimited table. You will create as many rows as needed.
"""
    prompt += f"The first column is the short statement of the fact you have found. Write in the language \
of the article, that is {language}. Be succinct and use as much the original language of the text \
as possible. Write in complete self-sufficient sentences."
    prompt += """
The second column is the same but in Czech language. Translate simply the fist column.
The third column is the reference or references used to support the fact in the text. Copy it as is including
the <ref> tags and/or citation templates like {{Harv|Blust|1999|p=12}}. If the fact is not referenced,
left the column empty.
The fourth column is the importance of the fact regarding the theme of the article: "Very important" if it should
be mentioned even in a very short article about the subject. "Important" if it should be mentioned in an average
Wikipedia article. "Useful" if it belongs in a very detailed account only. "Unimportant" if it has only indirect
relevance.
Do not comment the output nor the references.
"""
    prompt += f"\nThe theme of the article is {article}, its language is {language}. The text of the chapter follows:\n"
    # Chapters are visited in the order they are stored in the dict.
    for chap in chapters.values():
        response = client.chat.completions.create(
            model="gpt-4o",
            max_tokens=4096,
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": """You are a csv generator. Your output can be only a tab separated
table without a heading and without a type description starting from
the first line on. Otherwise write nothing."""
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": prompt + chap["Text"]
                },]
        )
        chap["Facts"] = _parse_fact_table(
            response.choices[0].message.content, chap["Heading"]
        )
    return chapters
def factlist(facts, article_title, language):
    """Flatten the per-chapter facts into a single list of dictionaries.

    Args:
        facts: Dict of chapters keyed by chapter number. Each chapter needs
            the keys "Facts" (dict mapping fact number to a fact record),
            "Heading" and "Length".
        article_title: Title of the article the facts come from.
        language: Language of that article.

    Returns:
        A list with one new dict per fact: the original fact record extended
        by "Chapter_number", "Fact_number", "Heading", "Chapter_length",
        "Article_title" and "Language". The input records are not modified.
    """
    faclist = []
    for chapter_number, chap in facts.items():
        for fact_number, fact in chap["Facts"].items():
            # Build a copy so the caller's fact records stay untouched.
            faclist.append({
                **fact,
                "Chapter_number": chapter_number,
                "Fact_number": fact_number,
                "Heading": chap["Heading"],
                "Chapter_length": chap["Length"],
                "Article_title": article_title,
                "Language": language,
            })
    return faclist
def list_facts(article_title, language):
    """Return the facts, with their references and chapters, of one Wikipedia article.

    The article source is downloaded, split into chapters, run through the
    fact extraction, and the result is flattened into a list of dictionaries.
    """
    chapters = split_chapters(get_wikipedia_source(article_title, language))
    chapters_with_facts = reftemplate(chapters, article_title, language)
    return factlist(chapters_with_facts, article_title, language)
def get_article_length(article_name, language='en'):
    """Return the length of the plain-text extract of a Wikipedia article.

    Args:
        article_name: Title of the article.
        language: Language code used as the subdomain of wikipedia.org.

    Returns:
        Number of characters of the extract returned by the API, or None
        when the response contains no extract for the page.
    """
    endpoint = f"https://{language}.wikipedia.org/w/api.php"
    # Passing the title through ``params`` lets requests URL-encode it, so
    # titles containing "&", "#", "+" or "?" no longer corrupt the query.
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": article_name,
        "exlimit": 1,
        "explaintext": 1,
    }
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(endpoint, params=params, timeout=30)
    data = response.json()
    # "pages" is keyed by a dynamic page ID; only one title was requested.
    page = next(iter(data['query']['pages'].values()))
    if 'extract' in page:
        return len(page['extract'])
    return None
def get_interwiki(article_name, language='en', n=3, include_article_language = False):
    """List the n interwiki versions of an article that have the longest text.

    Args:
        article_name: Title of the article.
        language: Language code of the Wikipedia the article is on.
        n: Maximum number of entries to return.
        include_article_language: Whether the article itself competes with
            its interwiki versions for a place in the result.

    Returns:
        A list of at most n dicts with the keys "language", "article" and
        "length", sorted from the longest article to the shortest. "length"
        is whatever ``get_article_length`` returned and may be None.
    """
    endpoint = f"https://{language}.wikipedia.org/w/api.php"
    # Passing the title through ``params`` lets requests URL-encode it, so
    # titles containing "&", "#", "+" or "?" no longer corrupt the query.
    params = {
        "action": "query",
        "prop": "langlinks",
        "format": "json",
        "titles": article_name,
        "lllimit": 500,
    }
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(endpoint, params=params, timeout=30)
    data = response.json()

    interwiki = []
    if include_article_language:
        interwiki.append({'language': language, 'article': article_name,
                          'length': get_article_length(article_name, language)})

    # "pages" is keyed by a dynamic page ID; only one title was requested.
    page = next(iter(data['query']['pages'].values()))
    for link in page.get('langlinks', []):
        interwiki.append({"language": link['lang'], "article": link['*'],
                          "length": get_article_length(link['*'], link['lang'])})

    # An unknown length (None) sorts as 0; comparing None with int would raise.
    return sorted(interwiki, key=lambda x: x['length'] or 0, reverse=True)[:n]
def fakta(article_title, language="cs", n=3):
    """Collect the facts from the n most extensive interwiki versions of an article."""
    collected = []
    for link in get_interwiki(article_title, language, n):
        collected.extend(list_facts(link['article'], link['language']))
    return collected
def dictlist_to_wikitable(dict_list, unsourced=True, drop_columns=()):
    """Convert a list of dictionaries to MediaWiki table markup.

    Args:
        dict_list (list): Dictionaries that all share the same keys; the keys
            of the first one become the column headers. May be empty.
        unsourced: Whether to keep rows whose "Reference" value is empty or
            missing.
        drop_columns: Names of the columns to leave out of the table.

    Returns:
        str: The wiki table markup. For an empty ``dict_list`` the table has
        neither a header nor rows.
    """
    lines = ["{| class=\"wikitable sortable\""]
    if dict_list:
        keys = [key for key in dict_list[0] if key not in drop_columns]
        lines.append("! " + " !! ".join(keys))
        for record in dict_list:
            # The filter looks at the record itself, so it also works when
            # the "Reference" column is among the dropped ones.
            if not unsourced and not record.get("Reference"):
                continue
            cells = []
            for key in keys:
                value = str(record[key])
                if key.lower() == "reference" and value:
                    # Show the reference markup literally instead of rendering it.
                    value = "<nowiki>{}</nowiki>".format(value)
                cells.append(value)
            lines.append("|-\n| " + " || ".join(cells))
    lines.append("|}")
    return "\n".join(lines)
# Columns left out of the compact output; nothing is dropped when
# output_all_columns is set.
drops = [] if output_all_columns else ["Fact", "Chapter_number", "Fact_number", "Chapter_length", "Article_title"]
f = fakta(article_title, language, n)
# Emit the collected facts as a wikitable
print(dictlist_to_wikitable(f, unsourced=output_unsourced_facts, drop_columns=drops))