Spaces:
Sleeping
Sleeping
| import wikipedia | |
| from typing import List, Dict, Any, Optional, Tuple | |
| import urllib.parse | |
| import requests | |
| import xml.etree.ElementTree as ET | |
| import re | |
# Function to extract the wiki id (article title) from a Wikipedia URL
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given url.

    Args:
        url (str): The url to extract the wiki id from.

    Returns:
        str: The extracted wiki id, URL-decoded and with underscores
            replaced by spaces (the form the Wikipedia API expects).

    Raises:
        ValueError: If the URL is not a Wikipedia URL, or contains no
            article title in its path.
    """
    # Validate the url is from wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")
    # Parse the URL. urlparse already separates the query string and the
    # fragment from the path, so no manual '?' / '#' stripping is needed.
    parsed_url = urllib.parse.urlparse(url)
    # The wiki id is the last non-empty path segment. rstrip('/') makes
    # trailing-slash URLs (".../wiki/Foo/") resolve to "Foo" instead of "".
    wiki_id = parsed_url.path.rstrip('/').split('/')[-1]
    if not wiki_id:
        raise ValueError("URL does not contain an article title")
    # URL decode the wiki id to handle special characters (e.g. "C%2B%2B" -> "C++")
    wiki_id = urllib.parse.unquote(wiki_id)
    # Replace underscores with spaces as Wikipedia API expects spaces
    return wiki_id.replace('_', ' ')
# Function to get all details dictionary from a given wiki id or URL
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets all details dictionary from a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id or URL to get the details from.

    Returns:
        dict: The details dictionary (title, raw XML, page metadata,
            content, summary, images, links, categories, references and
            sections), or None if there was an error.
    """
    try:
        # Accept either a full Wikipedia URL or a bare article title.
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url
        # Resolve the page object; fall back to a search when the exact
        # title lookup fails.
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None
            # Use the first search result as the closest match.
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None
        # Fetch the raw revision XML for the resolved title.
        wiki_xml, has_error = get_wiki_xml(page.title)
        if has_error or not wiki_xml:
            print(f"Error fetching XML data: {has_error}")
            return None
        # Assemble everything callers might need into one dictionary.
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections,
        }
        return details
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        # Plain string: nothing to interpolate (was a placeholder-less f-string).
        print("Please specify one of the options above.")
        return None
    except Exception as e:
        # Broad catch at this boundary so one bad lookup never crashes callers;
        # the error is reported and None returned instead.
        print(f"An error occurred: {str(e)}")
        return None
# Function to get raw XML revision data for a page title via the MediaWiki API
def get_wiki_xml(page_title: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
    """
    Fetches the latest revision content for a page in XML format.

    Args:
        page_title (str): The Wikipedia page title to fetch.

    Returns:
        tuple: (xml_content, None) on success, or
            (None, {"error": <message>}) on failure.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"
        # Parameters for XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml",
        }
        # Timeout prevents hanging forever on a stalled connection.
        response = requests.get(url, params=params, timeout=30)
        # Surface HTTP errors (4xx/5xx) instead of returning an error page
        # body as if it were valid XML; caught by the handler below.
        response.raise_for_status()
        return response.text, None
    except Exception as e:
        # NOTE(review): the previous wikipedia.exceptions handlers were dead
        # code — requests.get never raises them — so a single catch-all that
        # preserves the (None, error-dict) contract is sufficient.
        return None, {"error": f"An error occurred: {str(e)}"}
# Function to split content into sections using the == [SECTION NAME] == heading pattern
def split_content_into_sections(content: str, content_format: str = None) -> Dict[str, str]:
    """
    Splits the content into sections using the === [SECTION NAME] === regex pattern.

    Note: any content before the first heading is discarded.

    Args:
        content (str): The content to split.
        content_format (str): Unused; kept for backward compatibility with
            existing callers ("Plain Text" or "XML").

    Returns:
        Dict[str, str]: Mapping of section name to section content.
    """
    sections_dict: Dict[str, str] = {}
    # re.split with a capture group yields
    # [preamble, name1, body1, name2, body2, ...], so headings and their
    # bodies alternate starting at index 1 (length is always odd, making
    # sections[i + 1] safe below).
    sections = re.split(r'={2,}([^=]+)={2,}', content)
    # Iterate heading/body pairs and add them to the dictionary.
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content
    return sections_dict