Spaces:
Sleeping
Sleeping
| import wikipedia | |
| from typing import List, Dict, Any, Optional, Tuple | |
| import urllib.parse | |
| import requests | |
| import xml.etree.ElementTree as ET | |
| import re | |
# Function to extract the wiki id (article title) from a Wikipedia URL
def extract_wiki_id(url: str) -> str:
    """
    Extracts the wiki id from a given url.

    Args:
        url (str): The url to extract the wiki id from.

    Returns:
        str: The extracted wiki id, URL-decoded and with underscores
            replaced by spaces (the form the Wikipedia API expects).

    Raises:
        ValueError: If the URL is not a Wikipedia URL, or contains no
            article title in its path.
    """
    # Validate the url is from wikipedia
    if "wikipedia.org" not in url:
        raise ValueError("URL is not from Wikipedia")
    # Parse the URL. urlparse already separates the query string and the
    # fragment from the path, so no manual '?' / '#' stripping is needed.
    parsed_url = urllib.parse.urlparse(url)
    # The wiki id is the last non-empty path segment. rstrip('/') makes
    # trailing-slash URLs (".../wiki/Foo/") resolve to "Foo" instead of "".
    wiki_id = parsed_url.path.rstrip('/').split('/')[-1]
    if not wiki_id:
        raise ValueError("URL does not contain an article title")
    # URL decode the wiki id to handle special characters (e.g. "C%2B%2B" -> "C++")
    wiki_id = urllib.parse.unquote(wiki_id)
    # Replace underscores with spaces as Wikipedia API expects spaces
    return wiki_id.replace('_', ' ')
# Function to get all details dictionary from a given wiki id or URL
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets all details dictionary from a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id or URL to get the details from.

    Returns:
        dict: The details dictionary (title, raw XML, page metadata,
            content, summary, images, links, categories, references and
            sections), or None if there was an error.
    """
    try:
        # Accept either a full Wikipedia URL or a bare article title.
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url
        # Resolve the page object; fall back to a search when the exact
        # title lookup fails.
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None
            # Use the first search result as the closest match.
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None
        # Fetch the raw revision XML for the resolved title.
        wiki_xml, has_error = get_wiki_xml(page.title)
        if has_error or not wiki_xml:
            print(f"Error fetching XML data: {has_error}")
            return None
        # Assemble everything callers might need into one dictionary.
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections,
        }
        return details
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        # Plain string: nothing to interpolate (was a placeholder-less f-string).
        print("Please specify one of the options above.")
        return None
    except Exception as e:
        # Broad catch at this boundary so one bad lookup never crashes callers;
        # the error is reported and None returned instead.
        print(f"An error occurred: {str(e)}")
        return None
# Function to get raw XML revision data for a page title via the MediaWiki API
def get_wiki_xml(page_title: str) -> Tuple[Optional[str], Optional[Dict[str, str]]]:
    """
    Fetches the latest revision content for a page in XML format.

    Args:
        page_title (str): The Wikipedia page title to fetch.

    Returns:
        tuple: (xml_content, None) on success, or
            (None, {"error": <message>}) on failure.
    """
    try:
        # MediaWiki API endpoint
        url = "https://en.wikipedia.org/w/api.php"
        # Parameters for XML format
        params = {
            "action": "query",
            "titles": page_title,
            "prop": "revisions",
            "rvprop": "content",
            "format": "xml",
        }
        # Timeout prevents hanging forever on a stalled connection.
        response = requests.get(url, params=params, timeout=30)
        # Surface HTTP errors (4xx/5xx) instead of returning an error page
        # body as if it were valid XML; caught by the handler below.
        response.raise_for_status()
        return response.text, None
    except Exception as e:
        # NOTE(review): the previous wikipedia.exceptions handlers were dead
        # code — requests.get never raises them — so a single catch-all that
        # preserves the (None, error-dict) contract is sufficient.
        return None, {"error": f"An error occurred: {str(e)}"}
# Function to split content into sections using the == [SECTION NAME] == heading pattern
def split_content_into_sections(content: str, content_format: str = None) -> Dict[str, str]:
    """
    Splits the content into sections using the === [SECTION NAME] === regex pattern.

    Note: any content before the first heading is discarded.

    Args:
        content (str): The content to split.
        content_format (str): Unused; kept for backward compatibility with
            existing callers ("Plain Text" or "XML").

    Returns:
        Dict[str, str]: Mapping of section name to section content.
    """
    sections_dict: Dict[str, str] = {}
    # re.split with a capture group yields
    # [preamble, name1, body1, name2, body2, ...], so headings and their
    # bodies alternate starting at index 1 (length is always odd, making
    # sections[i + 1] safe below).
    sections = re.split(r'={2,}([^=]+)={2,}', content)
    # Iterate heading/body pairs and add them to the dictionary.
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_content = sections[i + 1].strip()
        sections_dict[section_name] = section_content
    return sections_dict