Spaces:
Running
Running
fix searching for wikipedia pages
Browse files- utils/wikipedia_extractor.py +63 -29
utils/wikipedia_extractor.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import wikipedia
|
| 2 |
-
from typing import List, Dict, Any
|
| 3 |
import urllib.parse
|
| 4 |
import requests
|
| 5 |
import xml.etree.ElementTree as ET
|
|
@@ -42,44 +42,79 @@ def extract_wiki_id(url: str) -> str:
|
|
| 42 |
if '#' in wiki_id:
|
| 43 |
wiki_id = wiki_id.split('#')[0]
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
return wiki_id
|
| 46 |
|
| 47 |
# Function to get all details dictionary from a given wiki id
|
| 48 |
-
def get_wiki_details(
|
| 49 |
"""
|
| 50 |
-
Gets all details dictionary from a given wiki id.
|
| 51 |
|
| 52 |
Args:
|
| 53 |
-
|
| 54 |
|
| 55 |
Returns:
|
| 56 |
-
dict: The details dictionary.
|
| 57 |
"""
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
return None
|
| 66 |
-
|
| 67 |
-
# Get the details dictionary
|
| 68 |
-
details = {
|
| 69 |
-
"title": page.title,
|
| 70 |
-
"wiki_xml": wiki_xml,
|
| 71 |
-
"pageid": page.pageid,
|
| 72 |
-
"url": page.url,
|
| 73 |
-
"content": page.content,
|
| 74 |
-
"summary": page.summary,
|
| 75 |
-
"images": page.images,
|
| 76 |
-
"links": page.links,
|
| 77 |
-
"categories": page.categories,
|
| 78 |
-
"references": page.references,
|
| 79 |
-
"sections": page.sections
|
| 80 |
-
}
|
| 81 |
-
|
| 82 |
-
return details
|
| 83 |
|
| 84 |
# function to get xml data from a given wiki id
|
| 85 |
def get_wiki_xml(page_title):
|
|
@@ -136,4 +171,3 @@ def split_content_into_sections(content: str, content_format: str=None) -> List[
|
|
| 136 |
sections_dict[section_name] = section_content
|
| 137 |
|
| 138 |
return sections_dict
|
| 139 |
-
|
|
|
|
| 1 |
import wikipedia
|
| 2 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 3 |
import urllib.parse
|
| 4 |
import requests
|
| 5 |
import xml.etree.ElementTree as ET
|
|
|
|
| 42 |
if '#' in wiki_id:
|
| 43 |
wiki_id = wiki_id.split('#')[0]
|
| 44 |
|
| 45 |
+
# URL decode the wiki id to handle special characters
|
| 46 |
+
wiki_id = urllib.parse.unquote(wiki_id)
|
| 47 |
+
|
| 48 |
+
# Replace underscores with spaces as Wikipedia API expects spaces
|
| 49 |
+
wiki_id = wiki_id.replace('_', ' ')
|
| 50 |
+
|
| 51 |
return wiki_id
|
| 52 |
|
| 53 |
# Function to get all details dictionary from a given wiki id
|
| 54 |
+
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets all details dictionary from a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id (page title) or full Wikipedia
            URL to get the details from.

    Returns:
        dict: The details dictionary (title, wiki_xml, pageid, url,
            content, summary, images, links, categories, references,
            sections) or None if there was an error.
    """
    try:
        # Check if input is a URL and extract wiki_id if it is
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url

        # Get the page object; auto_suggest=False prevents the library
        # from silently substituting a loosely-related title.
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            # If direct page lookup fails, try search
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None

            # Use the first (best-ranked) search result
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None

        # Fetch the raw XML for the resolved title; bail out on error or
        # empty payload.
        wiki_xml, has_error = get_wiki_xml(page.title)
        if has_error or not wiki_xml:
            print(f"Error fetching XML data: {has_error}")
            return None

        # Get the details dictionary
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections
        }

        return details

    except wikipedia.exceptions.DisambiguationError as e:
        # The title matches several pages; the exception message already
        # lists the candidate options.
        print(f"Disambiguation error: {e}")
        # Fix: was an f-string with no placeholders (F541).
        print("Please specify one of the options above.")
        return None
    except Exception as e:
        # Broad catch is deliberate: this helper is best-effort and
        # reports any failure to the caller as None. Fix: dropped the
        # redundant str() call inside the f-string.
        print(f"An error occurred: {e}")
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
# function to get xml data from a given wiki id
|
| 120 |
def get_wiki_xml(page_title):
|
|
|
|
| 171 |
sections_dict[section_name] = section_content
|
| 172 |
|
| 173 |
return sections_dict
|
|
|