Spaces:
Running
Running
fix searching for wikipedia pages
Browse files- utils/wikipedia_extractor.py +63 -29
utils/wikipedia_extractor.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import wikipedia
|
| 2 |
-
from typing import List, Dict, Any
|
| 3 |
import urllib.parse
|
| 4 |
import requests
|
| 5 |
import xml.etree.ElementTree as ET
|
|
@@ -42,44 +42,79 @@ def extract_wiki_id(url: str) -> str:
|
|
| 42 |
if '#' in wiki_id:
|
| 43 |
wiki_id = wiki_id.split('#')[0]
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
return wiki_id
|
| 46 |
|
| 47 |
# Function to get all details dictionary from a given wiki id
|
| 48 |
-
def get_wiki_details(
|
| 49 |
"""
|
| 50 |
-
Gets all details dictionary from a given wiki id.
|
| 51 |
|
| 52 |
Args:
|
| 53 |
-
|
| 54 |
|
| 55 |
Returns:
|
| 56 |
-
dict: The details dictionary.
|
| 57 |
"""
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
return None
|
| 66 |
-
|
| 67 |
-
# Get the details dictionary
|
| 68 |
-
details = {
|
| 69 |
-
"title": page.title,
|
| 70 |
-
"wiki_xml": wiki_xml,
|
| 71 |
-
"pageid": page.pageid,
|
| 72 |
-
"url": page.url,
|
| 73 |
-
"content": page.content,
|
| 74 |
-
"summary": page.summary,
|
| 75 |
-
"images": page.images,
|
| 76 |
-
"links": page.links,
|
| 77 |
-
"categories": page.categories,
|
| 78 |
-
"references": page.references,
|
| 79 |
-
"sections": page.sections
|
| 80 |
-
}
|
| 81 |
-
|
| 82 |
-
return details
|
| 83 |
|
| 84 |
# function to get xml data from a given wiki id
|
| 85 |
def get_wiki_xml(page_title):
|
|
@@ -136,4 +171,3 @@ def split_content_into_sections(content: str, content_format: str=None) -> List[
|
|
| 136 |
sections_dict[section_name] = section_content
|
| 137 |
|
| 138 |
return sections_dict
|
| 139 |
-
|
|
|
|
| 1 |
import wikipedia
|
| 2 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 3 |
import urllib.parse
|
| 4 |
import requests
|
| 5 |
import xml.etree.ElementTree as ET
|
|
|
|
| 42 |
if '#' in wiki_id:
|
| 43 |
wiki_id = wiki_id.split('#')[0]
|
| 44 |
|
| 45 |
+
# URL decode the wiki id to handle special characters
|
| 46 |
+
wiki_id = urllib.parse.unquote(wiki_id)
|
| 47 |
+
|
| 48 |
+
# Replace underscores with spaces as Wikipedia API expects spaces
|
| 49 |
+
wiki_id = wiki_id.replace('_', ' ')
|
| 50 |
+
|
| 51 |
return wiki_id
|
| 52 |
|
| 53 |
# Function to get all details dictionary from a given wiki id
|
| 54 |
+
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets all details dictionary from a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id (page title) or full Wikipedia
            URL to get the details from.

    Returns:
        dict: The details dictionary (title, wiki_xml, pageid, url,
            content, summary, images, links, categories, references,
            sections) or None if there was an error.
    """
    try:
        # Check if input is a URL and extract wiki_id if it is
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url

        # Get the page object; auto_suggest=False prevents the library
        # from silently substituting a loosely-related title.
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            # If direct page lookup fails, try search
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None

            # Use the first (best-ranked) search result
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None

        # Fetch the raw XML for the resolved title; bail out on error or
        # empty payload.
        wiki_xml, has_error = get_wiki_xml(page.title)
        if has_error or not wiki_xml:
            print(f"Error fetching XML data: {has_error}")
            return None

        # Get the details dictionary
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections
        }

        return details

    except wikipedia.exceptions.DisambiguationError as e:
        # The title matches several pages; the exception message already
        # lists the candidate options.
        print(f"Disambiguation error: {e}")
        # Fix: was an f-string with no placeholders (F541).
        print("Please specify one of the options above.")
        return None
    except Exception as e:
        # Broad catch is deliberate: this helper is best-effort and
        # reports any failure to the caller as None. Fix: dropped the
        # redundant str() call inside the f-string.
        print(f"An error occurred: {e}")
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
# function to get xml data from a given wiki id
|
| 120 |
def get_wiki_xml(page_title):
|
|
|
|
| 171 |
sections_dict[section_name] = section_content
|
| 172 |
|
| 173 |
return sections_dict
|
|
|