Upload 42 files
app/services/ingestion/wikipedia.py
@@ -13,6 +13,11 @@ class WikipediaScraper:
     BASE_URL = "https://pt.wikipedia.org"
     API_URL = "https://pt.wikipedia.org/w/api.php"
 
+    # User-Agent required by the Wikipedia API
+    HEADERS = {
+        "User-Agent": "NumidiumBot/1.0 (https://github.com/numidium; contact@numidium.app) Python/3.11"
+    }
+
     def search(self, query: str, limit: int = 10) -> List[Dict]:
         """
         Search for articles on Wikipedia
@@ -26,7 +31,12 @@ class WikipediaScraper:
             "format": "json"
         }
 
-        response = requests.get(self.API_URL, params=params)
+        response = requests.get(
+            self.API_URL,
+            params=params,
+            headers=self.HEADERS,
+            timeout=10
+        )
         response.raise_for_status()
         data = response.json()
 
@@ -47,43 +57,53 @@ class WikipediaScraper:
         """
         Fetch complete information about an article
         """
-
-
-
-
-
-
-
-
-
-
-        response = requests.get(self.API_URL, params=params)
-        data = response.json()
-
-        pages = data.get("query", {}).get("pages", {})
-        for page_id, page in pages.items():
-            if page_id == "-1":
-                return None
-
-            result = {
-                "title": page.get("title"),
-                "extract": page.get("extract"),
-                "pageid": page.get("pageid"),
-                "url": f"{self.BASE_URL}/wiki/{page.get('title', '').replace(' ', '_')}",
-                "thumbnail": page.get("thumbnail", {}).get("source"),
-                "categories": [c["title"].replace("Categoria:", "")
-                               for c in page.get("categories", [])]
-            }
+        try:
+            params = {
+                "action": "query",
+                "titles": title,
+                "prop": "extracts|pageimages|coordinates|categories",
+                "exintro": True,
+                "explaintext": True,
+                "pithumbsize": 300,
+                "format": "json"
+            }
 
-
-
-
-
-
+            response = requests.get(
+                self.API_URL,
+                params=params,
+                headers=self.HEADERS,
+                timeout=10
+            )
+            response.raise_for_status()
+            data = response.json()
 
-
-
-
+            pages = data.get("query", {}).get("pages", {})
+            for page_id, page in pages.items():
+                if page_id == "-1":
+                    return None
+
+                result = {
+                    "title": page.get("title"),
+                    "extract": page.get("extract"),
+                    "pageid": page.get("pageid"),
+                    "url": f"{self.BASE_URL}/wiki/{page.get('title', '').replace(' ', '_')}",
+                    "thumbnail": page.get("thumbnail", {}).get("source"),
+                    "categories": [c["title"].replace("Categoria:", "")
+                                   for c in page.get("categories", [])]
+                }
+
+                # Coordinates, if available
+                if "coordinates" in page:
+                    coords = page["coordinates"][0]
+                    result["latitude"] = coords.get("lat")
+                    result["longitude"] = coords.get("lon")
+
+                return result
+
+            return None
+        except Exception as e:
+            print(f"Wikipedia article error: {e}")
+            return None
 
     def get_infobox(self, title: str) -> Dict:
         """
@@ -91,7 +111,7 @@ class WikipediaScraper:
         """
         try:
             url = f"{self.BASE_URL}/wiki/{title.replace(' ', '_')}"
-            response = requests.get(url, timeout=10)
+            response = requests.get(url, headers=self.HEADERS, timeout=10)
             soup = BeautifulSoup(response.text, "html.parser")
 
             infobox = soup.find("table", class_="infobox")
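For reference, a minimal usage sketch of the scraper after this change. It is not part of the commit: the import path is assumed from the file location, the no-argument constructor is assumed, and "Ouro Preto" is just an example query against pt.wikipedia.org.

# Hypothetical usage sketch; module path and constructor signature are assumptions.
from app.services.ingestion.wikipedia import WikipediaScraper

scraper = WikipediaScraper()

# search() now sends the descriptive User-Agent on every API call, enforces a
# 10-second timeout, and raises requests.HTTPError via raise_for_status().
for hit in scraper.search("Ouro Preto", limit=5):
    print(hit)

# get_infobox() scrapes the rendered article page and now reuses the same HEADERS.
print(scraper.get_infobox("Ouro Preto"))

Sending a descriptive User-Agent with contact details follows Wikimedia's User-Agent policy; requests with a missing or generic agent can be throttled or rejected, which is why the new HEADERS constant is threaded through every requests.get call along with an explicit timeout.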