Upload 42 files

Files changed:
- app/services/ingestion/wikipedia.py  +48 -39
- requirements.txt  +0 -1

app/services/ingestion/wikipedia.py
CHANGED
@@ -17,26 +17,31 @@ class WikipediaScraper:
         """
         Searches for articles on Wikipedia
         """
+        try:
+            params = {
+                "action": "query",
+                "list": "search",
+                "srsearch": query,
+                "srlimit": limit,
+                "format": "json"
+            }
+
+            response = requests.get(self.API_URL, params=params, timeout=10)
+            response.raise_for_status()
+            data = response.json()
+
+            results = []
+            for item in data.get("query", {}).get("search", []):
+                results.append({
+                    "title": item["title"],
+                    "snippet": BeautifulSoup(item["snippet"], "html.parser").get_text(),
+                    "pageid": item["pageid"]
+                })
+
+            return results
+        except Exception as e:
+            print(f"Wikipedia search error: {e}")
+            return []

     def get_article(self, title: str) -> Optional[Dict]:
         """

@@ -84,26 +89,30 @@ class WikipediaScraper:
         """
         Attempts to extract structured data from an article's infobox
         """
+        try:
+            url = f"{self.BASE_URL}/wiki/{title.replace(' ', '_')}"
+            response = requests.get(url, timeout=10)
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            infobox = soup.find("table", class_="infobox")
+            if not infobox:
+                return {}
+
+            data = {}
+            for row in infobox.find_all("tr"):
+                header = row.find("th")
+                cell = row.find("td")
+                if header and cell:
+                    key = header.get_text(strip=True)
+                    value = cell.get_text(strip=True)
+                    # Clean up the value
+                    value = re.sub(r'\[\d+\]', '', value)  # Remove references
+                    data[key] = value
+
+            return data
+        except Exception as e:
+            print(f"Infobox error: {e}")
+            return {}

     def scrape_person(self, name: str) -> Optional[Dict]:
         """
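For context, a minimal sketch of how the updated scraper might be exercised. The import path and class name come from this diff, but the no-argument constructor and the `search` method's name and signature are assumptions (only the body that builds the `srsearch`/`srlimit` params is visible here); `get_article` and `scrape_person` are declared in the diff, and the new try/except blocks mean callers now get `[]` or `{}` on failure instead of an exception.

# Hypothetical usage sketch -- the constructor and search() name/signature
# are assumptions; only the method body appears in this diff.
from app.services.ingestion.wikipedia import WikipediaScraper

scraper = WikipediaScraper()

# Each hit is a dict with "title", "snippet", "pageid"; on any request or
# parse error the new code logs the error and returns [].
for hit in scraper.search("Ada Lovelace", limit=3):
    print(hit["pageid"], hit["title"])

# get_article(title) and scrape_person(name) are declared in this diff;
# the infobox extractor (second hunk) now returns {} instead of raising.
article = scraper.get_article("Ada Lovelace")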
requirements.txt
CHANGED

@@ -5,7 +5,6 @@ pydantic==2.5.2
 pydantic-settings==2.1.0
 requests==2.31.0
 beautifulsoup4==4.12.2
-lxml==4.9.3
 httpx==0.25.2
 python-multipart==0.0.6
 aiohttp==3.9.1
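Dropping the lxml pin is presumably safe because the scraper now asks BeautifulSoup for the stdlib backend explicitly ("html.parser"), which needs no third-party parser, whereas requesting "lxml" without the package installed raises bs4.FeatureNotFound. A quick sanity check, assuming no other module in the upload still imports lxml:

from bs4 import BeautifulSoup

# "html.parser" is Python's built-in parser, so no lxml install is needed.
row = "<table class='infobox'><tr><th>Born</th><td>1815<sup>[1]</sup></td></tr></table>"
soup = BeautifulSoup(row, "html.parser")
print(soup.find("td").get_text(strip=True))  # "1815[1]" -- the [1] is what re.sub strips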