Upload 42 files

Files changed:
- app/services/ingestion/wikipedia.py  +48 -39
- requirements.txt  +0 -1

app/services/ingestion/wikipedia.py
CHANGED
@@ -17,26 +17,31 @@ class WikipediaScraper:
         """
         Searches for articles on Wikipedia
         """
+        try:
+            params = {
+                "action": "query",
+                "list": "search",
+                "srsearch": query,
+                "srlimit": limit,
+                "format": "json"
+            }
+
+            response = requests.get(self.API_URL, params=params, timeout=10)
+            response.raise_for_status()
+            data = response.json()
+
+            results = []
+            for item in data.get("query", {}).get("search", []):
+                results.append({
+                    "title": item["title"],
+                    "snippet": BeautifulSoup(item["snippet"], "html.parser").get_text(),
+                    "pageid": item["pageid"]
+                })
+
+            return results
+        except Exception as e:
+            print(f"Wikipedia search error: {e}")
+            return []

     def get_article(self, title: str) -> Optional[Dict]:
         """

@@ -84,26 +89,30 @@ class WikipediaScraper:
         """
         Attempts to extract structured data from an article's infobox
         """
+        try:
+            url = f"{self.BASE_URL}/wiki/{title.replace(' ', '_')}"
+            response = requests.get(url, timeout=10)
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            infobox = soup.find("table", class_="infobox")
+            if not infobox:
+                return {}
+
+            data = {}
+            for row in infobox.find_all("tr"):
+                header = row.find("th")
+                cell = row.find("td")
+                if header and cell:
+                    key = header.get_text(strip=True)
+                    value = cell.get_text(strip=True)
+                    # Clean up the value
+                    value = re.sub(r'\[\d+\]', '', value)  # Remove references
+                    data[key] = value
+
+            return data
+        except Exception as e:
+            print(f"Infobox error: {e}")
+            return {}

     def scrape_person(self, name: str) -> Optional[Dict]:
         """
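For context, a minimal sketch of how the updated scraper might be exercised. The import path and class name come from this diff, but the no-argument constructor and the `search` method's name and signature are assumptions (only the body that builds the `srsearch`/`srlimit` params is visible here); `get_article` and `scrape_person` are declared in the diff, and the new try/except blocks mean callers now get `[]` or `{}` on failure instead of an exception.

# Hypothetical usage sketch -- the constructor and search() name/signature
# are assumptions; only the method body appears in this diff.
from app.services.ingestion.wikipedia import WikipediaScraper

scraper = WikipediaScraper()

# Each hit is a dict with "title", "snippet", "pageid"; on any request or
# parse error the new code logs the error and returns [].
for hit in scraper.search("Ada Lovelace", limit=3):
    print(hit["pageid"], hit["title"])

# get_article(title) and scrape_person(name) are declared in this diff;
# the infobox extractor (second hunk) now returns {} instead of raising.
article = scraper.get_article("Ada Lovelace")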
requirements.txt
CHANGED

@@ -5,7 +5,6 @@ pydantic==2.5.2
 pydantic-settings==2.1.0
 requests==2.31.0
 beautifulsoup4==4.12.2
-lxml==4.9.3
 httpx==0.25.2
 python-multipart==0.0.6
 aiohttp==3.9.1
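Dropping the lxml pin is presumably safe because the scraper now asks BeautifulSoup for the stdlib backend explicitly ("html.parser"), which needs no third-party parser, whereas requesting "lxml" without the package installed raises bs4.FeatureNotFound. A quick sanity check, assuming no other module in the upload still imports lxml:

from bs4 import BeautifulSoup

# "html.parser" is Python's built-in parser, so no lxml install is needed.
row = "<table class='infobox'><tr><th>Born</th><td>1815<sup>[1]</sup></td></tr></table>"
soup = BeautifulSoup(row, "html.parser")
print(soup.find("td").get_text(strip=True))  # "1815[1]" -- the [1] is what re.sub strips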