Madras1 committed on
Commit
4c2075e
·
verified ·
1 Parent(s): c4a572a

Upload 42 files

Browse files
app/services/ingestion/wikipedia.py CHANGED
@@ -17,26 +17,31 @@ class WikipediaScraper:
17
  """
18
  Busca artigos na Wikipedia
19
  """
20
- params = {
21
- "action": "query",
22
- "list": "search",
23
- "srsearch": query,
24
- "srlimit": limit,
25
- "format": "json"
26
- }
27
-
28
- response = requests.get(self.API_URL, params=params)
29
- data = response.json()
30
-
31
- results = []
32
- for item in data.get("query", {}).get("search", []):
33
- results.append({
34
- "title": item["title"],
35
- "snippet": BeautifulSoup(item["snippet"], "html.parser").get_text(),
36
- "pageid": item["pageid"]
37
- })
38
-
39
- return results
 
 
 
 
 
40
 
41
  def get_article(self, title: str) -> Optional[Dict]:
42
  """
@@ -84,26 +89,30 @@ class WikipediaScraper:
84
  """
85
  Tenta extrair dados estruturados do infobox de um artigo
86
  """
87
- url = f"{self.BASE_URL}/wiki/{title.replace(' ', '_')}"
88
- response = requests.get(url)
89
- soup = BeautifulSoup(response.text, "lxml")
90
-
91
- infobox = soup.find("table", class_="infobox")
92
- if not infobox:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  return {}
94
-
95
- data = {}
96
- for row in infobox.find_all("tr"):
97
- header = row.find("th")
98
- cell = row.find("td")
99
- if header and cell:
100
- key = header.get_text(strip=True)
101
- value = cell.get_text(strip=True)
102
- # Clean up the value
103
- value = re.sub(r'\[\d+\]', '', value) # Remove references
104
- data[key] = value
105
-
106
- return data
107
 
108
  def scrape_person(self, name: str) -> Optional[Dict]:
109
  """
 
17
  """
18
  Busca artigos na Wikipedia
19
  """
20
+ try:
21
+ params = {
22
+ "action": "query",
23
+ "list": "search",
24
+ "srsearch": query,
25
+ "srlimit": limit,
26
+ "format": "json"
27
+ }
28
+
29
+ response = requests.get(self.API_URL, params=params, timeout=10)
30
+ response.raise_for_status()
31
+ data = response.json()
32
+
33
+ results = []
34
+ for item in data.get("query", {}).get("search", []):
35
+ results.append({
36
+ "title": item["title"],
37
+ "snippet": BeautifulSoup(item["snippet"], "html.parser").get_text(),
38
+ "pageid": item["pageid"]
39
+ })
40
+
41
+ return results
42
+ except Exception as e:
43
+ print(f"Wikipedia search error: {e}")
44
+ return []
45
 
46
  def get_article(self, title: str) -> Optional[Dict]:
47
  """
 
89
  """
90
  Tenta extrair dados estruturados do infobox de um artigo
91
  """
92
+ try:
93
+ url = f"{self.BASE_URL}/wiki/{title.replace(' ', '_')}"
94
+ response = requests.get(url, timeout=10)
95
+ soup = BeautifulSoup(response.text, "html.parser")
96
+
97
+ infobox = soup.find("table", class_="infobox")
98
+ if not infobox:
99
+ return {}
100
+
101
+ data = {}
102
+ for row in infobox.find_all("tr"):
103
+ header = row.find("th")
104
+ cell = row.find("td")
105
+ if header and cell:
106
+ key = header.get_text(strip=True)
107
+ value = cell.get_text(strip=True)
108
+ # Clean up the value
109
+ value = re.sub(r'\[\d+\]', '', value) # Remove references
110
+ data[key] = value
111
+
112
+ return data
113
+ except Exception as e:
114
+ print(f"Infobox error: {e}")
115
  return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  def scrape_person(self, name: str) -> Optional[Dict]:
118
  """
requirements.txt CHANGED
@@ -5,7 +5,6 @@ pydantic==2.5.2
5
  pydantic-settings==2.1.0
6
  requests==2.31.0
7
  beautifulsoup4==4.12.2
8
- lxml==4.9.3
9
  httpx==0.25.2
10
  python-multipart==0.0.6
11
  aiohttp==3.9.1
 
5
  pydantic-settings==2.1.0
6
  requests==2.31.0
7
  beautifulsoup4==4.12.2
 
8
  httpx==0.25.2
9
  python-multipart==0.0.6
10
  aiohttp==3.9.1