Madras1 committed
Commit: eba56e9 (verified)
Parent: 4c2075e

Upload 42 files

Files changed (1):
  1. app/services/ingestion/wikipedia.py  +56 -36
app/services/ingestion/wikipedia.py CHANGED
@@ -13,6 +13,11 @@ class WikipediaScraper:
     BASE_URL = "https://pt.wikipedia.org"
     API_URL = "https://pt.wikipedia.org/w/api.php"
 
+    # User-Agent is required by the Wikipedia API
+    HEADERS = {
+        "User-Agent": "NumidiumBot/1.0 (https://github.com/numidium; contact@numidium.app) Python/3.11"
+    }
+
     def search(self, query: str, limit: int = 10) -> List[Dict]:
         """
         Searches for articles on Wikipedia
@@ -26,7 +31,12 @@ class WikipediaScraper:
             "format": "json"
         }
 
-        response = requests.get(self.API_URL, params=params, timeout=10)
+        response = requests.get(
+            self.API_URL,
+            params=params,
+            headers=self.HEADERS,
+            timeout=10
+        )
         response.raise_for_status()
         data = response.json()
 
@@ -47,43 +57,53 @@ class WikipediaScraper:
         """
         Fetches complete information about an article
         """
-        params = {
-            "action": "query",
-            "titles": title,
-            "prop": "extracts|pageimages|coordinates|categories",
-            "exintro": True,
-            "explaintext": True,
-            "pithumbsize": 300,
-            "format": "json"
-        }
-
-        response = requests.get(self.API_URL, params=params)
-        data = response.json()
-
-        pages = data.get("query", {}).get("pages", {})
-        for page_id, page in pages.items():
-            if page_id == "-1":
-                return None
-
-            result = {
-                "title": page.get("title"),
-                "extract": page.get("extract"),
-                "pageid": page.get("pageid"),
-                "url": f"{self.BASE_URL}/wiki/{page.get('title', '').replace(' ', '_')}",
-                "thumbnail": page.get("thumbnail", {}).get("source"),
-                "categories": [c["title"].replace("Categoria:", "")
-                               for c in page.get("categories", [])]
-            }
-
-            # Coordinates, if available
-            if "coordinates" in page:
-                coords = page["coordinates"][0]
-                result["latitude"] = coords.get("lat")
-                result["longitude"] = coords.get("lon")
-
-            return result
-
-        return None
+        try:
+            params = {
+                "action": "query",
+                "titles": title,
+                "prop": "extracts|pageimages|coordinates|categories",
+                "exintro": True,
+                "explaintext": True,
+                "pithumbsize": 300,
+                "format": "json"
+            }
+
+            response = requests.get(
+                self.API_URL,
+                params=params,
+                headers=self.HEADERS,
+                timeout=10
+            )
+            response.raise_for_status()
+            data = response.json()
+
+            pages = data.get("query", {}).get("pages", {})
+            for page_id, page in pages.items():
+                if page_id == "-1":
+                    return None
+
+                result = {
+                    "title": page.get("title"),
+                    "extract": page.get("extract"),
+                    "pageid": page.get("pageid"),
+                    "url": f"{self.BASE_URL}/wiki/{page.get('title', '').replace(' ', '_')}",
+                    "thumbnail": page.get("thumbnail", {}).get("source"),
+                    "categories": [c["title"].replace("Categoria:", "")
+                                   for c in page.get("categories", [])]
+                }
+
+                # Coordinates, if available
+                if "coordinates" in page:
+                    coords = page["coordinates"][0]
+                    result["latitude"] = coords.get("lat")
+                    result["longitude"] = coords.get("lon")
+
+                return result
+
+            return None
+        except Exception as e:
+            print(f"Wikipedia article error: {e}")
+            return None
 
     def get_infobox(self, title: str) -> Dict:
         """
@@ -91,7 +111,7 @@ class WikipediaScraper:
         """
         try:
             url = f"{self.BASE_URL}/wiki/{title.replace(' ', '_')}"
-            response = requests.get(url, timeout=10)
+            response = requests.get(url, headers=self.HEADERS, timeout=10)
             soup = BeautifulSoup(response.text, "html.parser")
 
             infobox = soup.find("table", class_="infobox")
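
Note on the change: Wikimedia asks API clients to identify themselves with a descriptive User-Agent (tool name plus a contact address), and requests using a generic default agent can be throttled or refused, which is what the new HEADERS constant addresses; get_article() additionally gains a timeout, raise_for_status(), and a broad try/except so a failed lookup returns None instead of raising. A minimal usage sketch of the updated class, assuming WikipediaScraper takes no constructor arguments (nothing in this diff suggests otherwise); the article titles are placeholder examples:

    scraper = WikipediaScraper()

    # search() surfaces HTTP errors to the caller via raise_for_status().
    for hit in scraper.search("São Paulo", limit=5):
        print(hit)

    # get_article() catches its own exceptions and returns None on a
    # missing page ("-1" page id) or on any request error.
    article = scraper.get_article("Brasil")
    if article is not None:
        print(article["title"], article["url"], article.get("latitude"))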