Ines1994 commited on
Commit
b02a1b6
ยท
verified ยท
1 Parent(s): 5575c81

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +4 -0
  2. scraper_api.py +393 -0
app.py CHANGED
@@ -47,6 +47,10 @@ def render_sidebar():
47
  st.session_state.chat_messages = []
48
  st.session_state.last_sources = []
49
  st.rerun()
 
 
 
 
50
 
51
  # DB Status
52
  try:
 
47
  st.session_state.chat_messages = []
48
  st.session_state.last_sources = []
49
  st.rerun()
50
+
51
+ if st.button("๐Ÿ”„ ุชุญุฏูŠุซ ุงู„ุฐุงูƒุฑุฉ", use_container_width=True, help="ุงุณุชุฎุฏู… ู‡ุฐุง ุงู„ุฒุฑ ุจุนุฏ ุจู†ุงุก ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช"):
52
+ st.cache_resource.clear()
53
+ st.rerun()
54
 
55
  # DB Status
56
  try:
scraper_api.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENA Chatbot — Scraper v4.0 (API-Based)
3
+ Uses the WordPress REST API + RSS feed + sitemap instead of traditional scraping
4
+ Run: python scraper_api.py
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import re
10
+ import time
11
+ import xml.etree.ElementTree as ET
12
+ from urllib.parse import urlparse, unquote
13
+ from html import unescape
14
+
15
+ import requests
16
+ from bs4 import BeautifulSoup
17
+
18
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
19
+ # โš™๏ธ CONFIG
20
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
21
+
22
BASE = "https://www.ena.tn"

# WordPress REST collections, each queried per language and once unfiltered.
# Keys are "<collection>_<variant>", ordered posts_* first, then pages_*.
API_ENDPOINTS = {
    f"{collection}_{variant}": f"{BASE}/wp-json/wp/v2/{collection}?per_page=100{query}"
    for collection in ("posts", "pages")
    for variant, query in (("ar", "&lang=ar"), ("fr", "&lang=fr"), ("all", ""))
}

# Site-wide feed followed by the per-language feeds.
RSS_FEEDS = [f"{BASE}{prefix}/feed/" for prefix in ("", "/ar", "/fr")]

# First page of the post and page sitemaps for each language.
SITEMAPS = [
    f"{BASE}/{lang}/wp-sitemap-posts-{kind}-1.xml"
    for lang in ("ar", "fr")
    for kind in ("post", "page")
]

# Important pages fetched directly by scraping (they do not show up in the API).
PRIORITY_PAGES = [
    f"{BASE}{path}"
    for path in (
        "/ar/concours-ar/cycle-superieur-arr/concours-entree-cycle-superieur-ar/",
        "/ar/concours-ar/informations-generales-ar/",
        "/ar/concours-ar/cycle-moyen-ar/entree-au-cycle-de-formation-des-cadres-moyens-ar/",
        "/ar/concours-ar/agents-categorie-a3-ar/",
        "/ar/preparation-au-concours-ar/",
        "/fr/concours/cycle-superieur/le-concours-dentree-au-cycle-superieur/",
        "/fr/concours/informations-generales/",
        "/fr/concours/cycle-moyen/concours-dentree-au-cycle-de-formation-des-cadres-moyens-de-la-sous-categorie-a2-2/",
        "/fr/concours/agents-de-la-sous-categorie-a3/",
        "/ar/formation-continue-ar/formation-continue-a-distance-et-presentielle-ar/",
        "/fr/formation-continue/formation-continue-a-distance-et-presentielle/",
        "/ar/inscription2026/",
        "/ar/ouverturefad2026/",
        "/ar/fad2026/",
    )
]

# Headers sent with every HTTP request made by this script.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ENA-Chatbot/4.0; +https://www.ena.tn)",
    "Accept": "application/json, text/html",
}

# URL substring -> category label. Lookup walks the entries in insertion
# order and stops at the first substring found, so order matters.
CATS = {
    "/concours/": "concours_fr",
    "/concours-ar": "concours_ar",
    "/ar/concours": "concours_ar",
    "/formation/": "formation_fr",
    "/ar/formation": "formation_ar",
    "/formation-continue": "formation_continue",
    "/gouvernance/": "gouvernance",
    "/actualites/": "news_fr",
    "/actualites-ar/": "news_ar",
    "/leadership": "leadership",
    "/inscription": "inscription",
    "/fad": "fad",
}

# Slug/title markers of boilerplate or shop pages that must not be indexed.
SKIP_SLUGS = [
    "page-dexemple",
    "sample-page",
    "politique-de-confidentialite",
    "shop",
    "cart",
    "checkout",
    "my-account",
    "woocommerce",
    "default-kit",
    "elementor",
    "log-file",
]
89
+
90
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
91
+ # ๐Ÿ› ๏ธ HELPERS
92
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
93
+
94
def get_category(url: str) -> str:
    """Return the category label for *url*.

    The lower-cased URL is tested against each substring key of ``CATS`` in
    insertion order. The label of the first key found is returned, or
    ``"other"`` when no key occurs in the URL.
    """
    lowered = url.lower()
    return next(
        (label for fragment, label in CATS.items() if fragment in lowered),
        "other",
    )
100
+
101
def get_lang(url: str) -> str:
    """Return ``"ar"`` for Arabic URLs and ``"fr"`` for everything else.

    A URL counts as Arabic when its lower-cased path contains an ``/ar/``
    segment or is exactly ``/ar``. Only the path is inspected, so the host
    and query string never influence the result.
    """
    path = urlparse(url.lower()).path
    # Compare against the whole segment: a plain prefix test would also
    # classify paths such as "/archives/" or "/articles/" as Arabic.
    if "/ar/" in path or path == "/ar":
        return "ar"
    return "fr"
106
+
107
def clean_html(html_text: str) -> str:
    """Strip HTML tags from *html_text* and return tidied plain text.

    Falsy input yields an empty string. Otherwise the markup is parsed with
    BeautifulSoup, the text nodes are joined with single spaces, HTML
    entities still present in the text are unescaped, runs of three or more
    whitespace characters are collapsed to one space, and the result is
    trimmed.
    """
    if not html_text:
        return ""
    plain = BeautifulSoup(html_text, "html.parser").get_text(" ", strip=True)
    collapsed = re.sub(r"\s{3,}", " ", unescape(plain))
    return collapsed.strip()
116
+
117
def should_skip(slug: str, title: str) -> bool:
    """Tell whether an item is boilerplate that must not be indexed.

    Args:
        slug: The item's URL slug.
        title: The item's plain-text title.

    Returns:
        ``True`` when any entry of ``SKIP_SLUGS`` occurs in the lower-cased
        slug or title as a standalone token, ``False`` otherwise.
    """
    haystacks = (slug.lower(), title.lower())
    for marker in SKIP_SLUGS:
        # Require non-word characters (or the string edge) on both sides.
        # Plain substring matching would also discard real content, e.g.
        # "shop" inside "workshop" or "cart" inside the French "carte".
        pattern = rf"(?<!\w){re.escape(marker)}(?!\w)"
        if any(re.search(pattern, text) for text in haystacks):
            return True
    return False
121
+
122
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
123
+ # ๐Ÿ“ก 1. WordPress REST API
124
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
125
+
126
def fetch_api(endpoint: str) -> list[dict]:
    """Fetch every page of a WordPress REST collection.

    Pages are requested one after another, starting at 1, until the API
    answers with HTTP 400 (page out of range), an empty page, a page shorter
    than the page size, or an error. Errors are printed and end the loop;
    whatever was collected up to that point is returned.

    Args:
        endpoint: Collection URL, with or without a query string. Its
            ``per_page`` value, when present, is used as the page size.

    Returns:
        The concatenated JSON items of all pages fetched.
    """
    # WordPress serves 10 items per page when per_page is not specified.
    size_match = re.search(r"[?&]per_page=(\d+)", endpoint)
    per_page = int(size_match.group(1)) if size_match else 10
    joiner = "&" if "?" in endpoint else "?"

    all_items: list[dict] = []
    page = 1
    while True:
        url = f"{endpoint}{joiner}page={page}"
        try:
            r = requests.get(url, headers=HEADERS, timeout=20)
            if r.status_code == 400:  # No more pages
                break
            r.raise_for_status()
            items = r.json()
        except (requests.RequestException, ValueError) as e:
            print(f" API error {url[:60]}: {e}")
            break

        # An error object (a JSON dict) must not be mixed into the results:
        # extending a list with a dict would append its keys.
        if not isinstance(items, list):
            print(f" API error {url[:60]}: unexpected payload")
            break
        if not items:
            break
        all_items.extend(items)
        # A short page is the last page.
        if len(items) < per_page:
            break
        page += 1
        time.sleep(0.5)  # respectful delay

    return all_items
152
+
153
def _rendered_text(item: dict, field: str) -> str:
    """Return ``item[field]["rendered"]`` or "" when any part is missing."""
    value = item.get(field)
    if isinstance(value, dict):
        return value.get("rendered") or ""
    return ""


def process_api_items(items: list[dict], content_type: str) -> list[dict]:
    """Convert raw WordPress API items into the unified record format.

    Items that are not dicts, that ``should_skip`` rejects, or whose cleaned
    body (falling back to the excerpt) is shorter than 50 characters are
    left out.

    Args:
        items: JSON objects as returned by the posts/pages endpoints.
        content_type: Value stored in each record's ``content_type`` field.

    Returns:
        One record per kept item with the keys ``page_name``, ``url``,
        ``source``, ``langue``, ``category``, ``content_type``, ``date``,
        ``content`` and ``chars``.
    """
    results = []
    for item in items:
        # Malformed entries are skipped instead of aborting the whole batch.
        if not isinstance(item, dict):
            continue

        slug = item.get("slug") or ""
        title = clean_html(_rendered_text(item, "title"))
        link = item.get("link") or ""
        date = (item.get("date") or "")[:10]  # YYYY-MM-DD

        if should_skip(slug, title):
            continue

        # Prefer the full body; fall back to the excerpt when it is too short.
        content = clean_html(_rendered_text(item, "content"))
        if len(content) < 50:
            content = clean_html(_rendered_text(item, "excerpt"))
        if len(content) < 50:
            continue

        # Put the title at the start of the stored content.
        full_content = f"{title}\n\n{content}" if title else content

        results.append({
            "page_name": unquote(slug),
            "url": link,
            "source": "ena.tn-api",
            "langue": get_lang(link),
            "category": get_category(link),
            "content_type": content_type,
            "date": date,
            "content": full_content,
            "chars": len(full_content),
        })

    return results
191
+
192
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
193
+ # ๐Ÿ“ฐ 2. RSS Feed
194
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
195
+
196
def fetch_rss(feed_url: str) -> list[dict]:
    """Fetch the latest news items from an RSS feed.

    Each ``<item>`` becomes one record, using ``content:encoded`` when it is
    present and the ``description`` otherwise. Items whose cleaned text is
    shorter than 50 characters are dropped. Any error is printed and the
    records collected so far are returned.

    Args:
        feed_url: URL of the RSS feed.

    Returns:
        Records with the keys ``page_name``, ``url``, ``source``, ``langue``,
        ``category``, ``content_type``, ``date``, ``content`` and ``chars``.
    """
    from email.utils import parsedate_to_datetime

    results = []
    try:
        r = requests.get(feed_url, headers=HEADERS, timeout=15)
        r.raise_for_status()
        # Strip stray whitespace around the document before parsing.
        root = ET.fromstring(r.content.strip())

        # RSS namespace
        ns = {"content": "http://purl.org/rss/1.0/modules/content/"}

        for item in root.findall(".//item"):
            title = item.findtext("title", "").strip()
            link = item.findtext("link", "").strip()
            desc = item.findtext("description", "")

            # Store the date as YYYY-MM-DD, like the records built from the
            # REST API; keep the raw prefix if pubDate cannot be parsed.
            raw_date = item.findtext("pubDate", "").strip()
            try:
                date = parsedate_to_datetime(raw_date).date().isoformat()
            except (TypeError, ValueError):
                date = raw_date[:16]

            # Full content if available
            content_encoded = item.find("content:encoded", ns)
            if content_encoded is not None and content_encoded.text:
                body = clean_html(content_encoded.text)
            else:
                body = clean_html(desc)

            if not body or len(body) < 50:
                continue

            full_content = f"{title}\n\n{body}" if title else body
            slug = urlparse(link).path.strip("/").split("/")[-1]

            results.append({
                "page_name": unquote(slug),
                "url": link,
                "source": "ena.tn-rss",
                "langue": get_lang(link),
                "category": "news_ar" if "/ar/" in link else "news_fr",
                "content_type": "news",
                "date": date,
                "content": full_content,
                "chars": len(full_content),
            })

    except Exception as e:
        # Best effort: a broken feed must not abort the whole run.
        print(f" RSS error {feed_url}: {e}")

    return results
244
+
245
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
246
+ # ๐Ÿ—บ๏ธ 3. Sitemap โ†’ Scrape important pages
247
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
248
+
249
def fetch_sitemap_urls(sitemap_url: str) -> list[str]:
    """Return every ``<loc>`` URL listed in the sitemap at *sitemap_url*.

    The XML is downloaded and parsed, and the stripped text of each
    non-empty ``loc`` element in the sitemaps.org namespace is collected.
    On any error the problem is printed and an empty list is returned.
    """
    namespaces = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    try:
        response = requests.get(sitemap_url, headers=HEADERS, timeout=15)
        response.raise_for_status()
        tree = ET.fromstring(response.content)
        return [
            node.text.strip()
            for node in tree.findall(".//sm:loc", namespaces)
            if node.text
        ]
    except Exception as e:
        print(f" Sitemap error {sitemap_url}: {e}")
        return []
263
+
264
def scrape_page(url: str) -> dict | None:
    """Download one page and return its visible text as a record.

    Scripts, styles and page chrome (nav, header, footer, aside, and
    elements whose class looks like a breadcrumb, menu, sidebar or widget)
    are removed before the text is extracted.

    Args:
        url: Page to fetch; redirects are followed.

    Returns:
        A record with the keys ``page_name``, ``url``, ``source``,
        ``langue``, ``category``, ``content_type``, ``date``, ``content``
        and ``chars``, or ``None`` when the request fails or fewer than 100
        characters of text remain.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=20, allow_redirects=True)
        r.raise_for_status()
        # Hand BeautifulSoup the raw bytes so it detects the encoding itself.
        # r.text is decoded with the encoding guessed from the HTTP headers
        # alone, which garbles Arabic pages served without a charset.
        soup = BeautifulSoup(r.content, "html.parser")

        # Remove elements that carry no page content.
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        # NOTE(review): Elementor marks ordinary content blocks with
        # "elementor-widget*" classes, so a bare "widget" match would delete
        # the body of Elementor-built pages. The lookbehind leaves those in
        # place -- confirm against the live markup.
        chrome = re.compile(r"breadcrumb|menu|sidebar|(?<!elementor-)widget", re.I)
        for tag in soup.find_all(class_=chrome):
            tag.decompose()

        text = soup.get_text(" ", strip=True)
        text = re.sub(r"\s{3,}", " ", text)

        if len(text) < 100:
            return None

        slug = urlparse(url).path.strip("/").split("/")[-1]
        return {
            "page_name": unquote(slug),
            "url": url,
            "source": "ena.tn-scrape",
            "langue": get_lang(url),
            "category": get_category(url),
            "content_type": "page",
            "date": "",
            "content": text,
            "chars": len(text),
        }
    except Exception as e:
        # Best effort: one unreachable page must not stop the crawl.
        print(f" skip {url[:60]}: {e}")
        return None
298
+
299
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
300
+ # ๐Ÿš€ MAIN
301
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
302
+
303
if __name__ == "__main__":
    # Script entry point: collect records from the REST API, the RSS feeds,
    # the sitemaps and the priority pages, print statistics, and write the
    # result to ena_full_data.json.
    from collections import Counter

    print("=" * 60)
    print("Step: ENA Scraper v4.0 -- API + RSS + Sitemap")
    print("=" * 60)

    all_data: list[dict] = []
    seen_urls: set[str] = set()
    seen_texts: set[str] = set()

    def add_unique(items: list[dict]):
        """Append records whose URL and text are both new and long enough."""
        for item in items:
            url = item.get("url", "")
            text = item.get("content", "")
            if url not in seen_urls and text not in seen_texts and len(text) > 50:
                all_data.append(item)
                seen_urls.add(url)
                seen_texts.add(text)

    # -- 1. WordPress API --
    print("\nStep 1: WordPress REST API...")
    for name, endpoint in API_ENDPOINTS.items():
        print(f" Fetching {name}...")
        items = fetch_api(endpoint)
        processed = process_api_items(items, "post" if "posts" in name else "page")
        add_unique(processed)
        print(f" OK: {len(processed)} items from {name}")

    # -- 2. RSS Feed --
    print("\nStep 2: RSS Feeds...")
    for feed_url in RSS_FEEDS:
        print(f" Fetching {feed_url}...")
        items = fetch_rss(feed_url)
        add_unique(items)
        print(f" OK: {len(items)} items from RSS")

    # -- 3. Sitemap URLs --
    print("\nStep 3: Sitemap pages...")
    sitemap_urls = []
    for sm in SITEMAPS:
        urls = fetch_sitemap_urls(sm)
        sitemap_urls.extend(urls)
        print(f" Found {len(urls)} URLs in {sm.split('/')[-1]}")

    # Scrape sitemap pages not already fetched. dict.fromkeys drops URLs
    # listed more than once (keeping order) so each is requested only once.
    new_urls = [u for u in dict.fromkeys(sitemap_urls) if u not in seen_urls]
    print(f" Scraping {len(new_urls)} new pages from sitemap...")
    for i, url in enumerate(new_urls):
        page = scrape_page(url)
        if page:
            add_unique([page])
        if (i + 1) % 20 == 0:
            print(f" {i + 1}/{len(new_urls)} scraped...")
        time.sleep(0.3)

    # -- 4. Priority Pages --
    print("\nStep 4: Priority pages (concours, conditions)...")
    priority_new = [u for u in dict.fromkeys(PRIORITY_PAGES) if u not in seen_urls]
    for url in priority_new:
        page = scrape_page(url)
        if page:
            add_unique([page])
            print(f" OK: {page['page_name']}")
        time.sleep(0.3)

    # -- Stats --
    print("\n" + "=" * 60)
    print(f"OK. Total pages: {len(all_data)}")
    print(f"Total characters: {sum(p['chars'] for p in all_data):,}")

    cats = Counter(p["category"] for p in all_data)
    langs = Counter(p["langue"] for p in all_data)
    srcs = Counter(p["source"] for p in all_data)

    print("\nBy category:")
    for cat, count in cats.most_common():
        print(f" {cat}: {count}")
    print("\nBy language:")
    for lang, count in langs.items():
        print(f" {lang}: {count}")
    print("\nBy source:")
    for src, count in srcs.items():
        print(f" {src}: {count}")

    # -- Save --
    with open("ena_full_data.json", "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print("\nSaved to ena_full_data.json")
    print("=" * 60)
    print("Done! Now run: python build_chroma.py")