Ines1994 committed on
Commit
08db80a
·
verified ·
1 Parent(s): 15242ce

Upload 16 files

Browse files
Files changed (3) hide show
  1. build_chroma.py +6 -6
  2. ena_full_data.json +0 -0
  3. scraper.py +167 -55
build_chroma.py CHANGED
@@ -13,13 +13,13 @@ load_dotenv()
13
  def build():
14
  # Load Data
15
  if not os.path.exists("ena_full_data.json"):
16
- print("❌ Error: ena_full_data.json not found!")
17
  return
18
 
19
  with open("ena_full_data.json", "r", encoding="utf-8") as f:
20
  pages = json.load(f)
21
 
22
- print(f"📄 Loaded {len(pages)} pages.")
23
 
24
  # Intelligent Chunking
25
  # We use specific separators to avoid breaking administrative lists (numbered items)
@@ -43,7 +43,7 @@ def build():
43
  "content": chunk
44
  })
45
 
46
- print(f"✅ Created {len(all_chunks)} chunks.")
47
 
48
  # Embeddings
49
  embeddings = HuggingFaceEmbeddings(
@@ -56,7 +56,7 @@ def build():
56
  client = chromadb.PersistentClient(path=CHROMA_PATH)
57
  try:
58
  client.delete_collection(COLLECTION_NAME)
59
- print("🗑️ Old collection deleted.")
60
  except:
61
  pass
62
 
@@ -79,9 +79,9 @@ def build():
79
  "category": c["category"]
80
  } for c in batch]
81
  )
82
- print(f"📥 Inserted {min(i+BATCH_SIZE, len(all_chunks))}/{len(all_chunks)}")
83
 
84
- print(f"🚀 Success! Total documents: {vector_store._collection.count()}")
85
 
86
  if __name__ == "__main__":
87
  build()
 
13
  def build():
14
  # Load Data
15
  if not os.path.exists("ena_full_data.json"):
16
+ print("Error: ena_full_data.json not found!")
17
  return
18
 
19
  with open("ena_full_data.json", "r", encoding="utf-8") as f:
20
  pages = json.load(f)
21
 
22
+ print(f"Loaded {len(pages)} pages.")
23
 
24
  # Intelligent Chunking
25
  # We use specific separators to avoid breaking administrative lists (numbered items)
 
43
  "content": chunk
44
  })
45
 
46
+ print(f"Created {len(all_chunks)} chunks.")
47
 
48
  # Embeddings
49
  embeddings = HuggingFaceEmbeddings(
 
56
  client = chromadb.PersistentClient(path=CHROMA_PATH)
57
  try:
58
  client.delete_collection(COLLECTION_NAME)
59
+ print("Old collection deleted.")
60
  except:
61
  pass
62
 
 
79
  "category": c["category"]
80
  } for c in batch]
81
  )
82
+ print(f"Inserted {min(i+BATCH_SIZE, len(all_chunks))}/{len(all_chunks)}")
83
 
84
+ print(f"Success! Total documents: {vector_store._collection.count()}")
85
 
86
  if __name__ == "__main__":
87
  build()
ena_full_data.json CHANGED
The diff for this file is too large to render. See raw diff
 
scraper.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
- Crawl www.ena.tn (ar + fr seeds) and save structured text to ena_full_data.json.
3
- Run: pip install requests beautifulsoup4 && python scraper.py
 
4
  """
5
  from __future__ import annotations
6
 
@@ -8,12 +9,45 @@ import json
8
  import re
9
  from collections import deque
10
  from typing import Optional
11
- from urllib.parse import urljoin, urlparse
12
 
13
  import requests
14
  from bs4 import BeautifulSoup
15
 
16
- BASE_URLS = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  "https://www.ena.tn/ar/",
18
  "https://www.ena.tn/fr/",
19
  ]
@@ -24,15 +58,36 @@ HEADERS = {
24
  }
25
 
26
  CATS = {
27
- "/concours/": "concours_fr",
28
- "/formation/": "formation_fr",
29
- "/gouvernance/": "gouvernance",
30
- "/ar/concours": "concours_ar",
31
- "/ar/formation": "formation_ar",
32
- "/ar/service": "service_ar",
33
- "/actualites/": "news",
 
 
 
 
 
 
 
 
 
34
  }
35
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  def _get_cat(u: str) -> str:
38
  ul = u.lower()
@@ -54,32 +109,60 @@ def normalize_url(url: str) -> Optional[str]:
54
  host = p.netloc.lower()
55
  if "ena.tn" not in host:
56
  return None
 
 
 
57
  path = p.path or "/"
58
  query = f"?{p.query}" if p.query else ""
59
  return f"https://{host}{path}{query}"
60
 
61
 
62
- def _should_skip_href(url: str) -> bool:
63
- low = url.lower().split("?")[0]
64
- return bool(
65
- re.search(r"\.(pdf|jpg|jpeg|png|gif|zip|css|js|ico|svg|woff2?)(\?|$)", low)
66
- )
 
 
 
 
67
 
68
 
69
  def page_lang(url: str) -> str:
70
- u = url.lower()
71
- if "/ar/" in u or "/ar" in urlparse(u).path[:5]:
72
  return "ar"
73
- if "/fr/" in u or "/fr" in urlparse(u).path[:5]:
74
  return "fr"
75
  return "fr"
76
 
77
 
78
- def extract_text(soup: BeautifulSoup) -> str:
79
- for t in soup(["script", "style", "nav", "footer", "header"]):
80
- t.decompose()
81
- return soup.get_text(" ", strip=True)
 
 
 
 
 
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
85
  all_data: list[dict] = []
@@ -88,48 +171,52 @@ def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
88
 
89
  for base in base_list:
90
  nu = normalize_url(base)
91
- if nu:
92
  queue.append((nu, 0))
93
 
 
 
94
  while queue:
95
  url, depth = queue.popleft()
96
- if url in visited:
97
- continue
98
- if depth > max_depth:
99
  continue
100
  visited.add(url)
101
 
102
  try:
103
- r = requests.get(url, headers=HEADERS, timeout=25)
104
  r.raise_for_status()
105
  except (requests.RequestException, OSError) as e:
106
- print(f"skip {url}: {e}")
107
  continue
108
 
109
  ctype = (r.headers.get("Content-Type") or "").lower()
110
- if "html" not in ctype and "xml" not in ctype:
111
  continue
112
 
113
  soup = BeautifulSoup(r.text, "html.parser")
114
  text = extract_text(soup)
115
- if len(text) < 50:
116
  continue
117
 
118
- path = urlparse(url).path.strip("/")
119
- page_name = path.split("/")[-1] if path else "home"
120
-
121
- all_data.append(
122
- {
123
- "page_name": page_name,
124
- "url": url,
125
- "source": "ena.tn",
126
- "langue": page_lang(url),
127
- "category": _get_cat(url),
128
- "content": text,
129
- "chars": len(text),
130
- }
131
- )
132
-
 
 
 
 
133
  if depth < max_depth:
134
  for a in soup.find_all("a", href=True):
135
  href = (a.get("href") or "").strip()
@@ -137,29 +224,54 @@ def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
137
  continue
138
  next_u = urljoin(url, href)
139
  nu = normalize_url(next_u)
140
- if not nu or _should_skip_href(nu):
141
- continue
142
- if nu not in visited:
143
  queue.append((nu, depth + 1))
144
 
145
  return all_data
146
 
147
 
 
 
 
 
148
  if __name__ == "__main__":
149
- print("Crawling ENA website (ar + fr seeds)...\n")
 
 
 
150
 
151
  all_data = crawl(BASE_URLS, max_depth=3)
152
 
 
153
  unique: list[dict] = []
154
  seen_texts: set[str] = set()
 
155
  for page in all_data:
156
- if page["content"] not in seen_texts:
 
157
  unique.append(page)
158
  seen_texts.add(page["content"])
159
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  with open("ena_full_data.json", "w", encoding="utf-8") as f:
161
  json.dump(unique, f, ensure_ascii=False, indent=2)
162
 
163
- print(f"\nPages: {len(unique)}")
164
- print(f"Characters: {sum(p['chars'] for p in unique):,}")
165
- print("Saved to ena_full_data.json")
 
1
  """
2
+ ENA Chatbot — Scraper v3.0 Final
3
+ Crawl www.ena.tn (ar + fr) and save structured text to ena_full_data.json.
4
+ Run: python scraper.py
5
  """
6
  from __future__ import annotations
7
 
 
9
  import re
10
  from collections import deque
11
  from typing import Optional
12
+ from urllib.parse import urljoin, urlparse, unquote
13
 
14
  import requests
15
  from bs4 import BeautifulSoup
16
 
17
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
18
+ # โš™๏ธ CONFIG
19
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
20
+
21
+ # ุตูุญุงุช ู…ุถู…ูˆู†ุฉ ุงู„ุชุญู…ูŠู„ โ€” ุชุจุฏุฃ ู…ู†ู‡ุง ู‚ุจู„ ุงู„ู€ crawl ุงู„ุนุงู…
22
+ PRIORITY_URLS = [
23
+ # Concours (competitive exams) — Arabic
24
+ "https://www.ena.tn/ar/concours-ar/cycle-superieur-arr/concours-entree-cycle-superieur-ar/",
25
+ "https://www.ena.tn/ar/concours-ar/informations-generales-ar/",
26
+ "https://www.ena.tn/ar/concours-ar/cycle-moyen-ar/entree-au-cycle-de-formation-des-cadres-moyens-ar/",
27
+ "https://www.ena.tn/ar/concours-ar/agents-categorie-a3-ar/",
28
+ "https://www.ena.tn/ar/preparation-au-concours-ar/",
29
+ # Concours (competitive exams) — French
30
+ "https://www.ena.tn/fr/concours/cycle-superieur/le-concours-dentree-au-cycle-superieur/",
31
+ "https://www.ena.tn/fr/concours/informations-generales/",
32
+ "https://www.ena.tn/fr/concours/cycle-moyen/concours-dentree-au-cycle-de-formation-des-cadres-moyens-de-la-sous-categorie-a2-2/",
33
+ "https://www.ena.tn/fr/concours/agents-de-la-sous-categorie-a3/",
34
+ "https://www.ena.tn/fr/concours/cycle-superieur/preparation-au-concours/",
35
+ # Continuing education (formation continue) — Arabic
36
+ "https://www.ena.tn/ar/formation-continue-ar/formation-continue-a-distance-et-presentielle-ar/",
37
+ "https://www.ena.tn/ar/formation-continue-ar/developpement-de-competences-ar/",
38
+ # Continuing education (formation continue) — French
39
+ "https://www.ena.tn/fr/formation-continue/formation-continue-a-distance-et-presentielle/",
40
+ # Updates and news
41
+ "https://www.ena.tn/ar/actualites-ar/",
42
+ "https://www.ena.tn/fr/actualites-fr/",
43
+ "https://www.ena.tn/ar/%d9%85%d8%b3%d8%aa%d8%ac%d8%af%d8%a7%d8%aa/",
44
+ # ุตูุญุงุช ู…ู‡ู…ุฉ 2026
45
+ "https://www.ena.tn/ar/inscription2026/",
46
+ "https://www.ena.tn/ar/ouverturefad2026/",
47
+ "https://www.ena.tn/ar/fad2026/",
48
+ ]
49
+
50
+ BASE_URLS = PRIORITY_URLS + [
51
  "https://www.ena.tn/ar/",
52
  "https://www.ena.tn/fr/",
53
  ]
 
58
  }
59
 
60
  CATS = {
61
+ "/concours/": "concours_fr",
62
+ "/concours-ar": "concours_ar",
63
+ "/ar/concours": "concours_ar",
64
+ "/formation/": "formation_fr",
65
+ "/ar/formation": "formation_ar",
66
+ "/formation-continue":"formation_continue",
67
+ "/gouvernance/": "gouvernance",
68
+ "/ar/service": "service_ar",
69
+ "/actualites/": "news_fr",
70
+ "/actualites-fr/": "news_fr",
71
+ "/actualites-ar/": "news_ar",
72
+ "/evenement": "news_fr",
73
+ "/evenement-ar": "news_ar",
74
+ "/leadership": "leadership",
75
+ "/inscription": "inscription",
76
+ "/fad": "fad",
77
  }
78
 
79
+ # ุตูุญุงุช ู†ุชุฌุงู‡ู„ู‡ุง โ€” ู…ุง ุนู†ุฏู‡ุงุด ู…ุญุชูˆู‰ ู…ููŠุฏ
80
+ SKIP_PATTERNS = [
81
+ "wp-admin", "wp-login", "wp-json", "xmlrpc",
82
+ "woocommerce", "cart", "checkout", "my-account",
83
+ "politique-de-confidentialite", "page-d-exemple",
84
+ "elementor", "gravatar", "automattic",
85
+ "log_file", "attachment",
86
+ ]
87
+
88
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
89
+ # ๐Ÿ› ๏ธ HELPERS
90
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
91
 
92
  def _get_cat(u: str) -> str:
93
  ul = u.lower()
 
109
  host = p.netloc.lower()
110
  if "ena.tn" not in host:
111
  return None
112
+ # Ignore internal IP links
113
+ if host.startswith("193."):
114
+ return None
115
  path = p.path or "/"
116
  query = f"?{p.query}" if p.query else ""
117
  return f"https://{host}{path}{query}"
118
 
119
 
120
+ def _should_skip(url: str) -> bool:
121
+ low = url.lower()
122
+ # Skip files (static assets and media)
123
+ if re.search(r"\.(pdf|jpg|jpeg|png|gif|zip|css|js|ico|svg|woff2?|txt|mp4|mp3)(\?|$)", low.split("?")[0]):
124
+ return True
125
+ # Skip pages with no useful content
126
+ if any(p in low for p in SKIP_PATTERNS):
127
+ return True
128
+ return False
129
 
130
 
131
  def page_lang(url: str) -> str:
132
+ path = urlparse(url.lower()).path
133
+ if "/ar/" in path or path.startswith("/ar"):
134
  return "ar"
135
+ if "/fr/" in path or path.startswith("/fr"):
136
  return "fr"
137
  return "fr"
138
 
139
 
140
+ def get_page_name(url: str) -> str:
141
+ """Extract the page name, URL-decoding Arabic characters."""
142
+ path = urlparse(url).path.strip("/")
143
+ raw_name = path.split("/")[-1] if path else "home"
144
+ try:
145
+ return unquote(raw_name)
146
+ except Exception:
147
+ return raw_name
148
+
149
 
150
+ def extract_text(soup: BeautifulSoup) -> str:
151
+ """Extract clean text without navigation and scripts."""
152
+ for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
153
+ tag.decompose()
154
+ # Remove breadcrumbs, menus and sidebars
155
+ for tag in soup.find_all(class_=re.compile(r"breadcrumb|menu|sidebar", re.I)):
156
+ tag.decompose()
157
+ text = soup.get_text(" ", strip=True)
158
+ # Collapse runs of extra whitespace
159
+ text = re.sub(r"\s{3,}", " ", text)
160
+ return text
161
+
162
+
163
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
164
+ # ๐Ÿ•ท๏ธ CRAWLER
165
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
166
 
167
  def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
168
  all_data: list[dict] = []
 
171
 
172
  for base in base_list:
173
  nu = normalize_url(base)
174
+ if nu and not _should_skip(nu):
175
  queue.append((nu, 0))
176
 
177
+ total_fetched = 0
178
+
179
  while queue:
180
  url, depth = queue.popleft()
181
+ if url in visited or depth > max_depth:
 
 
182
  continue
183
  visited.add(url)
184
 
185
  try:
186
+ r = requests.get(url, headers=HEADERS, timeout=25, allow_redirects=True)
187
  r.raise_for_status()
188
  except (requests.RequestException, OSError) as e:
189
+ print(f" skip {url[:60]}: {e}")
190
  continue
191
 
192
  ctype = (r.headers.get("Content-Type") or "").lower()
193
+ if "html" not in ctype:
194
  continue
195
 
196
  soup = BeautifulSoup(r.text, "html.parser")
197
  text = extract_text(soup)
198
+ if len(text) < 100:
199
  continue
200
 
201
+ page_name = get_page_name(url)
202
+ category = _get_cat(url)
203
+ lang = page_lang(url)
204
+
205
+ all_data.append({
206
+ "page_name": page_name,
207
+ "url": url,
208
+ "source": "ena.tn",
209
+ "langue": lang,
210
+ "category": category,
211
+ "content": text,
212
+ "chars": len(text),
213
+ })
214
+
215
+ total_fetched += 1
216
+ if total_fetched % 20 == 0:
217
+ print(f" {total_fetched} pages fetched...")
218
+
219
+ # Follow links if the maximum depth has not been reached yet
220
  if depth < max_depth:
221
  for a in soup.find_all("a", href=True):
222
  href = (a.get("href") or "").strip()
 
224
  continue
225
  next_u = urljoin(url, href)
226
  nu = normalize_url(next_u)
227
+ if nu and not _should_skip(nu) and nu not in visited:
 
 
228
  queue.append((nu, depth + 1))
229
 
230
  return all_data
231
 
232
 
233
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
234
+ # ๐Ÿš€ MAIN
235
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
236
+
237
  if __name__ == "__main__":
238
+ print("=" * 60)
239
+ print("ENA Scraper v3.0 -- Starting crawl...")
240
+ print(f" Priority URLs: {len(PRIORITY_URLS)}")
241
+ print("=" * 60)
242
 
243
  all_data = crawl(BASE_URLS, max_depth=3)
244
 
245
+ # Remove duplicates
246
  unique: list[dict] = []
247
  seen_texts: set[str] = set()
248
+ seen_urls: set[str] = set()
249
  for page in all_data:
250
+ # Skip pages that repeat the same text or the same URL
251
+ if page["content"] not in seen_texts and page["url"] not in seen_urls:
252
  unique.append(page)
253
  seen_texts.add(page["content"])
254
+ seen_urls.add(page["url"])
255
+
256
+ # Statistics
257
+ print("\n" + "=" * 60)
258
+ print(f"OK. Pages collected: {len(unique)}")
259
+ print(f"Total characters: {sum(p['chars'] for p in unique):,}")
260
+
261
+ from collections import Counter
262
+ cats = Counter(p["category"] for p in unique)
263
+ langs = Counter(p["langue"] for p in unique)
264
+ print("\nBy category:")
265
+ for cat, count in cats.most_common():
266
+ print(f" {cat}: {count}")
267
+ print("\nBy language:")
268
+ for lang, count in langs.items():
269
+ print(f" {lang}: {count}")
270
+
271
+ # ุญูุธ
272
  with open("ena_full_data.json", "w", encoding="utf-8") as f:
273
  json.dump(unique, f, ensure_ascii=False, indent=2)
274
 
275
+ print("\nSaved to ena_full_data.json")
276
+ print("=" * 60)
277
+ print("Done! Now run: python build_chroma.py")