Ines1994 committed on
Commit
8dcf3a7
·
verified ·
1 Parent(s): 70a9baf

Upload 3 files

Browse files
Files changed (3) hide show
  1. build_chroma.py +3 -4
  2. ena_full_data.json +0 -0
  3. scraper_api.py +20 -2
build_chroma.py CHANGED
@@ -21,11 +21,10 @@ def build():
21
 
22
  print(f"Loaded {len(pages)} pages.")
23
 
24
- # Intelligent Chunking
25
- # We use specific separators to avoid breaking administrative lists (numbered items)
26
  text_splitter = RecursiveCharacterTextSplitter(
27
- chunk_size=500,
28
- chunk_overlap=100,
29
  separators=["\n\n", "\n", " - ", " * ", ". ", " "]
30
  )
31
 
 
21
 
22
  print(f"Loaded {len(pages)} pages.")
23
 
24
+ # Intelligent Chunking — larger chunks to keep legal lists intact
 
25
  text_splitter = RecursiveCharacterTextSplitter(
26
+ chunk_size=1000, # was 500 — doubled to avoid splitting condition lists
27
+ chunk_overlap=200, # was 100 — increased to preserve context between chunks
28
  separators=["\n\n", "\n", " - ", " * ", ". ", " "]
29
  )
30
 
ena_full_data.json CHANGED
The diff for this file is too large to render. See raw diff
 
scraper_api.py CHANGED
@@ -91,11 +91,29 @@ SKIP_SLUGS = [
91
  # 🛠️ HELPERS
92
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
93
 
94
- def get_category(url: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
95
  ul = url.lower()
 
96
  for p, c in CATS.items():
97
  if p in ul:
98
  return c
 
 
 
 
 
 
99
  return "other"
100
 
101
  def get_lang(url: str) -> str:
@@ -180,7 +198,7 @@ def process_api_items(items: list[dict], content_type: str) -> list[dict]:
180
  "url": link,
181
  "source": "ena.tn-api",
182
  "langue": get_lang(link),
183
- "category": get_category(link),
184
  "content_type": content_type,
185
  "date": date,
186
  "content": full_content,
 
91
  # 🛠️ HELPERS
92
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
93
 
94
+ CONTENT_CATS = {
95
+ "concours_ar": ["مناظرة", "ترشح", "شروط الدخول", "بقاع", "اختبار", "مرحلة عليا", "مرحلة متوسطة", "أعوان"],
96
+ "concours_fr": ["concours", "candidature", "cycle supérieur", "cycle moyen", "épreuve", "places"],
97
+ "formation_continue": ["تكوين مستمر", "formation continue", "fad", "تكوين عن بعد"],
98
+ "formation_ar": ["تكوين", "برنامج", "تأهيل", "cycle de formation"],
99
+ "news_ar": ["إعلان", "بلاغ", "أخبار", "مستجدات", "إعلام"],
100
+ "news_fr": ["actualité", "communiqué", "annonce", "information"],
101
+ "inscription": ["تسجيل", "inscription", "2026"],
102
+ "fad": ["fad", "تعليم عن بعد", "formation à distance"],
103
+ }
104
+
105
+ def get_category(url: str, text: str = "") -> str:
106
  ul = url.lower()
107
+ # First try URL-based matching (most reliable)
108
  for p, c in CATS.items():
109
  if p in ul:
110
  return c
111
+ # Then try content-based matching for API pages
112
+ if text:
113
+ tl = text.lower()
114
+ for cat, keywords in CONTENT_CATS.items():
115
+ if any(kw in tl for kw in keywords):
116
+ return cat
117
  return "other"
118
 
119
  def get_lang(url: str) -> str:
 
198
  "url": link,
199
  "source": "ena.tn-api",
200
  "langue": get_lang(link),
201
+ "category": get_category(link, full_content),
202
  "content_type": content_type,
203
  "date": date,
204
  "content": full_content,