Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files
- build_chroma.py +3 -4
- ena_full_data.json +0 -0
- scraper_api.py +20 -2
build_chroma.py
CHANGED
|
@@ -21,11 +21,10 @@ def build():
|
|
| 21 |
|
| 22 |
print(f"Loaded {len(pages)} pages.")
|
| 23 |
|
| 24 |
-
# Intelligent Chunking
|
| 25 |
-
# We use specific separators to avoid breaking administrative lists (numbered items)
|
| 26 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 27 |
-
chunk_size=500,
|
| 28 |
-
chunk_overlap=100,
|
| 29 |
separators=["\n\n", "\n", " - ", " * ", ". ", " "]
|
| 30 |
)
|
| 31 |
|
|
|
|
| 21 |
|
| 22 |
print(f"Loaded {len(pages)} pages.")
|
| 23 |
|
| 24 |
+
# Intelligent Chunking — larger chunks to keep legal lists intact
|
|
|
|
| 25 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 26 |
+
chunk_size=1000, # was 500 — doubled to avoid splitting condition lists
|
| 27 |
+
chunk_overlap=200, # was 100 — increased to preserve context between chunks
|
| 28 |
separators=["\n\n", "\n", " - ", " * ", ". ", " "]
|
| 29 |
)
|
| 30 |
|
ena_full_data.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scraper_api.py
CHANGED
|
@@ -91,11 +91,29 @@ SKIP_SLUGS = [
|
|
| 91 |
# 🛠️ HELPERS
|
| 92 |
# ──────────────────────────────────────────────────────────
|
| 93 |
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
ul = url.lower()
|
|
|
|
| 96 |
for p, c in CATS.items():
|
| 97 |
if p in ul:
|
| 98 |
return c
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
return "other"
|
| 100 |
|
| 101 |
def get_lang(url: str) -> str:
|
|
@@ -180,7 +198,7 @@ def process_api_items(items: list[dict], content_type: str) -> list[dict]:
|
|
| 180 |
"url": link,
|
| 181 |
"source": "ena.tn-api",
|
| 182 |
"langue": get_lang(link),
|
| 183 |
-
"category": get_category(link),
|
| 184 |
"content_type": content_type,
|
| 185 |
"date": date,
|
| 186 |
"content": full_content,
|
|
|
|
| 91 |
# 🛠️ HELPERS
|
| 92 |
# ──────────────────────────────────────────────────────────
|
| 93 |
|
| 94 |
+
CONTENT_CATS = {
|
| 95 |
+
"concours_ar": ["مناظرة", "ترشح", "شروط الدخول", "بقاع", "اختبار", "مرحلة عليا", "مرحلة متوسطة", "أعوان"],
|
| 96 |
+
"concours_fr": ["concours", "candidature", "cycle supérieur", "cycle moyen", "épreuve", "places"],
|
| 97 |
+
"formation_continue": ["تكوين مستمر", "formation continue", "fad", "تكوين عن بعد"],
|
| 98 |
+
"formation_ar": ["تكوين", "برنامج", "تأهيل", "cycle de formation"],
|
| 99 |
+
"news_ar": ["إعلان", "بلاغ", "أخبار", "مستجدات", "إعلام"],
|
| 100 |
+
"news_fr": ["actualité", "communiqué", "annonce", "information"],
|
| 101 |
+
"inscription": ["تسجيل", "inscription", "2026"],
|
| 102 |
+
"fad": ["fad", "تعليم عن بعد", "formation à distance"],
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
def get_category(url: str, text: str = "") -> str:
|
| 106 |
ul = url.lower()
|
| 107 |
+
# First try URL-based matching (most reliable)
|
| 108 |
for p, c in CATS.items():
|
| 109 |
if p in ul:
|
| 110 |
return c
|
| 111 |
+
# Then try content-based matching for API pages
|
| 112 |
+
if text:
|
| 113 |
+
tl = text.lower()
|
| 114 |
+
for cat, keywords in CONTENT_CATS.items():
|
| 115 |
+
if any(kw in tl for kw in keywords):
|
| 116 |
+
return cat
|
| 117 |
return "other"
|
| 118 |
|
| 119 |
def get_lang(url: str) -> str:
|
|
|
|
| 198 |
"url": link,
|
| 199 |
"source": "ena.tn-api",
|
| 200 |
"langue": get_lang(link),
|
| 201 |
+
"category": get_category(link, full_content),
|
| 202 |
"content_type": content_type,
|
| 203 |
"date": date,
|
| 204 |
"content": full_content,
|