Spaces:
Sleeping
Sleeping
Upload 16 files
Browse files- build_chroma.py +6 -6
- ena_full_data.json +0 -0
- scraper.py +167 -55
build_chroma.py
CHANGED
|
@@ -13,13 +13,13 @@ load_dotenv()
|
|
| 13 |
def build():
|
| 14 |
# Load Data
|
| 15 |
if not os.path.exists("ena_full_data.json"):
|
| 16 |
-
print("
|
| 17 |
return
|
| 18 |
|
| 19 |
with open("ena_full_data.json", "r", encoding="utf-8") as f:
|
| 20 |
pages = json.load(f)
|
| 21 |
|
| 22 |
-
print(f"
|
| 23 |
|
| 24 |
# Intelligent Chunking
|
| 25 |
# We use specific separators to avoid breaking administrative lists (numbered items)
|
|
@@ -43,7 +43,7 @@ def build():
|
|
| 43 |
"content": chunk
|
| 44 |
})
|
| 45 |
|
| 46 |
-
print(f"
|
| 47 |
|
| 48 |
# Embeddings
|
| 49 |
embeddings = HuggingFaceEmbeddings(
|
|
@@ -56,7 +56,7 @@ def build():
|
|
| 56 |
client = chromadb.PersistentClient(path=CHROMA_PATH)
|
| 57 |
try:
|
| 58 |
client.delete_collection(COLLECTION_NAME)
|
| 59 |
-
print("
|
| 60 |
except:
|
| 61 |
pass
|
| 62 |
|
|
@@ -79,9 +79,9 @@ def build():
|
|
| 79 |
"category": c["category"]
|
| 80 |
} for c in batch]
|
| 81 |
)
|
| 82 |
-
print(f"
|
| 83 |
|
| 84 |
-
print(f"
|
| 85 |
|
| 86 |
if __name__ == "__main__":
|
| 87 |
build()
|
|
|
|
| 13 |
def build():
|
| 14 |
# Load Data
|
| 15 |
if not os.path.exists("ena_full_data.json"):
|
| 16 |
+
print("Error: ena_full_data.json not found!")
|
| 17 |
return
|
| 18 |
|
| 19 |
with open("ena_full_data.json", "r", encoding="utf-8") as f:
|
| 20 |
pages = json.load(f)
|
| 21 |
|
| 22 |
+
print(f"Loaded {len(pages)} pages.")
|
| 23 |
|
| 24 |
# Intelligent Chunking
|
| 25 |
# We use specific separators to avoid breaking administrative lists (numbered items)
|
|
|
|
| 43 |
"content": chunk
|
| 44 |
})
|
| 45 |
|
| 46 |
+
print(f"Created {len(all_chunks)} chunks.")
|
| 47 |
|
| 48 |
# Embeddings
|
| 49 |
embeddings = HuggingFaceEmbeddings(
|
|
|
|
| 56 |
client = chromadb.PersistentClient(path=CHROMA_PATH)
|
| 57 |
try:
|
| 58 |
client.delete_collection(COLLECTION_NAME)
|
| 59 |
+
print("Old collection deleted.")
|
| 60 |
except:
|
| 61 |
pass
|
| 62 |
|
|
|
|
| 79 |
"category": c["category"]
|
| 80 |
} for c in batch]
|
| 81 |
)
|
| 82 |
+
print(f"Inserted {min(i+BATCH_SIZE, len(all_chunks))}/{len(all_chunks)}")
|
| 83 |
|
| 84 |
+
print(f"Success! Total documents: {vector_store._collection.count()}")
|
| 85 |
|
| 86 |
if __name__ == "__main__":
|
| 87 |
build()
|
ena_full_data.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scraper.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
|
|
|
| 4 |
"""
|
| 5 |
from __future__ import annotations
|
| 6 |
|
|
@@ -8,12 +9,45 @@ import json
|
|
| 8 |
import re
|
| 9 |
from collections import deque
|
| 10 |
from typing import Optional
|
| 11 |
-
from urllib.parse import urljoin, urlparse
|
| 12 |
|
| 13 |
import requests
|
| 14 |
from bs4 import BeautifulSoup
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"https://www.ena.tn/ar/",
|
| 18 |
"https://www.ena.tn/fr/",
|
| 19 |
]
|
|
@@ -24,15 +58,36 @@ HEADERS = {
|
|
| 24 |
}
|
| 25 |
|
| 26 |
CATS = {
|
| 27 |
-
"/concours/":
|
| 28 |
-
"/
|
| 29 |
-
"/
|
| 30 |
-
"/
|
| 31 |
-
"/ar/formation":
|
| 32 |
-
"/
|
| 33 |
-
"/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
}
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
def _get_cat(u: str) -> str:
|
| 38 |
ul = u.lower()
|
|
@@ -54,32 +109,60 @@ def normalize_url(url: str) -> Optional[str]:
|
|
| 54 |
host = p.netloc.lower()
|
| 55 |
if "ena.tn" not in host:
|
| 56 |
return None
|
|
|
|
|
|
|
|
|
|
| 57 |
path = p.path or "/"
|
| 58 |
query = f"?{p.query}" if p.query else ""
|
| 59 |
return f"https://{host}{path}{query}"
|
| 60 |
|
| 61 |
|
| 62 |
-
def
|
| 63 |
-
low = url.lower()
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
|
| 69 |
def page_lang(url: str) -> str:
|
| 70 |
-
|
| 71 |
-
if "/ar/" in
|
| 72 |
return "ar"
|
| 73 |
-
if "/fr/" in
|
| 74 |
return "fr"
|
| 75 |
return "fr"
|
| 76 |
|
| 77 |
|
| 78 |
-
def
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
|
| 85 |
all_data: list[dict] = []
|
|
@@ -88,48 +171,52 @@ def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
|
|
| 88 |
|
| 89 |
for base in base_list:
|
| 90 |
nu = normalize_url(base)
|
| 91 |
-
if nu:
|
| 92 |
queue.append((nu, 0))
|
| 93 |
|
|
|
|
|
|
|
| 94 |
while queue:
|
| 95 |
url, depth = queue.popleft()
|
| 96 |
-
if url in visited:
|
| 97 |
-
continue
|
| 98 |
-
if depth > max_depth:
|
| 99 |
continue
|
| 100 |
visited.add(url)
|
| 101 |
|
| 102 |
try:
|
| 103 |
-
r = requests.get(url, headers=HEADERS, timeout=25)
|
| 104 |
r.raise_for_status()
|
| 105 |
except (requests.RequestException, OSError) as e:
|
| 106 |
-
print(f"skip {url}: {e}")
|
| 107 |
continue
|
| 108 |
|
| 109 |
ctype = (r.headers.get("Content-Type") or "").lower()
|
| 110 |
-
if "html" not in ctype
|
| 111 |
continue
|
| 112 |
|
| 113 |
soup = BeautifulSoup(r.text, "html.parser")
|
| 114 |
text = extract_text(soup)
|
| 115 |
-
if len(text) <
|
| 116 |
continue
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
if depth < max_depth:
|
| 134 |
for a in soup.find_all("a", href=True):
|
| 135 |
href = (a.get("href") or "").strip()
|
|
@@ -137,29 +224,54 @@ def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
|
|
| 137 |
continue
|
| 138 |
next_u = urljoin(url, href)
|
| 139 |
nu = normalize_url(next_u)
|
| 140 |
-
if
|
| 141 |
-
continue
|
| 142 |
-
if nu not in visited:
|
| 143 |
queue.append((nu, depth + 1))
|
| 144 |
|
| 145 |
return all_data
|
| 146 |
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
if __name__ == "__main__":
|
| 149 |
-
print("
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
all_data = crawl(BASE_URLS, max_depth=3)
|
| 152 |
|
|
|
|
| 153 |
unique: list[dict] = []
|
| 154 |
seen_texts: set[str] = set()
|
|
|
|
| 155 |
for page in all_data:
|
| 156 |
-
|
|
|
|
| 157 |
unique.append(page)
|
| 158 |
seen_texts.add(page["content"])
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
with open("ena_full_data.json", "w", encoding="utf-8") as f:
|
| 161 |
json.dump(unique, f, ensure_ascii=False, indent=2)
|
| 162 |
|
| 163 |
-
print(
|
| 164 |
-
print(
|
| 165 |
-
print("
|
|
|
|
| 1 |
"""
|
| 2 |
+
ENA Chatbot — Scraper v3.0 Final
|
| 3 |
+
Crawl www.ena.tn (ar + fr) and save structured text to ena_full_data.json.
|
| 4 |
+
Run: python scraper.py
|
| 5 |
"""
|
| 6 |
from __future__ import annotations
|
| 7 |
|
|
|
|
| 9 |
import re
|
| 10 |
from collections import deque
|
| 11 |
from typing import Optional
|
| 12 |
+
from urllib.parse import urljoin, urlparse, unquote
|
| 13 |
|
| 14 |
import requests
|
| 15 |
from bs4 import BeautifulSoup
|
| 16 |
|
| 17 |
+
# ══════════════════════════════════════════════════════════
|
| 18 |
+
# ⚙️ CONFIG
|
| 19 |
+
# ══════════════════════════════════════════════════════════
|
| 20 |
+
|
| 21 |
+
# صفحات مضمونة التحميل — تبدأ منها قبل الـ crawl العام
|
| 22 |
+
PRIORITY_URLS = [
|
| 23 |
+
# المناظرات — عربي
|
| 24 |
+
"https://www.ena.tn/ar/concours-ar/cycle-superieur-arr/concours-entree-cycle-superieur-ar/",
|
| 25 |
+
"https://www.ena.tn/ar/concours-ar/informations-generales-ar/",
|
| 26 |
+
"https://www.ena.tn/ar/concours-ar/cycle-moyen-ar/entree-au-cycle-de-formation-des-cadres-moyens-ar/",
|
| 27 |
+
"https://www.ena.tn/ar/concours-ar/agents-categorie-a3-ar/",
|
| 28 |
+
"https://www.ena.tn/ar/preparation-au-concours-ar/",
|
| 29 |
+
# المناظرات — فرنسي
|
| 30 |
+
"https://www.ena.tn/fr/concours/cycle-superieur/le-concours-dentree-au-cycle-superieur/",
|
| 31 |
+
"https://www.ena.tn/fr/concours/informations-generales/",
|
| 32 |
+
"https://www.ena.tn/fr/concours/cycle-moyen/concours-dentree-au-cycle-de-formation-des-cadres-moyens-de-la-sous-categorie-a2-2/",
|
| 33 |
+
"https://www.ena.tn/fr/concours/agents-de-la-sous-categorie-a3/",
|
| 34 |
+
"https://www.ena.tn/fr/concours/cycle-superieur/preparation-au-concours/",
|
| 35 |
+
# التكوين المستمر — عربي
|
| 36 |
+
"https://www.ena.tn/ar/formation-continue-ar/formation-continue-a-distance-et-presentielle-ar/",
|
| 37 |
+
"https://www.ena.tn/ar/formation-continue-ar/developpement-de-competences-ar/",
|
| 38 |
+
# التكوين المستمر — فرنسي
|
| 39 |
+
"https://www.ena.tn/fr/formation-continue/formation-continue-a-distance-et-presentielle/",
|
| 40 |
+
# المستجدات والأخبار
|
| 41 |
+
"https://www.ena.tn/ar/actualites-ar/",
|
| 42 |
+
"https://www.ena.tn/fr/actualites-fr/",
|
| 43 |
+
"https://www.ena.tn/ar/%d9%85%d8%b3%d8%aa%d8%ac%d8%af%d8%a7%d8%aa/",
|
| 44 |
+
# صفحات مهمة 2026
|
| 45 |
+
"https://www.ena.tn/ar/inscription2026/",
|
| 46 |
+
"https://www.ena.tn/ar/ouverturefad2026/",
|
| 47 |
+
"https://www.ena.tn/ar/fad2026/",
|
| 48 |
+
]
|
| 49 |
+
|
| 50 |
+
BASE_URLS = PRIORITY_URLS + [
|
| 51 |
"https://www.ena.tn/ar/",
|
| 52 |
"https://www.ena.tn/fr/",
|
| 53 |
]
|
|
|
|
| 58 |
}
|
| 59 |
|
| 60 |
CATS = {
|
| 61 |
+
"/concours/": "concours_fr",
|
| 62 |
+
"/concours-ar": "concours_ar",
|
| 63 |
+
"/ar/concours": "concours_ar",
|
| 64 |
+
"/formation/": "formation_fr",
|
| 65 |
+
"/ar/formation": "formation_ar",
|
| 66 |
+
"/formation-continue":"formation_continue",
|
| 67 |
+
"/gouvernance/": "gouvernance",
|
| 68 |
+
"/ar/service": "service_ar",
|
| 69 |
+
"/actualites/": "news_fr",
|
| 70 |
+
"/actualites-fr/": "news_fr",
|
| 71 |
+
"/actualites-ar/": "news_ar",
|
| 72 |
+
"/evenement": "news_fr",
|
| 73 |
+
"/evenement-ar": "news_ar",
|
| 74 |
+
"/leadership": "leadership",
|
| 75 |
+
"/inscription": "inscription",
|
| 76 |
+
"/fad": "fad",
|
| 77 |
}
|
| 78 |
|
| 79 |
+
# صفحات نتجاهلها — ما عندهاش محتوى مفيد
|
| 80 |
+
SKIP_PATTERNS = [
|
| 81 |
+
"wp-admin", "wp-login", "wp-json", "xmlrpc",
|
| 82 |
+
"woocommerce", "cart", "checkout", "my-account",
|
| 83 |
+
"politique-de-confidentialite", "page-d-exemple",
|
| 84 |
+
"elementor", "gravatar", "automattic",
|
| 85 |
+
"log_file", "attachment",
|
| 86 |
+
]
|
| 87 |
+
|
| 88 |
+
# ══════════════════════════════════════════════════════════
|
| 89 |
+
# 🛠️ HELPERS
|
| 90 |
+
# ══════════════════════════════════════════════════════════
|
| 91 |
|
| 92 |
def _get_cat(u: str) -> str:
|
| 93 |
ul = u.lower()
|
|
|
|
| 109 |
host = p.netloc.lower()
|
| 110 |
if "ena.tn" not in host:
|
| 111 |
return None
|
| 112 |
+
# تجاهل روابط IP الداخلية
|
| 113 |
+
if host.startswith("193."):
|
| 114 |
+
return None
|
| 115 |
path = p.path or "/"
|
| 116 |
query = f"?{p.query}" if p.query else ""
|
| 117 |
return f"https://{host}{path}{query}"
|
| 118 |
|
| 119 |
|
| 120 |
+
def _should_skip(url: str) -> bool:
|
| 121 |
+
low = url.lower()
|
| 122 |
+
# ุชุฌุงูู ุงูู
ููุงุช
|
| 123 |
+
if re.search(r"\.(pdf|jpg|jpeg|png|gif|zip|css|js|ico|svg|woff2?|txt|mp4|mp3)(\?|$)", low.split("?")[0]):
|
| 124 |
+
return True
|
| 125 |
+
# ุชุฌุงูู ุงูุตูุญุงุช ุบูุฑ ุงูู
ููุฏุฉ
|
| 126 |
+
if any(p in low for p in SKIP_PATTERNS):
|
| 127 |
+
return True
|
| 128 |
+
return False
|
| 129 |
|
| 130 |
|
| 131 |
def page_lang(url: str) -> str:
    """Return the language code (``"ar"`` or ``"fr"``) implied by *url*.

    The first path segment decides when it is exactly ``ar`` or ``fr``
    (the crawl start URLs are ``/ar/`` and ``/fr/``). A prefix test such as
    ``path.startswith("/ar")`` is deliberately avoided because it would
    label ``/archives/...`` or ``/article...`` as Arabic.

    Args:
        url: Absolute or relative page URL; matching is case-insensitive.

    Returns:
        ``"ar"`` for Arabic pages, ``"fr"`` otherwise (French is the default).
    """
    path = urlparse(url.lower()).path
    segments = [segment for segment in path.split("/") if segment]

    if segments and segments[0] in ("ar", "fr"):
        return segments[0]

    # No language prefix: still honour an "/ar/" directory deeper in the path.
    if "/ar/" in path:
        return "ar"
    return "fr"
|
| 138 |
|
| 139 |
|
| 140 |
+
def get_page_name(url: str) -> str:
    """Return the last path segment of *url*, percent-decoded.

    Percent-encoded (e.g. Arabic) slugs are decoded to readable text.
    A URL whose path is empty or just ``/`` yields ``"home"``.

    Args:
        url: Page URL.

    Returns:
        The decoded final path segment, or ``"home"`` for the site root.
    """
    last_segment = urlparse(url).path.strip("/").rpartition("/")[2]
    if not last_segment:
        return "home"
    # unquote() decodes with errors="replace", so malformed escapes do not
    # raise and no try/except is needed.
    return unquote(last_segment)
|
| 148 |
+
|
| 149 |
|
| 150 |
+
def extract_text(soup: BeautifulSoup) -> str:
    """Return the readable text of a page without navigation or scripts.

    Removes ``script``, ``style``, ``nav``, ``footer``, ``header`` and
    ``aside`` elements, plus any element whose class matches
    ``breadcrumb``, ``menu`` or ``sidebar`` (case-insensitive), then joins
    the remaining text nodes with single spaces.

    The stripping is done on a copy, so the caller's ``soup`` is left
    untouched. This matters for the crawler, which looks for ``<a href>``
    links on the same soup after calling this function; stripping in place
    would hide every link inside the removed navigation elements.

    Args:
        soup: Parsed HTML document.

    Returns:
        The cleaned page text, with runs of three or more whitespace
        characters collapsed to one space.
    """
    import copy  # local import: the module's top-level import list is not changed

    working = copy.copy(soup)

    for tag in working(["script", "style", "nav", "footer", "header", "aside"]):
        tag.decompose()

    # Remove breadcrumbs, menus and sidebars identified by CSS class.
    for tag in working.find_all(class_=re.compile(r"breadcrumb|menu|sidebar", re.I)):
        tag.decompose()

    text = working.get_text(" ", strip=True)

    # Collapse leftover whitespace runs.
    return re.sub(r"\s{3,}", " ", text)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# ══════════════════════════════════════════════════════════
|
| 164 |
+
# 🕷️ CRAWLER
|
| 165 |
+
# ══════════════════════════════════════════════════════════
|
| 166 |
|
| 167 |
def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
|
| 168 |
all_data: list[dict] = []
|
|
|
|
| 171 |
|
| 172 |
for base in base_list:
|
| 173 |
nu = normalize_url(base)
|
| 174 |
+
if nu and not _should_skip(nu):
|
| 175 |
queue.append((nu, 0))
|
| 176 |
|
| 177 |
+
total_fetched = 0
|
| 178 |
+
|
| 179 |
while queue:
|
| 180 |
url, depth = queue.popleft()
|
| 181 |
+
if url in visited or depth > max_depth:
|
|
|
|
|
|
|
| 182 |
continue
|
| 183 |
visited.add(url)
|
| 184 |
|
| 185 |
try:
|
| 186 |
+
r = requests.get(url, headers=HEADERS, timeout=25, allow_redirects=True)
|
| 187 |
r.raise_for_status()
|
| 188 |
except (requests.RequestException, OSError) as e:
|
| 189 |
+
print(f" skip {url[:60]}: {e}")
|
| 190 |
continue
|
| 191 |
|
| 192 |
ctype = (r.headers.get("Content-Type") or "").lower()
|
| 193 |
+
if "html" not in ctype:
|
| 194 |
continue
|
| 195 |
|
| 196 |
soup = BeautifulSoup(r.text, "html.parser")
|
| 197 |
text = extract_text(soup)
|
| 198 |
+
if len(text) < 100:
|
| 199 |
continue
|
| 200 |
|
| 201 |
+
page_name = get_page_name(url)
|
| 202 |
+
category = _get_cat(url)
|
| 203 |
+
lang = page_lang(url)
|
| 204 |
+
|
| 205 |
+
all_data.append({
|
| 206 |
+
"page_name": page_name,
|
| 207 |
+
"url": url,
|
| 208 |
+
"source": "ena.tn",
|
| 209 |
+
"langue": lang,
|
| 210 |
+
"category": category,
|
| 211 |
+
"content": text,
|
| 212 |
+
"chars": len(text),
|
| 213 |
+
})
|
| 214 |
+
|
| 215 |
+
total_fetched += 1
|
| 216 |
+
if total_fetched % 20 == 0:
|
| 217 |
+
print(f" {total_fetched} pages fetched...")
|
| 218 |
+
|
| 219 |
+
# تابع الـ links إذا ما وصلناش للعمق الأقصى
|
| 220 |
if depth < max_depth:
|
| 221 |
for a in soup.find_all("a", href=True):
|
| 222 |
href = (a.get("href") or "").strip()
|
|
|
|
| 224 |
continue
|
| 225 |
next_u = urljoin(url, href)
|
| 226 |
nu = normalize_url(next_u)
|
| 227 |
+
if nu and not _should_skip(nu) and nu not in visited:
|
|
|
|
|
|
|
| 228 |
queue.append((nu, depth + 1))
|
| 229 |
|
| 230 |
return all_data
|
| 231 |
|
| 232 |
|
| 233 |
+
# ══════════════════════════════════════════════════════════
|
| 234 |
+
# 🚀 MAIN
|
| 235 |
+
# ══════════════════════════════════════════════════════════
|
| 236 |
+
|
| 237 |
if __name__ == "__main__":
|
| 238 |
+
print("=" * 60)
|
| 239 |
+
print("ENA Scraper v3.0 -- Starting crawl...")
|
| 240 |
+
print(f" Priority URLs: {len(PRIORITY_URLS)}")
|
| 241 |
+
print("=" * 60)
|
| 242 |
|
| 243 |
all_data = crawl(BASE_URLS, max_depth=3)
|
| 244 |
|
| 245 |
+
# إزالة المكررات
|
| 246 |
unique: list[dict] = []
|
| 247 |
seen_texts: set[str] = set()
|
| 248 |
+
seen_urls: set[str] = set()
|
| 249 |
for page in all_data:
|
| 250 |
+
# تجنب تكرار نفس النص أو نفس الـ URL
|
| 251 |
+
if page["content"] not in seen_texts and page["url"] not in seen_urls:
|
| 252 |
unique.append(page)
|
| 253 |
seen_texts.add(page["content"])
|
| 254 |
+
seen_urls.add(page["url"])
|
| 255 |
+
|
| 256 |
+
# إحصائيات
|
| 257 |
+
print("\n" + "=" * 60)
|
| 258 |
+
print(f"OK. Pages collected: {len(unique)}")
|
| 259 |
+
print(f"Total characters: {sum(p['chars'] for p in unique):,}")
|
| 260 |
+
|
| 261 |
+
from collections import Counter
|
| 262 |
+
cats = Counter(p["category"] for p in unique)
|
| 263 |
+
langs = Counter(p["langue"] for p in unique)
|
| 264 |
+
print("\nBy category:")
|
| 265 |
+
for cat, count in cats.most_common():
|
| 266 |
+
print(f" {cat}: {count}")
|
| 267 |
+
print("\nBy language:")
|
| 268 |
+
for lang, count in langs.items():
|
| 269 |
+
print(f" {lang}: {count}")
|
| 270 |
+
|
| 271 |
+
# حفظ
|
| 272 |
with open("ena_full_data.json", "w", encoding="utf-8") as f:
|
| 273 |
json.dump(unique, f, ensure_ascii=False, indent=2)
|
| 274 |
|
| 275 |
+
print("\nSaved to ena_full_data.json")
|
| 276 |
+
print("=" * 60)
|
| 277 |
+
print("Done! Now run: python build_chroma.py")
|