Spaces:

OrganizedProgrammers
/

DocIndexer-v2

Running

App Files Files Community

Almaatla committed 18 days ago

Commit

7f043e9

verified ·

1 Parent(s): 53bb3f7

Upload 2 files

Browse files

Files changed (2) hide show

app.py +230 -164
classes.py +106 -64

app.py CHANGED Viewed

@@ -1,165 +1,231 @@
-from fastapi.staticfiles import StaticFiles
-import requests, re, warnings
-from dotenv import load_dotenv
-from fastapi import FastAPI, Request, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import FileResponse, StreamingResponse
-from bs4 import BeautifulSoup
-import httpx
-from huggingface_hub.utils import set_client_factory
-from schemas import *
-from classes import *
-def hf_client_factory() -> httpx.Client:
-    return httpx.Client(verify=False)
-set_client_factory(hf_client_factory)
-warnings.filterwarnings("ignore")
-load_dotenv()
-meetings_mapping = {
-    "SA": [
-        "TSG_SA",
-        "WG1_Serv",
-        "WG2_Arch",
-        "WG3_Security",
-        "WG4_CODEC",
-        "WG5_TM",
-        "WG6_MissionCritical"
-    ],
-    "CT": [
-        "TSG_CT",
-        "WG1_mm-cc-sm_ex-CN1",
-        "WG2_capability_ex-T2",
-        "WG3_interworking_ex-CN3",
-        "WG4_protocollars_ex-CN4",
-        "WG5_osa_ex-CN5",
-        "WG6_Smartcard_Ex-T3"
-    ],
-    "RAN": [
-        "TSG_RAN",
-        "WG1_RL1",
-        "WG2_RL2",
-        "WG3_Iu",
-        "WG4_Radio",
-        "WG5_Test_ex-T1",
-        "WG6_legacyRAN"
-    ]
-}
-tdoc_indexer = TDocIndexer()
-spec_3gpp_indexer = Spec3GPPIndexer()
-spec_etsi_indexer = SpecETSIIndexer()
-app = FastAPI()
-app.add_middleware(CORSMiddleware, allow_credentials=True, allow_headers=["*"], allow_origins=["*"])
-app.mount("/static", StaticFiles(directory="static"), name="static")
-@app.get('/')
-def main():
-    return FileResponse("index.html")
-def get_folder_name(working_group: str):
-    if working_group.endswith("P"):
-        if working_group.startswith("S"):
-            return ("SA", 0)
-        if working_group.startswith("C"):
-            return ("CT", 0)
-        if working_group.startswith("R"):
-            return ("RAN", 0)
-    m = re.match(r"([A-Z]+)(\d+)", working_group)
-    if m:
-        code, num = m.groups()
-        return (code, int(num))
-    else:
-        raise ValueError("Unattended format")
-@app.get("/get_meetings/{working_group}")
-def get_meetings(working_group: str):
-    category, wg_number = get_folder_name(working_group)
-    folder = meetings_mapping[category][wg_number]
-    url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}/{folder}"
-    response = requests.get(url, verify=False)
-    responseHTML = response.text
-    soup = BeautifulSoup(responseHTML, "html.parser")
-    return {"url": url, "meetings": [item.get_text() for item in soup.select("tr td a") if item.get_text().startswith("TSG") or item.get_text().startswith("CT")]}
-@app.post("/index_tdocs/working_group")
-def index_tdocs_wg_progress(req: IndexTDoc):
-    if not req.wg:
-        raise HTTPException(status_code=400, detail="Working Group not defined !")
-    category, wg_number = get_folder_name(req.wg)
-    folder = meetings_mapping[category][wg_number]
-    url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}"
-    def generate_events():
-        yield f"event: info\ndata: {req.wg}\n\n"
-        for content in tdoc_indexer.process_workgroup(folder, url):
-            yield content
-        tdoc_indexer.save_indexer()
-        yield "event: end\ndata: Indexation ended successfully !\n\n"
-    return StreamingResponse(generate_events(), media_type="text/event-stream")
-@app.post("/index_tdocs/meeting")
-def index_tdocs_meeting_progress(req: IndexTDoc):
-    if not req.wg:
-        raise HTTPException(status_code=400, detail="Working Group not defined !")
-    if not req.meetings:
-        raise HTTPException(status_code=400, detail="Meetings not defined !")
-    category, wg_number = get_folder_name(req.wg)
-    folder = meetings_mapping[category][wg_number]
-    url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}/{folder}"
-    def generate_events():
-        yield f"event: get-maximum\ndata: {len(req.meetings)}\n\n"
-        for i, meet in enumerate(req.meetings):
-            yield f"event: info\ndata: {req.wg}-{meet}\n\n"
-            tdoc_indexer.process_meeting(meet, url)
-            yield f"event: progress\ndata: {i+1}\n\n"
-        tdoc_indexer.save_indexer()
-        yield "event: end\ndata: Indexation ended successfully !\n\n"
-    return StreamingResponse(generate_events(), media_type="text/event-stream")
-@app.post("/index_tdocs/all")
-def index_all_tdocs_progress():
-    def generate_events():
-        for content in tdoc_indexer.index_all_tdocs():
-            yield content
-        tdoc_indexer.save_indexer()
-        yield "event: end\ndata: Indexation ended successfully !\n\n"
-    return StreamingResponse(generate_events(), media_type="text/event-stream")
-@app.post("/index_specs/3gpp")
-def index_3gpp_specs_progress():
-    def generate_events():
-        for content in spec_3gpp_indexer.run():
-            yield content
-        yield "event: info\ndata: Saving index ...\n\n"
-        yield "event: get-maximum\ndata: 1\n\n"
-        yield "event: progress\ndata: 1\n\n"
-        spec_3gpp_indexer.save()
-        yield "event: info\ndata: Creating BM25 models ...\n\n"
-        yield "event: get-maximum\ndata: 1\n\n"
-        yield "event: progress\ndata: 1\n\n"
-        spec_3gpp_indexer.create_bm25_index()
-        yield "event: end\ndata: Indexation ended successfully !\n\n"
-    return StreamingResponse(generate_events(), media_type="text/event-stream")
-@app.post("/index_specs/etsi")
-def index_etsi_specs_progress():
-    def generate_events():
-        for content in spec_etsi_indexer.run():
-            yield content
-        yield "event: info\ndata: Saving index ...\n\n"
-        yield "event: get-maximum\ndata: 1\n\n"
-        yield "event: progress\ndata: 1\n\n"
-        spec_etsi_indexer.save()
-        yield "event: info\ndata: Creating BM25 models ...\n\n"
-        yield "event: get-maximum\ndata: 1\n\n"
-        yield "event: progress\ndata: 1\n\n"
-        spec_etsi_indexer.create_bm25_index()
-        yield "event: end\ndata: Indexation ended successfully !\n\n"
     return StreamingResponse(generate_events(), media_type="text/event-stream")

+from fastapi.staticfiles import StaticFiles
+import requests, re, warnings
+from dotenv import load_dotenv
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse, StreamingResponse
+from bs4 import BeautifulSoup
+import httpx
+from huggingface_hub.utils import set_client_factory
+from schemas import *
+from classes import *
+def hf_client_factory() -> httpx.Client:
+    """Build the httpx client that is registered with huggingface_hub.
+
+    The client is created with TLS certificate verification disabled.
+    """
+    insecure_client = httpx.Client(verify=False)
+    return insecure_client
+set_client_factory(hf_client_factory)  # register the factory above as huggingface_hub's HTTP client builder
+warnings.filterwarnings("ignore")  # suppress all warnings for the whole process
+load_dotenv()  # load environment variables from a .env file, if one is present
+meetings_mapping = {  # category -> folder names; index 0 is the TSG folder (also the URL root), index N is the WG<N>_* folder
+    "SA": [
+        "TSG_SA",
+        "WG1_Serv",
+        "WG2_Arch",
+        "WG3_Security",
+        "WG4_CODEC",
+        "WG5_TM",
+        "WG6_MissionCritical"
+    ],
+    "CT": [
+        "TSG_CT",
+        "WG1_mm-cc-sm_ex-CN1",
+        "WG2_capability_ex-T2",
+        "WG3_interworking_ex-CN3",
+        "WG4_protocollars_ex-CN4",
+        "WG5_osa_ex-CN5",
+        "WG6_Smartcard_Ex-T3"
+    ],
+    "RAN": [
+        "TSG_RAN",
+        "WG1_RL1",
+        "WG2_RL2",
+        "WG3_Iu",
+        "WG4_Radio",
+        "WG5_Test_ex-T1",
+        "WG6_legacyRAN"
+    ]
+}
+import threading
+_tdoc_indexer = None  # shared TDocIndexer, created on first use by get_tdoc_indexer()
+_spec_3gpp_indexer = None  # shared Spec3GPPIndexer, created on first use by get_spec_3gpp_indexer()
+_spec_etsi_indexer = None  # shared SpecETSIIndexer, created on first use by get_spec_etsi_indexer()
+_init_locks = {  # one lock per indexer kind; serialises first-time construction in the getters
+    "tdoc": threading.Lock(),
+    "3gpp": threading.Lock(),
+    "etsi": threading.Lock(),
+}
+_indexing_locks = {  # one lock per indexer kind; held while an indexing stream runs, endpoints answer 409 when it is busy
+    "tdoc": threading.Lock(),
+    "3gpp": threading.Lock(),
+    "etsi": threading.Lock(),
+}
+def get_tdoc_indexer():
+    """Return the shared TDocIndexer, building it on the first call.
+
+    Uses double-checked locking on ``_init_locks["tdoc"]`` so that concurrent
+    first callers construct the indexer only once.
+    """
+    global _tdoc_indexer
+    # Fast path: already built, no lock needed.
+    if _tdoc_indexer is not None:
+        return _tdoc_indexer
+    with _init_locks["tdoc"]:
+        # Re-check: another thread may have built it while we waited.
+        if _tdoc_indexer is None:
+            _tdoc_indexer = TDocIndexer()
+        return _tdoc_indexer
+def get_spec_3gpp_indexer():
+    """Return the shared Spec3GPPIndexer, building it on the first call.
+
+    Uses double-checked locking on ``_init_locks["3gpp"]`` so that concurrent
+    first callers construct the indexer only once.
+    """
+    global _spec_3gpp_indexer
+    # Fast path: already built, no lock needed.
+    if _spec_3gpp_indexer is not None:
+        return _spec_3gpp_indexer
+    with _init_locks["3gpp"]:
+        # Re-check: another thread may have built it while we waited.
+        if _spec_3gpp_indexer is None:
+            _spec_3gpp_indexer = Spec3GPPIndexer()
+        return _spec_3gpp_indexer
+def get_spec_etsi_indexer():
+    """Return the shared SpecETSIIndexer, building it on the first call.
+
+    Uses double-checked locking on ``_init_locks["etsi"]`` so that concurrent
+    first callers construct the indexer only once.
+    """
+    global _spec_etsi_indexer
+    # Fast path: already built, no lock needed.
+    if _spec_etsi_indexer is not None:
+        return _spec_etsi_indexer
+    with _init_locks["etsi"]:
+        # Re-check: another thread may have built it while we waited.
+        if _spec_etsi_indexer is None:
+            _spec_etsi_indexer = SpecETSIIndexer()
+        return _spec_etsi_indexer
+app = FastAPI()  # the ASGI application all routes below are registered on
+app.add_middleware(CORSMiddleware, allow_credentials=True, allow_headers=["*"], allow_origins=["*"])  # NOTE(review): wildcard origins together with credentials is very permissive — confirm this is intended
+app.mount("/static", StaticFiles(directory="static"), name="static")  # serve the local "static" directory under /static
+@app.get('/')
+def main():
+    """Serve ``index.html`` at the site root."""
+    landing_page = FileResponse("index.html")
+    return landing_page
+def get_folder_name(working_group: str):
+    """Translate a working-group code into a ``(category, index)`` pair.
+
+    Codes ending in "P" and starting with S, C or R map to index 0 of the
+    "SA", "CT" or "RAN" category respectively. Any other code must begin
+    with uppercase letters followed by digits; the letters are returned as
+    the category and the digits as an integer index.
+
+    Raises:
+        ValueError: if the code matches neither form.
+    """
+    if working_group.endswith("P"):
+        category_by_initial = {"S": "SA", "C": "CT", "R": "RAN"}
+        category = category_by_initial.get(working_group[:1])
+        if category is not None:
+            return (category, 0)
+        # Unknown initial: fall through to the generic letters+digits form.
+    parsed = re.match(r"([A-Z]+)(\d+)", working_group)
+    if parsed is None:
+        raise ValueError("Unattended format")
+    prefix, digits = parsed.groups()
+    return (prefix, int(digits))
+@app.get("/get_meetings/{working_group}")
+def get_meetings(working_group: str):
+    """List the meeting folders published for a working group.
+
+    Fetches the working group's directory listing from the 3GPP FTP site and
+    returns the listing URL together with the text of every table link that
+    starts with "TSG" or "CT".
+    """
+    category, wg_number = get_folder_name(working_group)
+    folder = meetings_mapping[category][wg_number]
+    tsg_root = meetings_mapping[category][0]
+    url = f"https://www.3gpp.org/ftp/{tsg_root}/{folder}"
+    listing = requests.get(url, verify=False, timeout=(10, 30))
+    soup = BeautifulSoup(listing.text, "html.parser")
+    meetings = []
+    for anchor in soup.select("tr td a"):
+        label = anchor.get_text()
+        if label.startswith(("TSG", "CT")):
+            meetings.append(label)
+    return {"url": url, "meetings": meetings}
+@app.post("/index_tdocs/working_group")
+def index_tdocs_wg_progress(req: IndexTDoc):
+    """Index every TDoc of one working group, streaming progress as SSE.
+
+    Responds 400 when ``req.wg`` is empty and 409 when another TDoc indexing
+    stream currently holds the "tdoc" lock. Otherwise returns a
+    ``text/event-stream`` response that relays the events produced by the
+    indexer, saves the index and finishes with an ``end`` event.
+    """
+    if not req.wg:
+        raise HTTPException(status_code=400, detail="Working Group not defined !")
+    # Do everything that can raise BEFORE taking the lock. The lock is only
+    # released by the generator's ``finally``; an exception raised here after
+    # acquiring it would leave it held and make every later call answer 409.
+    category, wg_number = get_folder_name(req.wg)
+    folder = meetings_mapping[category][wg_number]
+    url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}"
+    indexer = get_tdoc_indexer()
+    if not _indexing_locks["tdoc"].acquire(blocking=False):
+        raise HTTPException(status_code=409, detail="TDoc indexing already in progress")
+    def generate_events():
+        try:
+            yield f"event: info\ndata: {req.wg}\n\n"
+            for content in indexer.process_workgroup(folder, url):
+                yield content
+            indexer.save_indexer()
+            yield "event: end\ndata: Indexation ended successfully !\n\n"
+        finally:
+            # Runs on normal completion, on error and when the stream is closed.
+            _indexing_locks["tdoc"].release()
+    return StreamingResponse(generate_events(), media_type="text/event-stream")
+@app.post("/index_tdocs/meeting")
+def index_tdocs_meeting_progress(req: IndexTDoc):
+    """Index the TDocs of selected meetings, streaming progress as SSE.
+
+    Responds 400 when ``req.wg`` or ``req.meetings`` is empty and 409 when
+    another TDoc indexing stream currently holds the "tdoc" lock. Otherwise
+    returns a ``text/event-stream`` response that announces the number of
+    meetings, emits ``info`` and ``progress`` events per meeting, saves the
+    index and finishes with an ``end`` event.
+    """
+    if not req.wg:
+        raise HTTPException(status_code=400, detail="Working Group not defined !")
+    if not req.meetings:
+        raise HTTPException(status_code=400, detail="Meetings not defined !")
+    # Do everything that can raise BEFORE taking the lock. The lock is only
+    # released by the generator's ``finally``; an exception raised here after
+    # acquiring it would leave it held and make every later call answer 409.
+    category, wg_number = get_folder_name(req.wg)
+    folder = meetings_mapping[category][wg_number]
+    url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}/{folder}"
+    indexer = get_tdoc_indexer()
+    if not _indexing_locks["tdoc"].acquire(blocking=False):
+        raise HTTPException(status_code=409, detail="TDoc indexing already in progress")
+    def generate_events():
+        try:
+            yield f"event: get-maximum\ndata: {len(req.meetings)}\n\n"
+            for i, meet in enumerate(req.meetings):
+                yield f"event: info\ndata: {req.wg}-{meet}\n\n"
+                indexer.process_meeting(meet, url)
+                yield f"event: progress\ndata: {i+1}\n\n"
+            indexer.save_indexer()
+            yield "event: end\ndata: Indexation ended successfully !\n\n"
+        finally:
+            # Runs on normal completion, on error and when the stream is closed.
+            _indexing_locks["tdoc"].release()
+    return StreamingResponse(generate_events(), media_type="text/event-stream")
+@app.post("/index_tdocs/all")
+def index_all_tdocs_progress():
+    """Index all TDocs, streaming progress as SSE.
+
+    Responds 409 when another TDoc indexing stream currently holds the "tdoc"
+    lock. Otherwise returns a ``text/event-stream`` response that relays the
+    events produced by the indexer, saves the index and finishes with an
+    ``end`` event.
+    """
+    # Obtain the indexer BEFORE taking the lock: if its first-time
+    # construction raised while the lock was held, nothing would release the
+    # lock and every later call would answer 409.
+    indexer = get_tdoc_indexer()
+    if not _indexing_locks["tdoc"].acquire(blocking=False):
+        raise HTTPException(status_code=409, detail="TDoc indexing already in progress")
+    def generate_events():
+        try:
+            for content in indexer.index_all_tdocs():
+                yield content
+            indexer.save_indexer()
+            yield "event: end\ndata: Indexation ended successfully !\n\n"
+        finally:
+            # Runs on normal completion, on error and when the stream is closed.
+            _indexing_locks["tdoc"].release()
+    return StreamingResponse(generate_events(), media_type="text/event-stream")
+@app.post("/index_specs/3gpp")
+def index_3gpp_specs_progress():
+    """Index the 3GPP specifications, streaming progress as SSE.
+
+    Responds 409 when another 3GPP indexing stream currently holds the "3gpp"
+    lock. Otherwise returns a ``text/event-stream`` response that relays the
+    indexer's events, then saves the index, builds the BM25 index and
+    finishes with an ``end`` event.
+    """
+    # Obtain the indexer BEFORE taking the lock: if its first-time
+    # construction raised while the lock was held, nothing would release the
+    # lock and every later call would answer 409.
+    indexer = get_spec_3gpp_indexer()
+    if not _indexing_locks["3gpp"].acquire(blocking=False):
+        raise HTTPException(status_code=409, detail="3GPP spec indexing already in progress")
+    def generate_events():
+        try:
+            for content in indexer.run():
+                yield content
+            # The progress events for each one-step phase are sent before the
+            # step itself runs.
+            yield "event: info\ndata: Saving index ...\n\n"
+            yield "event: get-maximum\ndata: 1\n\n"
+            yield "event: progress\ndata: 1\n\n"
+            indexer.save()
+            yield "event: info\ndata: Creating BM25 models ...\n\n"
+            yield "event: get-maximum\ndata: 1\n\n"
+            yield "event: progress\ndata: 1\n\n"
+            indexer.create_bm25_index()
+            yield "event: end\ndata: Indexation ended successfully !\n\n"
+        finally:
+            # Runs on normal completion, on error and when the stream is closed.
+            _indexing_locks["3gpp"].release()
+    return StreamingResponse(generate_events(), media_type="text/event-stream")
+@app.post("/index_specs/etsi")
+def index_etsi_specs_progress():
+    """Index the ETSI specifications, streaming progress as SSE.
+
+    Responds 409 when another ETSI indexing stream currently holds the "etsi"
+    lock. Otherwise returns a ``text/event-stream`` response that relays the
+    indexer's events, then saves the index, builds the BM25 index and
+    finishes with an ``end`` event.
+    """
+    # Obtain the indexer BEFORE taking the lock: if its first-time
+    # construction raised while the lock was held, nothing would release the
+    # lock and every later call would answer 409.
+    indexer = get_spec_etsi_indexer()
+    if not _indexing_locks["etsi"].acquire(blocking=False):
+        raise HTTPException(status_code=409, detail="ETSI spec indexing already in progress")
+    def generate_events():
+        try:
+            for content in indexer.run():
+                yield content
+            # The progress events for each one-step phase are sent before the
+            # step itself runs.
+            yield "event: info\ndata: Saving index ...\n\n"
+            yield "event: get-maximum\ndata: 1\n\n"
+            yield "event: progress\ndata: 1\n\n"
+            indexer.save()
+            yield "event: info\ndata: Creating BM25 models ...\n\n"
+            yield "event: get-maximum\ndata: 1\n\n"
+            yield "event: progress\ndata: 1\n\n"
+            indexer.create_bm25_index()
+            yield "event: end\ndata: Indexation ended successfully !\n\n"
+        finally:
+            # Runs on normal completion, on error and when the stream is closed.
+            _indexing_locks["etsi"].release()
     return StreamingResponse(generate_events(), media_type="text/event-stream")

classes.py CHANGED Viewed

@@ -119,13 +119,12 @@ class TDocIndexer:
             meeting_contents = self.get_docs_from_url(meeting_url)
             key = None
-            if "docs" in [x.lower() for x in meeting_contents]:
-                key = "docs"
-            elif "tdocs" in [x.lower() for x in meeting_contents]:
-                key = "tdocs"
-            elif "tdoc" in [x.lower() for x in meeting_contents]:
-                key = "tdoc"
             if key is not None:
                 docs_url = f"{meeting_url}/{key}"
@@ -143,8 +142,13 @@ class TDocIndexer:
                         print(f"{docs_indexed_count} fichiers trouvés")
                 # 2. Vérifier le sous-dossier ZIP s'il existe
-                if "zip" in [x.lower() for x in docs_files]:
-                    zip_url = f"{docs_url}/zip"
                     with self.print_lock:
                         print(f"Vérification du dossier ./zip: {zip_url}")
@@ -333,7 +337,7 @@ class Spec3GPPIndexer:
         url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
         try:
-            response = requests.get(url, verify=False)
             if response.status_code != 200:
                 return []
@@ -407,7 +411,8 @@ class Spec3GPPIndexer:
         response = requests.get(
             'https://www.3gpp.org/dynareport?code=status-report.htm',
             headers={"User-Agent": 'Mozilla/5.0'},
-            verify=False
         )
         dfs = pd.read_html(io.StringIO(response.text))
         for x in range(len(dfs)):
@@ -427,8 +432,14 @@ class Spec3GPPIndexer:
         if self.STOP_EVENT.is_set():
             return
         try:
-            doc_id = str(spec['spec_num'])
-            version_code = self.version_to_code(str(spec['vers']))
             if not version_code:
                 with self.DICT_LOCK:
                     self.processed_count += 1
@@ -436,21 +447,27 @@ class Spec3GPPIndexer:
             document = None
             already_indexed = False
             with self.DOCUMENT_LOCK:
-                doc_in_cache = doc_id in self.documents_by_spec_num and \
-                               self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version_code)
-            if doc_in_cache and doc_id not in self.specifications_passed:
-                document = self.documents_by_spec_num[doc_id]
-                self.specifications_passed.add(doc_id)
-                already_indexed = True
-            elif doc_id not in self.specifications_passed:
                 doc_content = self.get_spec_content(doc_id, version_code)
                 if doc_content:
                     document = {"content": doc_content, "hash": self.hasher(doc_id, version_code)}
                     with self.DOCUMENT_LOCK:
                         self.documents_by_spec_num[doc_id] = document
-                        self.specifications_passed.add(doc_id)
                     already_indexed = False
             if document:
@@ -482,9 +499,10 @@ class Spec3GPPIndexer:
     def get_document(self, spec_id: str, spec_title: str):
         text = [f"{spec_id} - {spec_title}\n"]
-        for section in self.spec_contents:
-            if spec_id == section["doc_id"]:
-                text.extend([f"{section['section']}\n\n{section['content']}"])
         return text
     def create_bm25_index(self):
@@ -494,19 +512,21 @@ class Spec3GPPIndexer:
         for specification in dataset_metadata:
             if specification['id'] in unique_specs: continue
-            for section in self.spec_contents:
-                if specification['id'] == section['doc_id']:
-                    corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
                 "id": specification['id'],
                 "title": specification['title'],
-                "section_title": section['section'],
                 "version": specification['version'],
                 "type": specification['type'],
                 "working_group": specification['working_group'],
                 "url": specification['url'],
                 "scope": specification['scope']
             }})
         corpus_text = [doc["text"] for doc in corpus_json]
         corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
@@ -612,13 +632,14 @@ class SpecETSIIndexer:
             verify=False,
             headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."},
             data=json.dumps({"username": os.environ.get("EOL_USER"), "password": os.environ.get("EOL_PASSWORD")}),
         )
         print("Récupération des métadonnées TS/TR …")
         url_ts = "https://www.etsi.org/?option=com_standardssearch&view=data&format=csv&includeScope=1&page=1&search=&title=1&etsiNumber=1&content=0&version=0&onApproval=0&published=1&withdrawn=0&historical=0&isCurrent=1&superseded=0&harmonized=0&keyword=&TB=&stdType=TS&frequency=&mandate=&collection=&sort=1"
         url_tr = url_ts.replace("stdType=TS", "stdType=TR")
-        data_ts = self.session.get(url_ts, verify=False).content
-        data_tr = self.session.get(url_tr, verify=False).content
         df_ts = pd.read_csv(io.StringIO(data_ts.decode('utf-8')), sep=";", skiprows=1, index_col=False)
         df_tr = pd.read_csv(io.StringIO(data_tr.decode('utf-8')), sep=";", skiprows=1, index_col=False)
@@ -640,12 +661,11 @@ class SpecETSIIndexer:
         df_ts["Type"] = "TS"
         df_tr["Type"] = "TR"
         df = pd.concat([df_ts, df_tr])
         unique_df = df.loc[df.groupby("ETSI deliverable")["temp"].idxmax()]
         unique_df = unique_df.drop(columns="temp")
         unique_df = unique_df[(~unique_df["title"].str.contains("3GPP", case=True, na=False))]
-        df = df.drop(columns="temp")
-        df = df[(~df["title"].str.contains("3GPP", case=True, na=False))]
-        return df
     @staticmethod
     def hasher(specification: str, version: str):
@@ -660,11 +680,12 @@ class SpecETSIIndexer:
     def get_document(self, spec_id: str, spec_title: str):
         text = [f"{spec_id} - {spec_title}\n"]
-        for section in self.spec_contents:
-            if spec_id == section["doc_id"]:
-                text.extend([f"{section['section']}\n\n{section['content']}"])
         return text
     def get_text(self, specification: str):
         if self.STOP_EVENT.is_set():
             return None, []
@@ -679,7 +700,8 @@ class SpecETSIIndexer:
             pdf_link = row.iloc[0]["PDF link"]
             response = self.session.get(
                 pdf_link,
-                headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...'}
             )
             if response.status_code != 200:
                 print(f"[ERREUR] Echec du téléchargement du PDF pour {specification}.")
@@ -733,40 +755,58 @@ class SpecETSIIndexer:
     def process_specification(self, spec):
         if self.STOP_EVENT.is_set():
             return
         try:
             version = spec.get('Version')
-            if not version: return
-            doc_id = str(spec.get("ETSI deliverable"))
             document = None
             already_indexed = False
             with self.DOCUMENT_LOCK:
-                if (doc_id in self.documents_by_spec_num
-                    and self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version)
-                    and doc_id not in self.specifications_passed):
-                    document = self.documents_by_spec_num[doc_id]
-                    self.specifications_passed.add(doc_id)
                     already_indexed = True
-                elif doc_id in self.specifications_passed:
                     document = self.documents_by_spec_num[doc_id]
                     already_indexed = True
                 else:
-                    document_content = self.get_spec_content(doc_id)
-                    if document_content:
-                        self.documents_by_spec_num[doc_id] = {"content": document_content, "hash": self.hasher(doc_id, version)}
-                        document = {"content": document_content, "hash": self.hasher(doc_id, version)}
-                        self.specifications_passed.add(doc_id)
-                        already_indexed = False
             if document:
-                string_key = f"{doc_id}+-+{spec['title']}+-+{spec['Type']}+-+{spec['Version']}"
                 metadata = {
                     "id": str(doc_id),
-                    "title": spec["title"],
-                    "type": spec["Type"],
                     "version": version,
-                    "url": spec["PDF link"],
-                    "scope": "" if not document else self.get_scope(document["content"])
                 }
                 with self.DICT_LOCK:
                     self.indexed_specifications[string_key] = metadata
@@ -827,18 +867,20 @@ class SpecETSIIndexer:
         for specification in dataset_metadata:
             if specification['id'] in unique_specs: continue
-            for section in self.spec_contents:
-                if specification['id'] == section['doc_id']:
-                    corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
                 "id": specification['id'],
                 "title": specification['title'],
-                "section_title": section['section'],
                 "version": specification['version'],
                 "type": specification['type'],
                 "url": specification['url'],
                 "scope": specification['scope']
             }})
         corpus_text = [doc["text"] for doc in corpus_json]
         corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

             meeting_contents = self.get_docs_from_url(meeting_url)
             key = None
+            for item in meeting_contents:
+                normalized = item.lower().rstrip('/')
+                if normalized in ("docs", "tdocs", "tdoc"):
+                    key = item.rstrip('/')
+                    break
             if key is not None:
                 docs_url = f"{meeting_url}/{key}"
                         print(f"{docs_indexed_count} fichiers trouvés")
                 # 2. Vérifier le sous-dossier ZIP s'il existe
+                zip_folder = None
+                for item in docs_files:
+                    if item.lower().rstrip('/') == "zip":
+                        zip_folder = item.rstrip('/')
+                        break
+                if zip_folder:
+                    zip_url = f"{docs_url}/{zip_folder}"
                     with self.print_lock:
                         print(f"Vérification du dossier ./zip: {zip_url}")
         url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
         try:
+            response = requests.get(url, verify=False, timeout=(10, 120))
             if response.status_code != 200:
                 return []
         response = requests.get(
             'https://www.3gpp.org/dynareport?code=status-report.htm',
             headers={"User-Agent": 'Mozilla/5.0'},
+            verify=False,
+            timeout=(10, 60)
         )
         dfs = pd.read_html(io.StringIO(response.text))
         for x in range(len(dfs)):
         if self.STOP_EVENT.is_set():
             return
         try:
+            spec_num = spec.get('spec_num')
+            vers = spec.get('vers')
+            if spec_num is None or vers is None:
+                with self.DICT_LOCK:
+                    self.processed_count += 1
+                return
+            doc_id = str(spec_num)
+            version_code = self.version_to_code(str(vers))
             if not version_code:
                 with self.DICT_LOCK:
                     self.processed_count += 1
             document = None
             already_indexed = False
+            needs_fetch = False
             with self.DOCUMENT_LOCK:
+                if doc_id in self.specifications_passed:
+                    document = self.documents_by_spec_num.get(doc_id)
+                    already_indexed = True
+                elif (doc_id in self.documents_by_spec_num
+                      and self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version_code)):
+                    document = self.documents_by_spec_num[doc_id]
+                    self.specifications_passed.add(doc_id)
+                    already_indexed = True
+                else:
+                    self.specifications_passed.add(doc_id)
+                    needs_fetch = True
+            if needs_fetch:
                 doc_content = self.get_spec_content(doc_id, version_code)
                 if doc_content:
                     document = {"content": doc_content, "hash": self.hasher(doc_id, version_code)}
                     with self.DOCUMENT_LOCK:
                         self.documents_by_spec_num[doc_id] = document
                     already_indexed = False
             if document:
     def get_document(self, spec_id: str, spec_title: str):
         text = [f"{spec_id} - {spec_title}\n"]
+        doc_data = self.documents_by_spec_num.get(spec_id)
+        if doc_data:
+            for section_title, content in doc_data["content"].items():
+                text.append(f"{section_title}\n\n{content}")
         return text
     def create_bm25_index(self):
         for specification in dataset_metadata:
             if specification['id'] in unique_specs: continue
+            unique_specs.add(specification['id'])
+            doc_data = self.documents_by_spec_num.get(specification['id'])
+            if doc_data:
+                for section_title, content in doc_data["content"].items():
+                    corpus_json.append({"text": f"{section_title}\n{content}", "metadata": {
                 "id": specification['id'],
                 "title": specification['title'],
+                "section_title": section_title,
                 "version": specification['version'],
                 "type": specification['type'],
                 "working_group": specification['working_group'],
                 "url": specification['url'],
                 "scope": specification['scope']
             }})
         corpus_text = [doc["text"] for doc in corpus_json]
         corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
             verify=False,
             headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."},
             data=json.dumps({"username": os.environ.get("EOL_USER"), "password": os.environ.get("EOL_PASSWORD")}),
+            timeout=(10, 30),
         )
         print("Récupération des métadonnées TS/TR …")
         url_ts = "https://www.etsi.org/?option=com_standardssearch&view=data&format=csv&includeScope=1&page=1&search=&title=1&etsiNumber=1&content=0&version=0&onApproval=0&published=1&withdrawn=0&historical=0&isCurrent=1&superseded=0&harmonized=0&keyword=&TB=&stdType=TS&frequency=&mandate=&collection=&sort=1"
         url_tr = url_ts.replace("stdType=TS", "stdType=TR")
+        data_ts = self.session.get(url_ts, verify=False, timeout=(10, 120)).content
+        data_tr = self.session.get(url_tr, verify=False, timeout=(10, 120)).content
         df_ts = pd.read_csv(io.StringIO(data_ts.decode('utf-8')), sep=";", skiprows=1, index_col=False)
         df_tr = pd.read_csv(io.StringIO(data_tr.decode('utf-8')), sep=";", skiprows=1, index_col=False)
         df_ts["Type"] = "TS"
         df_tr["Type"] = "TR"
         df = pd.concat([df_ts, df_tr])
+        df = df.dropna(subset=["ETSI deliverable", "Version"])
         unique_df = df.loc[df.groupby("ETSI deliverable")["temp"].idxmax()]
         unique_df = unique_df.drop(columns="temp")
         unique_df = unique_df[(~unique_df["title"].str.contains("3GPP", case=True, na=False))]
+        return unique_df
     @staticmethod
     def hasher(specification: str, version: str):
     def get_document(self, spec_id: str, spec_title: str):
         text = [f"{spec_id} - {spec_title}\n"]
+        doc_data = self.documents_by_spec_num.get(spec_id)
+        if doc_data:
+            for section_title, content in doc_data["content"].items():
+                text.append(f"{section_title}\n\n{content}")
         return text
     def get_text(self, specification: str):
         if self.STOP_EVENT.is_set():
             return None, []
             pdf_link = row.iloc[0]["PDF link"]
             response = self.session.get(
                 pdf_link,
+                headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...'},
+                timeout=(10, 120)
             )
             if response.status_code != 200:
                 print(f"[ERREUR] Echec du téléchargement du PDF pour {specification}.")
     def process_specification(self, spec):
         if self.STOP_EVENT.is_set():
             return
+        doc_id = "unknown"
         try:
             version = spec.get('Version')
+            if not version or (isinstance(version, float) and pd.isna(version)):
+                with self.DICT_LOCK:
+                    self.processed_count += 1
+                return
+            doc_id = spec.get("ETSI deliverable")
+            if not doc_id or (isinstance(doc_id, float) and pd.isna(doc_id)):
+                with self.DICT_LOCK:
+                    self.processed_count += 1
+                return
+            doc_id = str(doc_id)
             document = None
             already_indexed = False
+            needs_fetch = False
             with self.DOCUMENT_LOCK:
+                if doc_id in self.specifications_passed:
+                    document = self.documents_by_spec_num.get(doc_id)
                     already_indexed = True
+                elif (doc_id in self.documents_by_spec_num
+                      and self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version)):
                     document = self.documents_by_spec_num[doc_id]
+                    self.specifications_passed.add(doc_id)
                     already_indexed = True
                 else:
+                    self.specifications_passed.add(doc_id)
+                    needs_fetch = True
+            if needs_fetch:
+                document_content = self.get_spec_content(doc_id)
+                if document_content:
+                    document = {"content": document_content, "hash": self.hasher(doc_id, version)}
+                    with self.DOCUMENT_LOCK:
+                        self.documents_by_spec_num[doc_id] = document
+                    already_indexed = False
             if document:
+                title = spec.get("title", "")
+                if isinstance(title, float) and pd.isna(title):
+                    title = ""
+                spec_type = spec.get("Type", "")
+                pdf_link = spec.get("PDF link", "")
+                string_key = f"{doc_id}+-+{title}+-+{spec_type}+-+{version}"
                 metadata = {
                     "id": str(doc_id),
+                    "title": title,
+                    "type": spec_type,
                     "version": version,
+                    "url": pdf_link,
+                    "scope": self.get_scope(document["content"])
                 }
                 with self.DICT_LOCK:
                     self.indexed_specifications[string_key] = metadata
         for specification in dataset_metadata:
             if specification['id'] in unique_specs: continue
+            unique_specs.add(specification['id'])
+            doc_data = self.documents_by_spec_num.get(specification['id'])
+            if doc_data:
+                for section_title, content in doc_data["content"].items():
+                    corpus_json.append({"text": f"{section_title}\n{content}", "metadata": {
                 "id": specification['id'],
                 "title": specification['title'],
+                "section_title": section_title,
                 "version": specification['version'],
                 "type": specification['type'],
                 "url": specification['url'],
                 "scope": specification['scope']
             }})
         corpus_text = [doc["text"] for doc in corpus_json]
         corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")