Almaatla committed on
Commit
7f043e9
·
verified ·
1 Parent(s): 53bb3f7

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +230 -164
  2. classes.py +106 -64
app.py CHANGED
@@ -1,165 +1,231 @@
1
- from fastapi.staticfiles import StaticFiles
2
- import requests, re, warnings
3
- from dotenv import load_dotenv
4
- from fastapi import FastAPI, Request, HTTPException
5
- from fastapi.middleware.cors import CORSMiddleware
6
- from fastapi.responses import FileResponse, StreamingResponse
7
- from bs4 import BeautifulSoup
8
-
9
- import httpx
10
- from huggingface_hub.utils import set_client_factory
11
-
12
- from schemas import *
13
- from classes import *
14
-
15
- def hf_client_factory() -> httpx.Client:
16
- return httpx.Client(verify=False)
17
-
18
- set_client_factory(hf_client_factory)
19
-
20
- warnings.filterwarnings("ignore")
21
- load_dotenv()
22
-
23
- meetings_mapping = {
24
- "SA": [
25
- "TSG_SA",
26
- "WG1_Serv",
27
- "WG2_Arch",
28
- "WG3_Security",
29
- "WG4_CODEC",
30
- "WG5_TM",
31
- "WG6_MissionCritical"
32
- ],
33
- "CT": [
34
- "TSG_CT",
35
- "WG1_mm-cc-sm_ex-CN1",
36
- "WG2_capability_ex-T2",
37
- "WG3_interworking_ex-CN3",
38
- "WG4_protocollars_ex-CN4",
39
- "WG5_osa_ex-CN5",
40
- "WG6_Smartcard_Ex-T3"
41
- ],
42
- "RAN": [
43
- "TSG_RAN",
44
- "WG1_RL1",
45
- "WG2_RL2",
46
- "WG3_Iu",
47
- "WG4_Radio",
48
- "WG5_Test_ex-T1",
49
- "WG6_legacyRAN"
50
- ]
51
- }
52
-
53
- tdoc_indexer = TDocIndexer()
54
- spec_3gpp_indexer = Spec3GPPIndexer()
55
- spec_etsi_indexer = SpecETSIIndexer()
56
-
57
- app = FastAPI()
58
- app.add_middleware(CORSMiddleware, allow_credentials=True, allow_headers=["*"], allow_origins=["*"])
59
- app.mount("/static", StaticFiles(directory="static"), name="static")
60
-
61
- @app.get('/')
62
- def main():
63
- return FileResponse("index.html")
64
- def get_folder_name(working_group: str):
65
- if working_group.endswith("P"):
66
- if working_group.startswith("S"):
67
- return ("SA", 0)
68
- if working_group.startswith("C"):
69
- return ("CT", 0)
70
- if working_group.startswith("R"):
71
- return ("RAN", 0)
72
- m = re.match(r"([A-Z]+)(\d+)", working_group)
73
- if m:
74
- code, num = m.groups()
75
- return (code, int(num))
76
- else:
77
- raise ValueError("Unattended format")
78
-
79
- @app.get("/get_meetings/{working_group}")
80
- def get_meetings(working_group: str):
81
- category, wg_number = get_folder_name(working_group)
82
- folder = meetings_mapping[category][wg_number]
83
- url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}/{folder}"
84
- response = requests.get(url, verify=False)
85
- responseHTML = response.text
86
- soup = BeautifulSoup(responseHTML, "html.parser")
87
- return {"url": url, "meetings": [item.get_text() for item in soup.select("tr td a") if item.get_text().startswith("TSG") or item.get_text().startswith("CT")]}
88
-
89
- @app.post("/index_tdocs/working_group")
90
- def index_tdocs_wg_progress(req: IndexTDoc):
91
- if not req.wg:
92
- raise HTTPException(status_code=400, detail="Working Group not defined !")
93
- category, wg_number = get_folder_name(req.wg)
94
- folder = meetings_mapping[category][wg_number]
95
- url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}"
96
- def generate_events():
97
- yield f"event: info\ndata: {req.wg}\n\n"
98
- for content in tdoc_indexer.process_workgroup(folder, url):
99
- yield content
100
- tdoc_indexer.save_indexer()
101
- yield "event: end\ndata: Indexation ended successfully !\n\n"
102
- return StreamingResponse(generate_events(), media_type="text/event-stream")
103
-
104
- @app.post("/index_tdocs/meeting")
105
- def index_tdocs_meeting_progress(req: IndexTDoc):
106
- if not req.wg:
107
- raise HTTPException(status_code=400, detail="Working Group not defined !")
108
- if not req.meetings:
109
- raise HTTPException(status_code=400, detail="Meetings not defined !")
110
-
111
- category, wg_number = get_folder_name(req.wg)
112
- folder = meetings_mapping[category][wg_number]
113
- url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}/{folder}"
114
- def generate_events():
115
- yield f"event: get-maximum\ndata: {len(req.meetings)}\n\n"
116
- for i, meet in enumerate(req.meetings):
117
- yield f"event: info\ndata: {req.wg}-{meet}\n\n"
118
- tdoc_indexer.process_meeting(meet, url)
119
- yield f"event: progress\ndata: {i+1}\n\n"
120
- tdoc_indexer.save_indexer()
121
- yield "event: end\ndata: Indexation ended successfully !\n\n"
122
- return StreamingResponse(generate_events(), media_type="text/event-stream")
123
-
124
-
125
- @app.post("/index_tdocs/all")
126
- def index_all_tdocs_progress():
127
- def generate_events():
128
- for content in tdoc_indexer.index_all_tdocs():
129
- yield content
130
- tdoc_indexer.save_indexer()
131
- yield "event: end\ndata: Indexation ended successfully !\n\n"
132
- return StreamingResponse(generate_events(), media_type="text/event-stream")
133
-
134
-
135
- @app.post("/index_specs/3gpp")
136
- def index_3gpp_specs_progress():
137
- def generate_events():
138
- for content in spec_3gpp_indexer.run():
139
- yield content
140
- yield "event: info\ndata: Saving index ...\n\n"
141
- yield "event: get-maximum\ndata: 1\n\n"
142
- yield "event: progress\ndata: 1\n\n"
143
- spec_3gpp_indexer.save()
144
- yield "event: info\ndata: Creating BM25 models ...\n\n"
145
- yield "event: get-maximum\ndata: 1\n\n"
146
- yield "event: progress\ndata: 1\n\n"
147
- spec_3gpp_indexer.create_bm25_index()
148
- yield "event: end\ndata: Indexation ended successfully !\n\n"
149
- return StreamingResponse(generate_events(), media_type="text/event-stream")
150
-
151
- @app.post("/index_specs/etsi")
152
- def index_etsi_specs_progress():
153
- def generate_events():
154
- for content in spec_etsi_indexer.run():
155
- yield content
156
- yield "event: info\ndata: Saving index ...\n\n"
157
- yield "event: get-maximum\ndata: 1\n\n"
158
- yield "event: progress\ndata: 1\n\n"
159
- spec_etsi_indexer.save()
160
- yield "event: info\ndata: Creating BM25 models ...\n\n"
161
- yield "event: get-maximum\ndata: 1\n\n"
162
- yield "event: progress\ndata: 1\n\n"
163
- spec_etsi_indexer.create_bm25_index()
164
- yield "event: end\ndata: Indexation ended successfully !\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  return StreamingResponse(generate_events(), media_type="text/event-stream")
 
1
+ from fastapi.staticfiles import StaticFiles
2
+ import requests, re, warnings
3
+ from dotenv import load_dotenv
4
+ from fastapi import FastAPI, Request, HTTPException
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from fastapi.responses import FileResponse, StreamingResponse
7
+ from bs4 import BeautifulSoup
8
+
9
+ import httpx
10
+ from huggingface_hub.utils import set_client_factory
11
+
12
+ from schemas import *
13
+ from classes import *
14
+
15
def hf_client_factory() -> httpx.Client:
    """Build the httpx client registered with huggingface_hub.

    The returned client has certificate verification switched off.
    """
    # NOTE(review): verify=False disables TLS certificate checks for every
    # request made through this client -- confirm this is intentional.
    client = httpx.Client(verify=False)
    return client
17
+
18
+ set_client_factory(hf_client_factory)
19
+
20
+ warnings.filterwarnings("ignore")
21
+ load_dotenv()
22
+
23
+ meetings_mapping = {
24
+ "SA": [
25
+ "TSG_SA",
26
+ "WG1_Serv",
27
+ "WG2_Arch",
28
+ "WG3_Security",
29
+ "WG4_CODEC",
30
+ "WG5_TM",
31
+ "WG6_MissionCritical"
32
+ ],
33
+ "CT": [
34
+ "TSG_CT",
35
+ "WG1_mm-cc-sm_ex-CN1",
36
+ "WG2_capability_ex-T2",
37
+ "WG3_interworking_ex-CN3",
38
+ "WG4_protocollars_ex-CN4",
39
+ "WG5_osa_ex-CN5",
40
+ "WG6_Smartcard_Ex-T3"
41
+ ],
42
+ "RAN": [
43
+ "TSG_RAN",
44
+ "WG1_RL1",
45
+ "WG2_RL2",
46
+ "WG3_Iu",
47
+ "WG4_Radio",
48
+ "WG5_Test_ex-T1",
49
+ "WG6_legacyRAN"
50
+ ]
51
+ }
52
+
53
+ import threading
54
+
55
+ _tdoc_indexer = None
56
+ _spec_3gpp_indexer = None
57
+ _spec_etsi_indexer = None
58
+ _init_locks = {
59
+ "tdoc": threading.Lock(),
60
+ "3gpp": threading.Lock(),
61
+ "etsi": threading.Lock(),
62
+ }
63
+ _indexing_locks = {
64
+ "tdoc": threading.Lock(),
65
+ "3gpp": threading.Lock(),
66
+ "etsi": threading.Lock(),
67
+ }
68
+
69
def get_tdoc_indexer():
    """Return the shared TDocIndexer, building it on first use.

    Construction happens under ``_init_locks["tdoc"]`` and the module-level
    slot is re-checked once the lock is held, so the indexer is built at
    most once.
    """
    global _tdoc_indexer
    if _tdoc_indexer is not None:
        return _tdoc_indexer
    with _init_locks["tdoc"]:
        # Another caller may have filled the slot while we waited.
        if _tdoc_indexer is None:
            _tdoc_indexer = TDocIndexer()
    return _tdoc_indexer
76
+
77
def get_spec_3gpp_indexer():
    """Return the shared Spec3GPPIndexer, building it on first use.

    Construction happens under ``_init_locks["3gpp"]`` and the module-level
    slot is re-checked once the lock is held, so the indexer is built at
    most once.
    """
    global _spec_3gpp_indexer
    if _spec_3gpp_indexer is not None:
        return _spec_3gpp_indexer
    with _init_locks["3gpp"]:
        # Another caller may have filled the slot while we waited.
        if _spec_3gpp_indexer is None:
            _spec_3gpp_indexer = Spec3GPPIndexer()
    return _spec_3gpp_indexer
84
+
85
def get_spec_etsi_indexer():
    """Return the shared SpecETSIIndexer, building it on first use.

    Construction happens under ``_init_locks["etsi"]`` and the module-level
    slot is re-checked once the lock is held, so the indexer is built at
    most once.
    """
    global _spec_etsi_indexer
    if _spec_etsi_indexer is not None:
        return _spec_etsi_indexer
    with _init_locks["etsi"]:
        # Another caller may have filled the slot while we waited.
        if _spec_etsi_indexer is None:
            _spec_etsi_indexer = SpecETSIIndexer()
    return _spec_etsi_indexer
92
+
93
+ app = FastAPI()
94
+ app.add_middleware(CORSMiddleware, allow_credentials=True, allow_headers=["*"], allow_origins=["*"])
95
+ app.mount("/static", StaticFiles(directory="static"), name="static")
96
+
97
@app.get('/')
def main():
    """Serve ``index.html`` at the site root."""
    page = FileResponse("index.html")
    return page
100
def get_folder_name(working_group: str):
    """Translate a working-group code into a ``(category, index)`` pair.

    Codes ending in ``"P"`` whose first letter is ``S``, ``C`` or ``R`` map
    to index 0 of ``"SA"``, ``"CT"`` or ``"RAN"`` respectively. Any other
    code must start with upper-case letters followed by digits (for example
    ``"SA2"``), which are returned as ``("SA", 2)``.

    Args:
        working_group: The working-group code to decode.

    Returns:
        A ``(category, index)`` tuple; callers use it to index the
        module-level meetings mapping.

    Raises:
        ValueError: If the code matches neither form.
    """
    plenary_prefixes = (("S", "SA"), ("C", "CT"), ("R", "RAN"))
    if working_group.endswith("P"):
        for prefix, category in plenary_prefixes:
            if working_group.startswith(prefix):
                return (category, 0)
        # Unknown leading letter: fall through to the generic pattern below.

    match = re.match(r"([A-Z]+)(\d+)", working_group)
    if match is None:
        raise ValueError(f"Unexpected working group format: {working_group!r}")
    code, number = match.groups()
    return (code, int(number))
114
+
115
@app.get("/get_meetings/{working_group}")
def get_meetings(working_group: str):
    """List the meeting folders published for a working group.

    Fetches the working group's directory listing from the 3GPP FTP site and
    returns the link texts that start with ``"TSG"`` or ``"CT"``.

    Args:
        working_group: Working-group code, decoded by ``get_folder_name``.

    Returns:
        A dict with the listing ``url`` and the list of ``meetings`` names.

    Raises:
        HTTPException: 400 when the working group cannot be resolved,
            502 when the listing request fails.
    """
    try:
        category, wg_number = get_folder_name(working_group)
        folder = meetings_mapping[category][wg_number]
    except (ValueError, KeyError, IndexError) as exc:
        # A bad path parameter is a client error, not a server crash.
        raise HTTPException(
            status_code=400,
            detail=f"Unknown working group: {working_group}",
        ) from exc

    url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}/{folder}"
    try:
        # NOTE(review): verify=False disables TLS certificate checks.
        response = requests.get(url, verify=False, timeout=(10, 30))
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Could not reach {url}") from exc

    soup = BeautifulSoup(response.text, "html.parser")
    link_texts = (item.get_text() for item in soup.select("tr td a"))
    meetings = [text for text in link_texts if text.startswith(("TSG", "CT"))]
    return {"url": url, "meetings": meetings}
124
+
125
@app.post("/index_tdocs/working_group")
def index_tdocs_wg_progress(req: IndexTDoc):
    """Index every TDoc of one working group, streaming progress events.

    Args:
        req: Request body; ``req.wg`` names the working group.

    Returns:
        A ``text/event-stream`` response that relays the events produced by
        the indexer and ends with an ``end`` event.

    Raises:
        HTTPException: 400 when the working group is missing or cannot be
            resolved, 409 when another TDoc indexing run holds the lock.
    """
    if not req.wg:
        raise HTTPException(status_code=400, detail="Working Group not defined !")

    # Resolve the target before taking the lock: a failure here must not
    # leave the lock held, otherwise every later request gets a 409.
    try:
        category, wg_number = get_folder_name(req.wg)
        folder = meetings_mapping[category][wg_number]
    except (ValueError, KeyError, IndexError) as exc:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown working group: {req.wg}",
        ) from exc
    url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}"

    if not _indexing_locks["tdoc"].acquire(blocking=False):
        raise HTTPException(status_code=409, detail="TDoc indexing already in progress")
    try:
        indexer = get_tdoc_indexer()
    except Exception:
        # The generator below never runs in this case, so release here.
        _indexing_locks["tdoc"].release()
        raise

    def generate_events():
        # The lock is released when this generator finishes or is closed
        # after it has started running.
        try:
            yield f"event: info\ndata: {req.wg}\n\n"
            for content in indexer.process_workgroup(folder, url):
                yield content
            indexer.save_indexer()
            yield "event: end\ndata: Indexation ended successfully !\n\n"
        finally:
            _indexing_locks["tdoc"].release()

    return StreamingResponse(generate_events(), media_type="text/event-stream")
145
+
146
@app.post("/index_tdocs/meeting")
def index_tdocs_meeting_progress(req: IndexTDoc):
    """Index the TDocs of selected meetings, streaming progress events.

    Args:
        req: Request body; ``req.wg`` names the working group and
            ``req.meetings`` lists the meetings to process.

    Returns:
        A ``text/event-stream`` response emitting ``get-maximum``, then an
        ``info`` and a ``progress`` event per meeting, then ``end``.

    Raises:
        HTTPException: 400 when the working group or meetings are missing
            or the working group cannot be resolved, 409 when another TDoc
            indexing run holds the lock.
    """
    if not req.wg:
        raise HTTPException(status_code=400, detail="Working Group not defined !")
    if not req.meetings:
        raise HTTPException(status_code=400, detail="Meetings not defined !")

    # Resolve the target before taking the lock: a failure here must not
    # leave the lock held, otherwise every later request gets a 409.
    try:
        category, wg_number = get_folder_name(req.wg)
        folder = meetings_mapping[category][wg_number]
    except (ValueError, KeyError, IndexError) as exc:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown working group: {req.wg}",
        ) from exc
    url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}/{folder}"

    if not _indexing_locks["tdoc"].acquire(blocking=False):
        raise HTTPException(status_code=409, detail="TDoc indexing already in progress")
    try:
        indexer = get_tdoc_indexer()
    except Exception:
        # The generator below never runs in this case, so release here.
        _indexing_locks["tdoc"].release()
        raise

    def generate_events():
        # The lock is released when this generator finishes or is closed
        # after it has started running.
        try:
            yield f"event: get-maximum\ndata: {len(req.meetings)}\n\n"
            for i, meet in enumerate(req.meetings):
                yield f"event: info\ndata: {req.wg}-{meet}\n\n"
                indexer.process_meeting(meet, url)
                yield f"event: progress\ndata: {i+1}\n\n"
            indexer.save_indexer()
            yield "event: end\ndata: Indexation ended successfully !\n\n"
        finally:
            _indexing_locks["tdoc"].release()

    return StreamingResponse(generate_events(), media_type="text/event-stream")
171
+
172
+
173
@app.post("/index_tdocs/all")
def index_all_tdocs_progress():
    """Index all TDocs, streaming the indexer's progress events.

    Returns:
        A ``text/event-stream`` response that relays the events produced by
        ``index_all_tdocs`` and ends with an ``end`` event.

    Raises:
        HTTPException: 409 when another TDoc indexing run holds the lock.
    """
    if not _indexing_locks["tdoc"].acquire(blocking=False):
        raise HTTPException(status_code=409, detail="TDoc indexing already in progress")
    try:
        indexer = get_tdoc_indexer()
    except Exception:
        # The generator below never runs in this case, so release here;
        # otherwise the lock would stay held and block every later run.
        _indexing_locks["tdoc"].release()
        raise

    def generate_events():
        # The lock is released when this generator finishes or is closed
        # after it has started running.
        try:
            for content in indexer.index_all_tdocs():
                yield content
            indexer.save_indexer()
            yield "event: end\ndata: Indexation ended successfully !\n\n"
        finally:
            _indexing_locks["tdoc"].release()

    return StreamingResponse(generate_events(), media_type="text/event-stream")
187
+
188
+
189
@app.post("/index_specs/3gpp")
def index_3gpp_specs_progress():
    """Index the 3GPP specifications, streaming progress events.

    Relays the events from the indexer's ``run()``, then saves the index and
    builds the BM25 index, announcing each step with ``info``,
    ``get-maximum`` and ``progress`` events before the final ``end`` event.

    Returns:
        A ``text/event-stream`` response.

    Raises:
        HTTPException: 409 when another 3GPP indexing run holds the lock.
    """
    if not _indexing_locks["3gpp"].acquire(blocking=False):
        raise HTTPException(status_code=409, detail="3GPP spec indexing already in progress")
    try:
        indexer = get_spec_3gpp_indexer()
    except Exception:
        # The generator below never runs in this case, so release here;
        # otherwise the lock would stay held and block every later run.
        _indexing_locks["3gpp"].release()
        raise

    def generate_events():
        # The lock is released when this generator finishes or is closed
        # after it has started running.
        try:
            for content in indexer.run():
                yield content
            yield "event: info\ndata: Saving index ...\n\n"
            yield "event: get-maximum\ndata: 1\n\n"
            yield "event: progress\ndata: 1\n\n"
            indexer.save()
            yield "event: info\ndata: Creating BM25 models ...\n\n"
            yield "event: get-maximum\ndata: 1\n\n"
            yield "event: progress\ndata: 1\n\n"
            indexer.create_bm25_index()
            yield "event: end\ndata: Indexation ended successfully !\n\n"
        finally:
            _indexing_locks["3gpp"].release()

    return StreamingResponse(generate_events(), media_type="text/event-stream")
210
+
211
+ @app.post("/index_specs/etsi")
212
+ def index_etsi_specs_progress():
213
+ if not _indexing_locks["etsi"].acquire(blocking=False):
214
+ raise HTTPException(status_code=409, detail="ETSI spec indexing already in progress")
215
+ indexer = get_spec_etsi_indexer()
216
+ def generate_events():
217
+ try:
218
+ for content in indexer.run():
219
+ yield content
220
+ yield "event: info\ndata: Saving index ...\n\n"
221
+ yield "event: get-maximum\ndata: 1\n\n"
222
+ yield "event: progress\ndata: 1\n\n"
223
+ indexer.save()
224
+ yield "event: info\ndata: Creating BM25 models ...\n\n"
225
+ yield "event: get-maximum\ndata: 1\n\n"
226
+ yield "event: progress\ndata: 1\n\n"
227
+ indexer.create_bm25_index()
228
+ yield "event: end\ndata: Indexation ended successfully !\n\n"
229
+ finally:
230
+ _indexing_locks["etsi"].release()
231
  return StreamingResponse(generate_events(), media_type="text/event-stream")
classes.py CHANGED
@@ -119,13 +119,12 @@ class TDocIndexer:
119
  meeting_contents = self.get_docs_from_url(meeting_url)
120
 
121
  key = None
122
- if "docs" in [x.lower() for x in meeting_contents]:
123
- key = "docs"
124
- elif "tdocs" in [x.lower() for x in meeting_contents]:
125
- key = "tdocs"
126
- elif "tdoc" in [x.lower() for x in meeting_contents]:
127
- key = "tdoc"
128
-
129
  if key is not None:
130
  docs_url = f"{meeting_url}/{key}"
131
 
@@ -143,8 +142,13 @@ class TDocIndexer:
143
  print(f"{docs_indexed_count} fichiers trouvés")
144
 
145
  # 2. Vérifier le sous-dossier ZIP s'il existe
146
- if "zip" in [x.lower() for x in docs_files]:
147
- zip_url = f"{docs_url}/zip"
 
 
 
 
 
148
 
149
  with self.print_lock:
150
  print(f"Vérification du dossier ./zip: {zip_url}")
@@ -333,7 +337,7 @@ class Spec3GPPIndexer:
333
  url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
334
 
335
  try:
336
- response = requests.get(url, verify=False)
337
  if response.status_code != 200:
338
  return []
339
 
@@ -407,7 +411,8 @@ class Spec3GPPIndexer:
407
  response = requests.get(
408
  'https://www.3gpp.org/dynareport?code=status-report.htm',
409
  headers={"User-Agent": 'Mozilla/5.0'},
410
- verify=False
 
411
  )
412
  dfs = pd.read_html(io.StringIO(response.text))
413
  for x in range(len(dfs)):
@@ -427,8 +432,14 @@ class Spec3GPPIndexer:
427
  if self.STOP_EVENT.is_set():
428
  return
429
  try:
430
- doc_id = str(spec['spec_num'])
431
- version_code = self.version_to_code(str(spec['vers']))
 
 
 
 
 
 
432
  if not version_code:
433
  with self.DICT_LOCK:
434
  self.processed_count += 1
@@ -436,21 +447,27 @@ class Spec3GPPIndexer:
436
 
437
  document = None
438
  already_indexed = False
 
 
439
  with self.DOCUMENT_LOCK:
440
- doc_in_cache = doc_id in self.documents_by_spec_num and \
441
- self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version_code)
442
-
443
- if doc_in_cache and doc_id not in self.specifications_passed:
444
- document = self.documents_by_spec_num[doc_id]
445
- self.specifications_passed.add(doc_id)
446
- already_indexed = True
447
- elif doc_id not in self.specifications_passed:
 
 
 
 
 
448
  doc_content = self.get_spec_content(doc_id, version_code)
449
  if doc_content:
450
  document = {"content": doc_content, "hash": self.hasher(doc_id, version_code)}
451
  with self.DOCUMENT_LOCK:
452
  self.documents_by_spec_num[doc_id] = document
453
- self.specifications_passed.add(doc_id)
454
  already_indexed = False
455
 
456
  if document:
@@ -482,9 +499,10 @@ class Spec3GPPIndexer:
482
 
483
  def get_document(self, spec_id: str, spec_title: str):
484
  text = [f"{spec_id} - {spec_title}\n"]
485
- for section in self.spec_contents:
486
- if spec_id == section["doc_id"]:
487
- text.extend([f"{section['section']}\n\n{section['content']}"])
 
488
  return text
489
 
490
  def create_bm25_index(self):
@@ -494,19 +512,21 @@ class Spec3GPPIndexer:
494
 
495
  for specification in dataset_metadata:
496
  if specification['id'] in unique_specs: continue
497
- for section in self.spec_contents:
498
- if specification['id'] == section['doc_id']:
499
- corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
 
 
500
  "id": specification['id'],
501
  "title": specification['title'],
502
- "section_title": section['section'],
503
  "version": specification['version'],
504
  "type": specification['type'],
505
  "working_group": specification['working_group'],
506
  "url": specification['url'],
507
  "scope": specification['scope']
508
  }})
509
-
510
  corpus_text = [doc["text"] for doc in corpus_json]
511
  corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
512
 
@@ -612,13 +632,14 @@ class SpecETSIIndexer:
612
  verify=False,
613
  headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."},
614
  data=json.dumps({"username": os.environ.get("EOL_USER"), "password": os.environ.get("EOL_PASSWORD")}),
 
615
  )
616
 
617
  print("Récupération des métadonnées TS/TR …")
618
  url_ts = "https://www.etsi.org/?option=com_standardssearch&view=data&format=csv&includeScope=1&page=1&search=&title=1&etsiNumber=1&content=0&version=0&onApproval=0&published=1&withdrawn=0&historical=0&isCurrent=1&superseded=0&harmonized=0&keyword=&TB=&stdType=TS&frequency=&mandate=&collection=&sort=1"
619
  url_tr = url_ts.replace("stdType=TS", "stdType=TR")
620
- data_ts = self.session.get(url_ts, verify=False).content
621
- data_tr = self.session.get(url_tr, verify=False).content
622
  df_ts = pd.read_csv(io.StringIO(data_ts.decode('utf-8')), sep=";", skiprows=1, index_col=False)
623
  df_tr = pd.read_csv(io.StringIO(data_tr.decode('utf-8')), sep=";", skiprows=1, index_col=False)
624
 
@@ -640,12 +661,11 @@ class SpecETSIIndexer:
640
  df_ts["Type"] = "TS"
641
  df_tr["Type"] = "TR"
642
  df = pd.concat([df_ts, df_tr])
 
643
  unique_df = df.loc[df.groupby("ETSI deliverable")["temp"].idxmax()]
644
  unique_df = unique_df.drop(columns="temp")
645
  unique_df = unique_df[(~unique_df["title"].str.contains("3GPP", case=True, na=False))]
646
- df = df.drop(columns="temp")
647
- df = df[(~df["title"].str.contains("3GPP", case=True, na=False))]
648
- return df
649
 
650
  @staticmethod
651
  def hasher(specification: str, version: str):
@@ -660,11 +680,12 @@ class SpecETSIIndexer:
660
 
661
  def get_document(self, spec_id: str, spec_title: str):
662
  text = [f"{spec_id} - {spec_title}\n"]
663
- for section in self.spec_contents:
664
- if spec_id == section["doc_id"]:
665
- text.extend([f"{section['section']}\n\n{section['content']}"])
 
666
  return text
667
-
668
  def get_text(self, specification: str):
669
  if self.STOP_EVENT.is_set():
670
  return None, []
@@ -679,7 +700,8 @@ class SpecETSIIndexer:
679
  pdf_link = row.iloc[0]["PDF link"]
680
  response = self.session.get(
681
  pdf_link,
682
- headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...'}
 
683
  )
684
  if response.status_code != 200:
685
  print(f"[ERREUR] Echec du téléchargement du PDF pour {specification}.")
@@ -733,40 +755,58 @@ class SpecETSIIndexer:
733
  def process_specification(self, spec):
734
  if self.STOP_EVENT.is_set():
735
  return
 
736
  try:
737
  version = spec.get('Version')
738
- if not version: return
739
- doc_id = str(spec.get("ETSI deliverable"))
 
 
 
 
 
 
 
 
740
  document = None
741
  already_indexed = False
 
742
 
743
  with self.DOCUMENT_LOCK:
744
- if (doc_id in self.documents_by_spec_num
745
- and self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version)
746
- and doc_id not in self.specifications_passed):
747
- document = self.documents_by_spec_num[doc_id]
748
- self.specifications_passed.add(doc_id)
749
  already_indexed = True
750
- elif doc_id in self.specifications_passed:
 
751
  document = self.documents_by_spec_num[doc_id]
 
752
  already_indexed = True
753
  else:
754
- document_content = self.get_spec_content(doc_id)
755
- if document_content:
756
- self.documents_by_spec_num[doc_id] = {"content": document_content, "hash": self.hasher(doc_id, version)}
757
- document = {"content": document_content, "hash": self.hasher(doc_id, version)}
758
- self.specifications_passed.add(doc_id)
759
- already_indexed = False
 
 
 
 
760
 
761
  if document:
762
- string_key = f"{doc_id}+-+{spec['title']}+-+{spec['Type']}+-+{spec['Version']}"
 
 
 
 
 
763
  metadata = {
764
  "id": str(doc_id),
765
- "title": spec["title"],
766
- "type": spec["Type"],
767
  "version": version,
768
- "url": spec["PDF link"],
769
- "scope": "" if not document else self.get_scope(document["content"])
770
  }
771
  with self.DICT_LOCK:
772
  self.indexed_specifications[string_key] = metadata
@@ -827,18 +867,20 @@ class SpecETSIIndexer:
827
 
828
  for specification in dataset_metadata:
829
  if specification['id'] in unique_specs: continue
830
- for section in self.spec_contents:
831
- if specification['id'] == section['doc_id']:
832
- corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
 
 
833
  "id": specification['id'],
834
  "title": specification['title'],
835
- "section_title": section['section'],
836
  "version": specification['version'],
837
  "type": specification['type'],
838
  "url": specification['url'],
839
  "scope": specification['scope']
840
  }})
841
-
842
  corpus_text = [doc["text"] for doc in corpus_json]
843
  corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
844
 
 
119
  meeting_contents = self.get_docs_from_url(meeting_url)
120
 
121
  key = None
122
+ for item in meeting_contents:
123
+ normalized = item.lower().rstrip('/')
124
+ if normalized in ("docs", "tdocs", "tdoc"):
125
+ key = item.rstrip('/')
126
+ break
127
+
 
128
  if key is not None:
129
  docs_url = f"{meeting_url}/{key}"
130
 
 
142
  print(f"{docs_indexed_count} fichiers trouvés")
143
 
144
  # 2. Vérifier le sous-dossier ZIP s'il existe
145
+ zip_folder = None
146
+ for item in docs_files:
147
+ if item.lower().rstrip('/') == "zip":
148
+ zip_folder = item.rstrip('/')
149
+ break
150
+ if zip_folder:
151
+ zip_url = f"{docs_url}/{zip_folder}"
152
 
153
  with self.print_lock:
154
  print(f"Vérification du dossier ./zip: {zip_url}")
 
337
  url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
338
 
339
  try:
340
+ response = requests.get(url, verify=False, timeout=(10, 120))
341
  if response.status_code != 200:
342
  return []
343
 
 
411
  response = requests.get(
412
  'https://www.3gpp.org/dynareport?code=status-report.htm',
413
  headers={"User-Agent": 'Mozilla/5.0'},
414
+ verify=False,
415
+ timeout=(10, 60)
416
  )
417
  dfs = pd.read_html(io.StringIO(response.text))
418
  for x in range(len(dfs)):
 
432
  if self.STOP_EVENT.is_set():
433
  return
434
  try:
435
+ spec_num = spec.get('spec_num')
436
+ vers = spec.get('vers')
437
+ if spec_num is None or vers is None:
438
+ with self.DICT_LOCK:
439
+ self.processed_count += 1
440
+ return
441
+ doc_id = str(spec_num)
442
+ version_code = self.version_to_code(str(vers))
443
  if not version_code:
444
  with self.DICT_LOCK:
445
  self.processed_count += 1
 
447
 
448
  document = None
449
  already_indexed = False
450
+ needs_fetch = False
451
+
452
  with self.DOCUMENT_LOCK:
453
+ if doc_id in self.specifications_passed:
454
+ document = self.documents_by_spec_num.get(doc_id)
455
+ already_indexed = True
456
+ elif (doc_id in self.documents_by_spec_num
457
+ and self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version_code)):
458
+ document = self.documents_by_spec_num[doc_id]
459
+ self.specifications_passed.add(doc_id)
460
+ already_indexed = True
461
+ else:
462
+ self.specifications_passed.add(doc_id)
463
+ needs_fetch = True
464
+
465
+ if needs_fetch:
466
  doc_content = self.get_spec_content(doc_id, version_code)
467
  if doc_content:
468
  document = {"content": doc_content, "hash": self.hasher(doc_id, version_code)}
469
  with self.DOCUMENT_LOCK:
470
  self.documents_by_spec_num[doc_id] = document
 
471
  already_indexed = False
472
 
473
  if document:
 
499
 
500
  def get_document(self, spec_id: str, spec_title: str):
501
  text = [f"{spec_id} - {spec_title}\n"]
502
+ doc_data = self.documents_by_spec_num.get(spec_id)
503
+ if doc_data:
504
+ for section_title, content in doc_data["content"].items():
505
+ text.append(f"{section_title}\n\n{content}")
506
  return text
507
 
508
  def create_bm25_index(self):
 
512
 
513
  for specification in dataset_metadata:
514
  if specification['id'] in unique_specs: continue
515
+ unique_specs.add(specification['id'])
516
+ doc_data = self.documents_by_spec_num.get(specification['id'])
517
+ if doc_data:
518
+ for section_title, content in doc_data["content"].items():
519
+ corpus_json.append({"text": f"{section_title}\n{content}", "metadata": {
520
  "id": specification['id'],
521
  "title": specification['title'],
522
+ "section_title": section_title,
523
  "version": specification['version'],
524
  "type": specification['type'],
525
  "working_group": specification['working_group'],
526
  "url": specification['url'],
527
  "scope": specification['scope']
528
  }})
529
+
530
  corpus_text = [doc["text"] for doc in corpus_json]
531
  corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
532
 
 
632
  verify=False,
633
  headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."},
634
  data=json.dumps({"username": os.environ.get("EOL_USER"), "password": os.environ.get("EOL_PASSWORD")}),
635
+ timeout=(10, 30),
636
  )
637
 
638
  print("Récupération des métadonnées TS/TR …")
639
  url_ts = "https://www.etsi.org/?option=com_standardssearch&view=data&format=csv&includeScope=1&page=1&search=&title=1&etsiNumber=1&content=0&version=0&onApproval=0&published=1&withdrawn=0&historical=0&isCurrent=1&superseded=0&harmonized=0&keyword=&TB=&stdType=TS&frequency=&mandate=&collection=&sort=1"
640
  url_tr = url_ts.replace("stdType=TS", "stdType=TR")
641
+ data_ts = self.session.get(url_ts, verify=False, timeout=(10, 120)).content
642
+ data_tr = self.session.get(url_tr, verify=False, timeout=(10, 120)).content
643
  df_ts = pd.read_csv(io.StringIO(data_ts.decode('utf-8')), sep=";", skiprows=1, index_col=False)
644
  df_tr = pd.read_csv(io.StringIO(data_tr.decode('utf-8')), sep=";", skiprows=1, index_col=False)
645
 
 
661
  df_ts["Type"] = "TS"
662
  df_tr["Type"] = "TR"
663
  df = pd.concat([df_ts, df_tr])
664
+ df = df.dropna(subset=["ETSI deliverable", "Version"])
665
  unique_df = df.loc[df.groupby("ETSI deliverable")["temp"].idxmax()]
666
  unique_df = unique_df.drop(columns="temp")
667
  unique_df = unique_df[(~unique_df["title"].str.contains("3GPP", case=True, na=False))]
668
+ return unique_df
 
 
669
 
670
  @staticmethod
671
  def hasher(specification: str, version: str):
 
680
 
681
  def get_document(self, spec_id: str, spec_title: str):
682
  text = [f"{spec_id} - {spec_title}\n"]
683
+ doc_data = self.documents_by_spec_num.get(spec_id)
684
+ if doc_data:
685
+ for section_title, content in doc_data["content"].items():
686
+ text.append(f"{section_title}\n\n{content}")
687
  return text
688
+
689
  def get_text(self, specification: str):
690
  if self.STOP_EVENT.is_set():
691
  return None, []
 
700
  pdf_link = row.iloc[0]["PDF link"]
701
  response = self.session.get(
702
  pdf_link,
703
+ headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...'},
704
+ timeout=(10, 120)
705
  )
706
  if response.status_code != 200:
707
  print(f"[ERREUR] Echec du téléchargement du PDF pour {specification}.")
 
755
  def process_specification(self, spec):
756
  if self.STOP_EVENT.is_set():
757
  return
758
+ doc_id = "unknown"
759
  try:
760
  version = spec.get('Version')
761
+ if not version or (isinstance(version, float) and pd.isna(version)):
762
+ with self.DICT_LOCK:
763
+ self.processed_count += 1
764
+ return
765
+ doc_id = spec.get("ETSI deliverable")
766
+ if not doc_id or (isinstance(doc_id, float) and pd.isna(doc_id)):
767
+ with self.DICT_LOCK:
768
+ self.processed_count += 1
769
+ return
770
+ doc_id = str(doc_id)
771
  document = None
772
  already_indexed = False
773
+ needs_fetch = False
774
 
775
  with self.DOCUMENT_LOCK:
776
+ if doc_id in self.specifications_passed:
777
+ document = self.documents_by_spec_num.get(doc_id)
 
 
 
778
  already_indexed = True
779
+ elif (doc_id in self.documents_by_spec_num
780
+ and self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version)):
781
  document = self.documents_by_spec_num[doc_id]
782
+ self.specifications_passed.add(doc_id)
783
  already_indexed = True
784
  else:
785
+ self.specifications_passed.add(doc_id)
786
+ needs_fetch = True
787
+
788
+ if needs_fetch:
789
+ document_content = self.get_spec_content(doc_id)
790
+ if document_content:
791
+ document = {"content": document_content, "hash": self.hasher(doc_id, version)}
792
+ with self.DOCUMENT_LOCK:
793
+ self.documents_by_spec_num[doc_id] = document
794
+ already_indexed = False
795
 
796
  if document:
797
+ title = spec.get("title", "")
798
+ if isinstance(title, float) and pd.isna(title):
799
+ title = ""
800
+ spec_type = spec.get("Type", "")
801
+ pdf_link = spec.get("PDF link", "")
802
+ string_key = f"{doc_id}+-+{title}+-+{spec_type}+-+{version}"
803
  metadata = {
804
  "id": str(doc_id),
805
+ "title": title,
806
+ "type": spec_type,
807
  "version": version,
808
+ "url": pdf_link,
809
+ "scope": self.get_scope(document["content"])
810
  }
811
  with self.DICT_LOCK:
812
  self.indexed_specifications[string_key] = metadata
 
867
 
868
  for specification in dataset_metadata:
869
  if specification['id'] in unique_specs: continue
870
+ unique_specs.add(specification['id'])
871
+ doc_data = self.documents_by_spec_num.get(specification['id'])
872
+ if doc_data:
873
+ for section_title, content in doc_data["content"].items():
874
+ corpus_json.append({"text": f"{section_title}\n{content}", "metadata": {
875
  "id": specification['id'],
876
  "title": specification['title'],
877
+ "section_title": section_title,
878
  "version": specification['version'],
879
  "type": specification['type'],
880
  "url": specification['url'],
881
  "scope": specification['scope']
882
  }})
883
+
884
  corpus_text = [doc["text"] for doc in corpus_json]
885
  corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
886