Spaces:

OrganizedProgrammers
/

DocIndexer

Sleeping

App Files Community

om4r932 committed on May 30, 2025

Commit

cd278c3

1 Parent(s): abdabaa

Update chapter extraction method

Browse files

Files changed (1) hide show

spec_doc_indexer_multi.py +12 -27

spec_doc_indexer_multi.py CHANGED Viewed

@@ -7,6 +7,7 @@ import requests
 import zipfile
 import uuid
 import os
 import re
 import subprocess
 import concurrent.futures
@@ -48,13 +49,13 @@ def get_text(specification: str, version: str):
     if response.status_code != 200:
         raise Exception(f"Téléchargement du ZIP échoué pour {specification}-{version}")
-    zip_bytes = BytesIO(response.content)
     with zipfile.ZipFile(zip_bytes) as zf:
         for file_name in zf.namelist():
             if file_name.endswith("zip"):
                 print("Another ZIP !")
-                zip_bytes = BytesIO(zf.read(file_name))
                 zf = zipfile.ZipFile(zip_bytes)
                 for file_name2 in zf.namelist():
                     if file_name2.endswith("doc") or file_name2.endswith("docx"):
@@ -127,37 +128,20 @@ def get_spec_content(specification: str, version: str):
         if len(forewords) >= 2:
             break
-    toc_brut = text[forewords[0]:forewords[1]]
     chapters = []
     for line in toc_brut:
         x = line.split("\t")
-        if re.search(r"^\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\.\d+.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
-        if re.search(r"^\d+\.\d+\.\d+.\d+.\d+\t[\ \S]+", line):
-            chapters.append(x[0] if len(x) == 1 else "\t".join(x[:2]))
     real_toc_indexes = {}
     for chapter in chapters:
-        try:
-            x = text.index(chapter)
-            real_toc_indexes[chapter] = x
-        except ValueError as e:
-            try:
-                number = chapter.split("\t")[0] + "\t"
-                for line in text[forewords[1]:]:
-                    if number in line:
-                        x = text.index(line)
-                        real_toc_indexes[line] = x
-                        break
-            except:
-                real_toc_indexes[chapter] = -float("inf")
     document = {}
     toc = list(real_toc_indexes.keys())
@@ -167,7 +151,8 @@ def get_spec_content(specification: str, version: str):
         document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
         curr_index = x
-    document[toc[curr_index].replace("\t"," ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
     return document
 def process_specification(spec: Dict[str, Any], columns: List[str]) -> None:

 import zipfile
 import uuid
 import os
+import io
 import re
 import subprocess
 import concurrent.futures
     if response.status_code != 200:
         raise Exception(f"Téléchargement du ZIP échoué pour {specification}-{version}")
+    zip_bytes = io.BytesIO(response.content)
     with zipfile.ZipFile(zip_bytes) as zf:
         for file_name in zf.namelist():
             if file_name.endswith("zip"):
                 print("Another ZIP !")
+                zip_bytes = io.BytesIO(zf.read(file_name))
                 zf = zipfile.ZipFile(zip_bytes)
                 for file_name2 in zf.namelist():
                     if file_name2.endswith("doc") or file_name2.endswith("docx"):
         if len(forewords) >= 2:
             break
+    toc_brut = text[forewords[1]:]
     chapters = []
     for line in toc_brut:
         x = line.split("\t")
+        m = re.search(r"^(\d+(?:\.\d+)*)\t[\ \S]+$", line)
+        if m and any(line in c for c in text[forewords[0]:forewords[1]]):
+            chapters.append(line)
+            print(line)
     real_toc_indexes = {}
     for chapter in chapters:
+        x = text.index(chapter)
+        real_toc_indexes[chapter] = x
     document = {}
     toc = list(real_toc_indexes.keys())
         document[toc[curr_index].replace("\t", " ")] = re.sub(r"[\ \t]+", " ", "\n".join(text[index_toc[curr_index]+1:index_toc[x]]))
         curr_index = x
+    document[toc[curr_index].replace("\t", " ")] = re.sub(r"\s+", " ", " ".join(text[index_toc[curr_index]+1:]))
+    print(len(toc)-1, toc[curr_index], curr_index)
     return document
 def process_specification(spec: Dict[str, Any], columns: List[str]) -> None: