Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Community

kebson committed on Dec 24, 2025

Commit

e175021

verified ·

1 Parent(s): ae7976c

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -52

app.py CHANGED Viewed

@@ -2,18 +2,18 @@ import gradio as gr
 import numpy as np
 import unicodedata
 from paddleocr import PaddleOCR
-from sklearn.cluster import KMeans
 # -------------------------------------------------
-# OCR
 # -------------------------------------------------
 ocr = PaddleOCR(
     lang="fr",
-    use_textline_orientation=True
 )
 # -------------------------------------------------
-# Normalisation texte (casse + accents)
 # -------------------------------------------------
 def normalize(text: str) -> str:
     text = text.lower()
@@ -22,7 +22,7 @@ def normalize(text: str) -> str:
     return " ".join(text.split())
 # -------------------------------------------------
-# Titres valides de la colonne 2
 # -------------------------------------------------
 COL_TITLES = {
     "designation",
@@ -32,7 +32,7 @@ COL_TITLES = {
 }
 # -------------------------------------------------
-# Mots / lignes à ignorer
 # -------------------------------------------------
 IGNORE_KEYWORDS = {
     "prix", "total", "ht", "htva", "tva",
@@ -40,72 +40,58 @@ IGNORE_KEYWORDS = {
 }
 # -------------------------------------------------
-# Fonction principale
 # -------------------------------------------------
 def extract_second_column(image):
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
-    result = ocr.predict(img)
-    if not result:
         return "OCR : aucun texte détecté."
-    data = result[0]
-    texts = data.get("rec_texts", [])
-    boxes = data.get("dt_polys", [])
     blocks = []
-    for text, box in zip(texts, boxes):
-        t = text.strip()
-        if len(t) < 2:
             continue
         x = np.mean([p[0] for p in box])
         y = np.mean([p[1] for p in box])
-        blocks.append((t, x, y))
-    if len(blocks) < 5:
-        return "Pas assez de texte exploitable."
     # -------------------------------------------------
-    # 1. Détection du X de la colonne cible via son titre
     # -------------------------------------------------
-    col_x = None
     for text, x, y in blocks:
         if normalize(text) in COL_TITLES:
-            col_x = x
             break
     if col_x is None:
-        return "Titre de la colonne cible non détecté."
     # -------------------------------------------------
-    # 2. Sélection des blocs proches du X détecté
     # -------------------------------------------------
-    X_THRESHOLD = 45
     column_blocks = [
         (t, x, y) for t, x, y in blocks
-        if abs(x - col_x) < X_THRESHOLD
     ]
-    if not column_blocks:
-        return "Colonne détectée mais vide."
-    # -------------------------------------------------
-    # 3. Tri vertical (haut → bas)
-    # -------------------------------------------------
     column_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
-    # 4. Fusion intelligente des lignes OCR
     # -------------------------------------------------
     merged = []
     current = ""
     last_y = None
-    Y_THRESHOLD = 22
     for text, x, y in column_blocks:
         nt = normalize(text)
@@ -113,7 +99,13 @@ def extract_second_column(image):
         if any(k in nt for k in IGNORE_KEYWORDS):
             continue
-        if last_y is None or abs(y - last_y) > Y_THRESHOLD:
             if current:
                 merged.append(current.strip())
             current = text
@@ -126,23 +118,19 @@ def extract_second_column(image):
         merged.append(current.strip())
     # -------------------------------------------------
-    # 5. Nettoyage final (cellules texte uniquement)
     # -------------------------------------------------
     final = []
     for line in merged:
-        nt = normalize(line)
-        if len(nt) < 4:
             continue
-        if sum(c.isdigit() for c in line) > len(line) / 2:
             continue
         final.append(line)
     if not final:
-        return "Aucune cellule texte valide trouvée."
-    # -------------------------------------------------
-    # 6. Résultat numéroté
-    # -------------------------------------------------
     return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
 # -------------------------------------------------
@@ -151,12 +139,12 @@ def extract_second_column(image):
 demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
-    outputs=gr.Textbox(label="Contenu de la colonne 2"),
-    title="Extraction fiable de la colonne 2 (Désignation / Description)",
-    description=(
-        "Extraction robuste de la deuxième colonne des tableaux scannés "
-        "(Désignation, DESIGNATIONS, Description, Description des services)."
-    )
 )
-demo.launch(server_name="0.0.0.0", server_port=7860)

 import numpy as np
 import unicodedata
 from paddleocr import PaddleOCR
 # -------------------------------------------------
+# OCR (CONFIG STABLE POUR HUGGING FACE)
 # -------------------------------------------------
 ocr = PaddleOCR(
     lang="fr",
+    use_angle_cls=False,            # ⛔ désactivation orientation
+    show_log=False                  # silence logs
 )
 # -------------------------------------------------
+# Normalisation texte
 # -------------------------------------------------
 def normalize(text: str) -> str:
     text = text.lower()
     return " ".join(text.split())
 # -------------------------------------------------
+# Titres colonne 2
 # -------------------------------------------------
 COL_TITLES = {
     "designation",
 }
 # -------------------------------------------------
+# Mots à ignorer
 # -------------------------------------------------
 IGNORE_KEYWORDS = {
     "prix", "total", "ht", "htva", "tva",
 }
 # -------------------------------------------------
+# Extraction colonne 2
 # -------------------------------------------------
 def extract_second_column(image):
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
+    result = ocr.ocr(img, cls=False)
+    if not result or not result[0]:
         return "OCR : aucun texte détecté."
     blocks = []
+    for line in result[0]:
+        text = line[1][0].strip()
+        box = line[0]
+        if len(text) < 2:
             continue
         x = np.mean([p[0] for p in box])
         y = np.mean([p[1] for p in box])
+        blocks.append((text, x, y))
     # -------------------------------------------------
+    # 1. Trouver le titre
     # -------------------------------------------------
+    col_x, title_y = None, None
     for text, x, y in blocks:
         if normalize(text) in COL_TITLES:
+            col_x, title_y = x, y
             break
     if col_x is None:
+        return "Titre de la colonne non détecté."
     # -------------------------------------------------
+    # 2. Filtrage par X + sous le titre
     # -------------------------------------------------
     column_blocks = [
         (t, x, y) for t, x, y in blocks
+        if abs(x - col_x) < 50 and y > title_y + 15
     ]
     column_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
+    # 3. Fusion contrôlée
     # -------------------------------------------------
     merged = []
     current = ""
     last_y = None
     for text, x, y in column_blocks:
         nt = normalize(text)
         if any(k in nt for k in IGNORE_KEYWORDS):
             continue
+        new_cell = (
+            last_y is None
+            or abs(y - last_y) > 35
+            or text[0].isupper()
+        )
+        if new_cell:
             if current:
                 merged.append(current.strip())
             current = text
         merged.append(current.strip())
     # -------------------------------------------------
+    # 4. Nettoyage final
     # -------------------------------------------------
     final = []
     for line in merged:
+        if not line[0].isupper():
             continue
+        if sum(c.isdigit() for c in line) > len(line) * 0.4:
             continue
         final.append(line)
     if not final:
+        return "Aucune cellule valide trouvée."
     return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
 # -------------------------------------------------
 demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
+    outputs=gr.Textbox(label="Contenu colonne 2"),
+    title="Extraction colonne Désignation / Description"
 )
+demo.launch(
+    server_name="0.0.0.0",
+    server_port=7860,
+    ssr_mode=False
+)