Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Community

kebson committed on Dec 24, 2025

Commit

ae7976c

verified ·

1 Parent(s): 76bdf65

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -33

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 import numpy as np
 import unicodedata
 from paddleocr import PaddleOCR
 # -------------------------------------------------
 # OCR
@@ -12,7 +13,7 @@ ocr = PaddleOCR(
 )
 # -------------------------------------------------
-# Normalisation texte
 # -------------------------------------------------
 def normalize(text: str) -> str:
     text = text.lower()
@@ -21,19 +22,25 @@ def normalize(text: str) -> str:
     return " ".join(text.split())
 # -------------------------------------------------
-# Paramètres spécifiques image "Description des services"
 # -------------------------------------------------
-COLUMN_TITLE = "description des services"
-IGNORE_KEYWORDS = {
-    "prix", "total", "ht", "htva", "tva", "ttc",
-    "general", "generale"
 }
-X_THRESHOLD = 45
-Y_NEW_CELL = 32   # seuil volontairement élevé → empêche fusion abusive
 # -------------------------------------------------
-# Extraction colonne 2
 # -------------------------------------------------
 def extract_second_column(image):
     if image is None:
@@ -46,8 +53,8 @@ def extract_second_column(image):
         return "OCR : aucun texte détecté."
     data = result[0]
-    texts = data["rec_texts"]
-    boxes = data["dt_polys"]
     blocks = []
     for text, box in zip(texts, boxes):
@@ -60,43 +67,45 @@ def extract_second_column(image):
         blocks.append((t, x, y))
     # -------------------------------------------------
-    # 1. Trouver le titre exact de la colonne
     # -------------------------------------------------
-    title_y = None
     col_x = None
     for text, x, y in blocks:
-        if normalize(text) == COLUMN_TITLE:
             col_x = x
-            title_y = y
             break
     if col_x is None:
-        return "Titre 'Description des services' non détecté."
     # -------------------------------------------------
-    # 2. Garder uniquement le texte SOUS le titre
     # -------------------------------------------------
     column_blocks = [
         (t, x, y) for t, x, y in blocks
-        if abs(x - col_x) < X_THRESHOLD and y > title_y + 15
     ]
     if not column_blocks:
-        return "Aucune cellule détectée sous la colonne."
     # -------------------------------------------------
-    # 3. Tri vertical
     # -------------------------------------------------
     column_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
-    # 4. Fusion contrôlée (évite fusion Tuyau / Coude)
     # -------------------------------------------------
     merged = []
     current = ""
     last_y = None
     for text, x, y in column_blocks:
         nt = normalize(text)
@@ -104,8 +113,7 @@ def extract_second_column(image):
         if any(k in nt for k in IGNORE_KEYWORDS):
             continue
-        # nouvelle cellule
-        if last_y is None or abs(y - last_y) > Y_NEW_CELL:
             if current:
                 merged.append(current.strip())
             current = text
@@ -118,24 +126,25 @@ def extract_second_column(image):
         merged.append(current.strip())
     # -------------------------------------------------
-    # 5. Nettoyage final
     # -------------------------------------------------
     final = []
     for line in merged:
-        if not line[0].isupper():
             continue
-        if sum(c.isdigit() for c in line) > len(line) * 0.3:
             continue
         final.append(line)
     if not final:
-        return "Aucune cellule valide trouvée."
     return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
 # -------------------------------------------------
 # Interface Gradio
 # -------------------------------------------------
@@ -143,8 +152,11 @@ demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
     outputs=gr.Textbox(label="Contenu de la colonne 2"),
-    title="Extraction fiable – Colonne 2 (Description des services)",
-    description="Extraction robuste et ordonnée des cellules texte."
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import numpy as np
 import unicodedata
 from paddleocr import PaddleOCR
+from sklearn.cluster import KMeans
 # -------------------------------------------------
 # OCR
 )
 # -------------------------------------------------
+# Normalisation texte (casse + accents)
 # -------------------------------------------------
 def normalize(text: str) -> str:
     text = text.lower()
     return " ".join(text.split())
 # -------------------------------------------------
+# Titres valides de la colonne 2
 # -------------------------------------------------
+COL_TITLES = {
+    "designation",
+    "designations",
+    "description",
+    "description des services"
 }
+# -------------------------------------------------
+# Mots / lignes à ignorer
+# -------------------------------------------------
+IGNORE_KEYWORDS = {
+    "prix", "total", "ht", "htva", "tva",
+    "ttc", "general", "generale"
+}
 # -------------------------------------------------
+# Fonction principale
 # -------------------------------------------------
 def extract_second_column(image):
     if image is None:
         return "OCR : aucun texte détecté."
     data = result[0]
+    texts = data.get("rec_texts", [])
+    boxes = data.get("dt_polys", [])
     blocks = []
     for text, box in zip(texts, boxes):
         blocks.append((t, x, y))
+    if len(blocks) < 5:
+        return "Pas assez de texte exploitable."
     # -------------------------------------------------
+    # 1. Détection du X de la colonne cible via son titre
     # -------------------------------------------------
     col_x = None
     for text, x, y in blocks:
+        if normalize(text) in COL_TITLES:
             col_x = x
             break
     if col_x is None:
+        return "Titre de la colonne cible non détecté."
     # -------------------------------------------------
+    # 2. Sélection des blocs proches du X détecté
     # -------------------------------------------------
+    X_THRESHOLD = 45
     column_blocks = [
         (t, x, y) for t, x, y in blocks
+        if abs(x - col_x) < X_THRESHOLD
     ]
     if not column_blocks:
+        return "Colonne détectée mais vide."
     # -------------------------------------------------
+    # 3. Tri vertical (haut → bas)
     # -------------------------------------------------
     column_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
+    # 4. Fusion intelligente des lignes OCR
     # -------------------------------------------------
     merged = []
     current = ""
     last_y = None
+    Y_THRESHOLD = 22
     for text, x, y in column_blocks:
         nt = normalize(text)
         if any(k in nt for k in IGNORE_KEYWORDS):
             continue
+        if last_y is None or abs(y - last_y) > Y_THRESHOLD:
             if current:
                 merged.append(current.strip())
             current = text
         merged.append(current.strip())
     # -------------------------------------------------
+    # 5. Nettoyage final (cellules texte uniquement)
     # -------------------------------------------------
     final = []
     for line in merged:
+        nt = normalize(line)
+        if len(nt) < 4:
             continue
+        if sum(c.isdigit() for c in line) > len(line) / 2:
             continue
         final.append(line)
     if not final:
+        return "Aucune cellule texte valide trouvée."
+    # -------------------------------------------------
+    # 6. Résultat numéroté
+    # -------------------------------------------------
     return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
 # -------------------------------------------------
 # Interface Gradio
 # -------------------------------------------------
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
     outputs=gr.Textbox(label="Contenu de la colonne 2"),
+    title="Extraction fiable de la colonne 2 (Désignation / Description)",
+    description=(
+        "Extraction robuste de la deuxième colonne des tableaux scannés "
+        "(Désignation, DESIGNATIONS, Description, Description des services)."
+    )
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)