Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Community

kebson committed on Dec 24, 2025

Commit

76bdf65

verified ·

1 Parent(s): ed975bc

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -60

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import unicodedata
 from paddleocr import PaddleOCR
 # -------------------------------------------------
-# OCR (compatible Hugging Face)
 # -------------------------------------------------
 ocr = PaddleOCR(
     lang="fr",
@@ -12,7 +12,7 @@ ocr = PaddleOCR(
 )
 # -------------------------------------------------
-# Normalisation texte (casse + accents)
 # -------------------------------------------------
 def normalize(text: str) -> str:
     text = text.lower()
@@ -21,33 +21,19 @@ def normalize(text: str) -> str:
     return " ".join(text.split())
 # -------------------------------------------------
-# Titres valides de la colonne 2
-# -------------------------------------------------
-COL_TITLES = {
-    "designation",
-    "designations",
-    "description",
-    "description des services"
-}
-# -------------------------------------------------
-# Mots / lignes à ignorer
 # -------------------------------------------------
 IGNORE_KEYWORDS = {
-    "prix", "total", "ht", "htva", "tva",
-    "ttc", "general", "generale"
 }
-# -------------------------------------------------
-# Métadonnées à exclure (hors tableau)
-# -------------------------------------------------
-META_KEYWORDS = {
-    "dpo", "dao", "ref", "reference",
-    "date", "nme", ":"
-}
 # -------------------------------------------------
-# Fonction principale
 # -------------------------------------------------
 def extract_second_column(image):
     if image is None:
@@ -60,8 +46,8 @@ def extract_second_column(image):
         return "OCR : aucun texte détecté."
     data = result[0]
-    texts = data.get("rec_texts", [])
-    boxes = data.get("dt_polys", [])
     blocks = []
     for text, box in zip(texts, boxes):
@@ -74,61 +60,52 @@ def extract_second_column(image):
         blocks.append((t, x, y))
-    if len(blocks) < 5:
-        return "Pas assez de texte exploitable."
     # -------------------------------------------------
-    # 1. Détection du X de la colonne cible (par le titre)
     # -------------------------------------------------
-    col_x = None
     title_y = None
     for text, x, y in blocks:
-        if normalize(text) in COL_TITLES:
             col_x = x
             title_y = y
             break
     if col_x is None:
-        return "Titre de la colonne cible non détecté."
     # -------------------------------------------------
-    # 2. Sélection des blocs de la colonne (SOUS le titre)
     # -------------------------------------------------
-    X_THRESHOLD = 45
     column_blocks = [
         (t, x, y) for t, x, y in blocks
-        if abs(x - col_x) < X_THRESHOLD and y > title_y
     ]
     if not column_blocks:
-        return "Colonne détectée mais vide."
     # -------------------------------------------------
-    # 3. Tri vertical (haut → bas)
     # -------------------------------------------------
     column_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
-    # 4. Fusion contrôlée des lignes OCR
     # -------------------------------------------------
     merged = []
     current = ""
     last_y = None
-    Y_THRESHOLD = 22
     for text, x, y in column_blocks:
         nt = normalize(text)
-        # Ignore lignes de totaux / prix
         if any(k in nt for k in IGNORE_KEYWORDS):
             continue
-        # Ignore métadonnées résiduelles
-        if any(k in nt for k in META_KEYWORDS):
-            continue
-        if last_y is None or abs(y - last_y) > Y_THRESHOLD:
             if current:
                 merged.append(current.strip())
             current = text
@@ -141,40 +118,33 @@ def extract_second_column(image):
         merged.append(current.strip())
     # -------------------------------------------------
-    # 5. Nettoyage final (cellules texte métier uniquement)
     # -------------------------------------------------
     final = []
     for line in merged:
-        nt = normalize(line)
-        if len(nt) < 4:
             continue
-        if sum(c.isdigit() for c in line) > len(line) / 2:
             continue
         final.append(line)
     if not final:
-        return "Aucune cellule texte valide trouvée."
-    # -------------------------------------------------
-    # 6. Résultat numéroté
-    # -------------------------------------------------
     return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
 # -------------------------------------------------
-# Interface Gradio (Hugging Face)
 # -------------------------------------------------
 demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
     outputs=gr.Textbox(label="Contenu de la colonne 2"),
-    title="Extraction fiable de la colonne 2",
-    description=(
-        "Extraction robuste de la deuxième colonne des tableaux scannés "
-        "(Désignation, DESIGNATIONS, Description, Description des services)."
-    )
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

 from paddleocr import PaddleOCR
 # -------------------------------------------------
+# OCR
 # -------------------------------------------------
 ocr = PaddleOCR(
     lang="fr",
 )
 # -------------------------------------------------
+# Normalisation texte
 # -------------------------------------------------
 def normalize(text: str) -> str:
     text = text.lower()
     return " ".join(text.split())
 # -------------------------------------------------
+# Paramètres spécifiques image "Description des services"
 # -------------------------------------------------
+COLUMN_TITLE = "description des services"
 IGNORE_KEYWORDS = {
+    "prix", "total", "ht", "htva", "tva", "ttc",
+    "general", "generale"
 }
+X_THRESHOLD = 45
+Y_NEW_CELL = 32   # seuil volontairement élevé → empêche fusion abusive
 # -------------------------------------------------
+# Extraction colonne 2
 # -------------------------------------------------
 def extract_second_column(image):
     if image is None:
         return "OCR : aucun texte détecté."
     data = result[0]
+    texts = data["rec_texts"]
+    boxes = data["dt_polys"]
     blocks = []
     for text, box in zip(texts, boxes):
         blocks.append((t, x, y))
     # -------------------------------------------------
+    # 1. Trouver le titre exact de la colonne
     # -------------------------------------------------
     title_y = None
+    col_x = None
     for text, x, y in blocks:
+        if normalize(text) == COLUMN_TITLE:
             col_x = x
             title_y = y
             break
     if col_x is None:
+        return "Titre 'Description des services' non détecté."
     # -------------------------------------------------
+    # 2. Garder uniquement le texte SOUS le titre
     # -------------------------------------------------
     column_blocks = [
         (t, x, y) for t, x, y in blocks
+        if abs(x - col_x) < X_THRESHOLD and y > title_y + 15
     ]
     if not column_blocks:
+        return "Aucune cellule détectée sous la colonne."
     # -------------------------------------------------
+    # 3. Tri vertical
     # -------------------------------------------------
     column_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
+    # 4. Fusion contrôlée (évite fusion Tuyau / Coude)
     # -------------------------------------------------
     merged = []
     current = ""
     last_y = None
     for text, x, y in column_blocks:
         nt = normalize(text)
         if any(k in nt for k in IGNORE_KEYWORDS):
             continue
+        # nouvelle cellule
+        if last_y is None or abs(y - last_y) > Y_NEW_CELL:
             if current:
                 merged.append(current.strip())
             current = text
         merged.append(current.strip())
     # -------------------------------------------------
+    # 5. Nettoyage final
     # -------------------------------------------------
     final = []
     for line in merged:
+        if not line[0].isupper():
             continue
+        if sum(c.isdigit() for c in line) > len(line) * 0.3:
             continue
         final.append(line)
     if not final:
+        return "Aucune cellule valide trouvée."
     return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
 # -------------------------------------------------
+# Interface Gradio
 # -------------------------------------------------
 demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
     outputs=gr.Textbox(label="Contenu de la colonne 2"),
+    title="Extraction fiable – Colonne 2 (Description des services)",
+    description="Extraction robuste et ordonnée des cellules texte."
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)