Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 24, 2025

Commit

ed975bc

verified ·

1 Parent(s): daa9804

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -51

app.py CHANGED Viewed

@@ -3,26 +3,26 @@ import numpy as np
 import unicodedata
 from paddleocr import PaddleOCR
-# =================================================
-# OCR Paddle (HF compatible)
-# =================================================
 ocr = PaddleOCR(
     lang="fr",
     use_textline_orientation=True
 )
-# =================================================
 # Normalisation texte (casse + accents)
-# =================================================
 def normalize(text: str) -> str:
     """Return *text* lowercased, without accents, with whitespace collapsed.

     The input is lowercased and decomposed to NFD so that each accent becomes
     a separate combining mark; characters of Unicode category ``Mn`` are then
     dropped, and runs of whitespace are reduced to single spaces with the
     ends trimmed.
     """
     decomposed = unicodedata.normalize("NFD", text.lower())
     # After NFD decomposition, accents are standalone "Mn" code points.
     kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
     words = "".join(kept).split()
     return " ".join(words)
-# =================================================
-# Titres possibles de la colonne 2
-# =================================================
 COL_TITLES = {
     "designation",
     "designations",
@@ -30,34 +30,32 @@ COL_TITLES = {
     "description des services"
 }
-# =================================================
-# Mots à ignorer
-# =================================================
 IGNORE_KEYWORDS = {
-    "prix", "ht", "htva", "tva", "ttc",
-    "total", "generale", "general"
 }
-# =================================================
 # Fonction principale
-# =================================================
 def extract_second_column(image):
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
-    # -------------------------------------------------
-    # Rotation automatique si image couchée
-    # -------------------------------------------------
-    h, w = img.shape[:2]
-    if w > h:
-        img = np.rot90(img, 1)
-    # -------------------------------------------------
-    # OCR
-    # -------------------------------------------------
     result = ocr.predict(img)
     if not result:
         return "OCR : aucun texte détecté."
@@ -76,56 +74,60 @@ def extract_second_column(image):
         blocks.append((t, x, y))
-    if len(blocks) < 8:
         return "Pas assez de texte exploitable."
     # -------------------------------------------------
-    # Détection du X de la colonne 2 via le titre
     # -------------------------------------------------
     col_x = None
     for text, x, y in blocks:
-        nt = normalize(text)
-        for title in COL_TITLES:
-            if title in nt:
-                col_x = x
-                break
-        if col_x is not None:
             break
     if col_x is None:
         return "Titre de la colonne cible non détecté."
     # -------------------------------------------------
-    # Sélection des blocs de la colonne
     # -------------------------------------------------
-    X_THRESHOLD = 55
     column_blocks = [
         (t, x, y) for t, x, y in blocks
-        if abs(x - col_x) < X_THRESHOLD
     ]
     if not column_blocks:
         return "Colonne détectée mais vide."
     # -------------------------------------------------
-    # Tri vertical
     # -------------------------------------------------
     column_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
-    # Fusion intelligente des lignes OCR
     # -------------------------------------------------
     merged = []
     current = ""
     last_y = None
-    Y_THRESHOLD = 28
     for text, x, y in column_blocks:
         nt = normalize(text)
         if any(k in nt for k in IGNORE_KEYWORDS):
             continue
         if last_y is None or abs(y - last_y) > Y_THRESHOLD:
             if current:
                 merged.append(current.strip())
@@ -139,19 +141,16 @@ def extract_second_column(image):
         merged.append(current.strip())
     # -------------------------------------------------
-    # Nettoyage final
     # -------------------------------------------------
     final = []
     for line in merged:
         nt = normalize(line)
-        if nt in COL_TITLES:
-            continue
-        if len(nt) < 5:
             continue
-        if sum(c.isdigit() for c in line) > len(line) / 3:
-            continue
-        if not line[0].isupper():
             continue
         final.append(line)
@@ -159,18 +158,21 @@ def extract_second_column(image):
     if not final:
         return "Aucune cellule texte valide trouvée."
     return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
-# =================================================
-# Interface Gradio
-# =================================================
 demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
     outputs=gr.Textbox(label="Contenu de la colonne 2"),
     title="Extraction fiable de la colonne 2",
     description=(
-        "Extraction adaptative de la deuxième colonne "
         "(Désignation, DESIGNATIONS, Description, Description des services)."
     )
 )

 import unicodedata
 from paddleocr import PaddleOCR
+# -------------------------------------------------
+# OCR (compatible Hugging Face)
+# -------------------------------------------------
 ocr = PaddleOCR(
     lang="fr",
     use_textline_orientation=True
 )
+# -------------------------------------------------
 # Normalisation texte (casse + accents)
+# -------------------------------------------------
 def normalize(text: str) -> str:
     """Return *text* lowercased, without accents, with whitespace collapsed.

     The input is lowercased and decomposed to NFD so that each accent becomes
     a separate combining mark; characters of Unicode category ``Mn`` are then
     dropped, and runs of whitespace are reduced to single spaces with the
     ends trimmed.
     """
     decomposed = unicodedata.normalize("NFD", text.lower())
     # After NFD decomposition, accents are standalone "Mn" code points.
     kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
     words = "".join(kept).split()
     return " ".join(words)
+# -------------------------------------------------
+# Titres valides de la colonne 2
+# -------------------------------------------------
 COL_TITLES = {
     "designation",
     "designations",
     "description des services"
 }
+# -------------------------------------------------
+# Mots / lignes à ignorer
+# -------------------------------------------------
 IGNORE_KEYWORDS = {
+    "prix", "total", "ht", "htva", "tva",
+    "ttc", "general", "generale"
 }
+# -------------------------------------------------
+# Métadonnées à exclure (hors tableau)
+# -------------------------------------------------
+META_KEYWORDS = {
+    "dpo", "dao", "ref", "reference",
+    "date", "nme", ":"
+}
+# -------------------------------------------------
 # Fonction principale
+# -------------------------------------------------
 def extract_second_column(image):
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
     result = ocr.predict(img)
     if not result:
         return "OCR : aucun texte détecté."
         blocks.append((t, x, y))
+    if len(blocks) < 5:
         return "Pas assez de texte exploitable."
     # -------------------------------------------------
+    # 1. Détection du X de la colonne cible (par le titre)
     # -------------------------------------------------
     col_x = None
+    title_y = None
     for text, x, y in blocks:
+        if normalize(text) in COL_TITLES:
+            col_x = x
+            title_y = y
             break
     if col_x is None:
         return "Titre de la colonne cible non détecté."
     # -------------------------------------------------
+    # 2. Sélection des blocs de la colonne (SOUS le titre)
     # -------------------------------------------------
+    X_THRESHOLD = 45
     column_blocks = [
         (t, x, y) for t, x, y in blocks
+        if abs(x - col_x) < X_THRESHOLD and y > title_y
     ]
     if not column_blocks:
         return "Colonne détectée mais vide."
     # -------------------------------------------------
+    # 3. Tri vertical (haut → bas)
     # -------------------------------------------------
     column_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
+    # 4. Fusion contrôlée des lignes OCR
     # -------------------------------------------------
     merged = []
     current = ""
     last_y = None
+    Y_THRESHOLD = 22
     for text, x, y in column_blocks:
         nt = normalize(text)
+        # Ignore lignes de totaux / prix
         if any(k in nt for k in IGNORE_KEYWORDS):
             continue
+        # Ignore métadonnées résiduelles
+        if any(k in nt for k in META_KEYWORDS):
+            continue
         if last_y is None or abs(y - last_y) > Y_THRESHOLD:
             if current:
                 merged.append(current.strip())
         merged.append(current.strip())
     # -------------------------------------------------
+    # 5. Nettoyage final (cellules texte métier uniquement)
     # -------------------------------------------------
     final = []
     for line in merged:
         nt = normalize(line)
+        if len(nt) < 4:
             continue
+        if sum(c.isdigit() for c in line) > len(line) / 2:
             continue
         final.append(line)
     if not final:
         return "Aucune cellule texte valide trouvée."
+    # -------------------------------------------------
+    # 6. Résultat numéroté
+    # -------------------------------------------------
     return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
+# -------------------------------------------------
+# Interface Gradio (Hugging Face)
+# -------------------------------------------------
 demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
     outputs=gr.Textbox(label="Contenu de la colonne 2"),
     title="Extraction fiable de la colonne 2",
     description=(
+        "Extraction robuste de la deuxième colonne des tableaux scannés "
         "(Désignation, DESIGNATIONS, Description, Description des services)."
     )
 )