Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 24, 2025

Commit

ecf1403

verified ·

1 Parent(s): 2728550

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -68

app.py CHANGED Viewed

@@ -3,117 +3,134 @@ import numpy as np
 import unicodedata
 from paddleocr import PaddleOCR
-# -------------------------------------------------
-# OCR
-# -------------------------------------------------
-ocr = PaddleOCR(lang="fr", use_textline_orientation=True)
-# -------------------------------------------------
-# Normalisation
-# -------------------------------------------------
-def normalize(text):
     text = text.lower()
     text = unicodedata.normalize("NFD", text)
     text = "".join(c for c in text if unicodedata.category(c) != "Mn")
     return " ".join(text.split())
-# -------------------------------------------------
-# Titres possibles colonne 2
-# -------------------------------------------------
-COL_TITLES = [
     "designation",
     "designations",
     "description",
     "description des services"
-]
-# -------------------------------------------------
-# Lignes à ignorer
-# -------------------------------------------------
-IGNORE_KEYWORDS = [
-    "prix", "total", "ht", "htva", "tva", "ttc",
-    "general", "generale"
-]
-# -------------------------------------------------
-# Détection début cellule (règle métier)
-# -------------------------------------------------
-def is_new_cell(text):
-    return text and text[0].isupper()
-# -------------------------------------------------
 # Fonction principale
-# -------------------------------------------------
 def extract_second_column(image):
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
-    result = ocr.predict(img)
-    if not result or not result[0]:
         return "OCR : aucun texte détecté."
-    texts = result[0]["rec_texts"]
-    boxes = result[0]["dt_polys"]
     blocks = []
     for text, box in zip(texts, boxes):
-        text = text.strip()
-        if len(text) < 2:
             continue
         x = np.mean([p[0] for p in box])
         y = np.mean([p[1] for p in box])
-        blocks.append((text, x, y))
-    if len(blocks) < 5:
         return "Pas assez de texte exploitable."
     # -------------------------------------------------
-    # 1. Détection X colonne 2 par le TITRE (robuste)
     # -------------------------------------------------
     col_x = None
     for text, x, y in blocks:
         nt = normalize(text)
-        if any(nt.startswith(t) for t in COL_TITLES):
-            col_x = x
             break
     if col_x is None:
-        return "Titre de la colonne 2 non détecté."
     # -------------------------------------------------
-    # 2. Sélection blocs proches de X
     # -------------------------------------------------
-    X_THRESHOLD = 60
-    col_blocks = [(t, x, y) for t, x, y in blocks if abs(x - col_x) < X_THRESHOLD]
-    if not col_blocks:
         return "Colonne détectée mais vide."
     # -------------------------------------------------
-    # 3. Tri vertical
     # -------------------------------------------------
-    col_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
-    # 4. Reconstruction cellules (RÈGLE MAJUSCULE)
     # -------------------------------------------------
-    cells = []
     current = ""
     last_y = None
     Y_THRESHOLD = 28
-    for text, x, y in col_blocks:
         nt = normalize(text)
         if any(k in nt for k in IGNORE_KEYWORDS):
             continue
-        if current == "" or is_new_cell(text) or (last_y and abs(y - last_y) > Y_THRESHOLD):
             if current:
-                cells.append(current.strip())
             current = text
         else:
             current += " " + text
@@ -121,34 +138,54 @@ def extract_second_column(image):
         last_y = y
     if current:
-        cells.append(current.strip())
     # -------------------------------------------------
-    # 5. Nettoyage final
     # -------------------------------------------------
     final = []
-    for c in cells:
-        nt = normalize(c)
-        if len(nt) < 4:
             continue
-        if sum(ch.isdigit() for ch in c) > len(c) * 0.6:
             continue
-        final.append(c)
     if not final:
-        return "Aucune cellule valide trouvée."
-    return "\n".join(f"{i+1}. {c}" for i, c in enumerate(final))
-# -------------------------------------------------
-# Gradio
-# -------------------------------------------------
 demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
-    outputs=gr.Textbox(label="Contenu colonne 2", lines=15),
     title="Extraction fiable de la colonne 2",
-    description="Extraction robuste de la colonne 2 (Désignation / Description)"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import unicodedata
 from paddleocr import PaddleOCR
# =================================================
# PaddleOCR engine
# =================================================
# Single module-level OCR engine, configured for French text with text-line
# orientation handling enabled.
#
# NOTE(review): the former `show_log=False` argument was dropped. It is a
# PaddleOCR 2.x option, while `use_textline_orientation` and the `predict()`
# call used in this file belong to the 3.x API, which rejects unknown
# constructor arguments -- confirm against the PaddleOCR version pinned for
# this Space.
ocr = PaddleOCR(
    lang="fr",
    use_textline_orientation=True,
)
# =================================================
# Text normalisation (case + accents)
# =================================================
def normalize(text: str) -> str:
    """Return *text* lower-cased, accent-free and with whitespace collapsed.

    The text is lower-cased, decomposed with Unicode NFD, stripped of every
    combining mark (category ``Mn``), and finally all whitespace runs are
    reduced to single spaces with leading/trailing whitespace removed.
    """
    decomposed = unicodedata.normalize("NFD", text.lower())
    without_marks = [
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    ]
    words = "".join(without_marks).split()
    return " ".join(words)
# =================================================
# Possible header titles of column 2
# (stored already normalised: lower-case, no accents)
# =================================================
COL_TITLES = {
    "description",
    "description des services",
    "designation",
    "designations",
}
# =================================================
# Keywords whose presence makes an OCR block be ignored
# (stored already normalised: lower-case, no accents)
# =================================================
IGNORE_KEYWORDS = {
    "general",
    "generale",
    "ht",
    "htva",
    "prix",
    "total",
    "ttc",
    "tva",
}
# =================================================
# Main function
# =================================================
# Minimum number of usable OCR blocks required before attempting extraction.
_MIN_BLOCKS = 8
# Maximum horizontal distance (pixels) between a block centre and the header
# centre for the block to be considered part of the target column.
_X_THRESHOLD = 55
# Maximum vertical distance (pixels) between two consecutive blocks for them
# to be merged into the same cell.
_Y_THRESHOLD = 28


def _has_ignored_keyword(normalized_text, keywords):
    """Return True when any whole word of *normalized_text* is in *keywords*.

    Matching is done on whole words rather than substrings so that short
    keywords such as "ht" do not discard legitimate text like "achat".
    """
    import re  # local import: keeps this block free of new file-level deps

    return any(
        word in keywords for word in re.findall(r"[^\W_]+", normalized_text)
    )


def _ocr_blocks(data):
    """Convert one OCR result into a list of ``(text, x_center, y_center)``.

    Reads the ``rec_texts`` and ``dt_polys`` entries of *data*; texts shorter
    than two characters after stripping are dropped.
    """
    blocks = []
    for text, box in zip(data.get("rec_texts", []), data.get("dt_polys", [])):
        stripped = text.strip()
        if len(stripped) < 2:
            continue
        x = np.mean([p[0] for p in box])
        y = np.mean([p[1] for p in box])
        blocks.append((stripped, x, y))
    return blocks


def _find_column_x(blocks):
    """Return the x centre of the first block containing a column title, or None."""
    for text, x, _y in blocks:
        nt = normalize(text)
        if any(title in nt for title in COL_TITLES):
            return x
    return None


def _merge_lines(column_blocks):
    """Merge vertically close blocks (already sorted top to bottom) into cells."""
    merged = []
    current = ""
    last_y = None
    for text, _x, y in column_blocks:
        nt = normalize(text)
        # Skip the header itself here; otherwise it gets glued to the first
        # cell and escapes the exact-match title filter applied later.
        if nt in COL_TITLES:
            continue
        if _has_ignored_keyword(nt, IGNORE_KEYWORDS):
            continue
        if last_y is None or abs(y - last_y) > _Y_THRESHOLD:
            if current:
                merged.append(current.strip())
            current = text
        else:
            current += " " + text
        last_y = y
    if current:
        merged.append(current.strip())
    return merged


def _is_valid_cell(line):
    """Apply the final filters: not a title, long enough, mostly text, capitalised."""
    nt = normalize(line)
    if nt in COL_TITLES:
        return False
    if len(nt) < 5:
        return False
    # Reject lines where more than a third of the characters are digits.
    if sum(c.isdigit() for c in line) > len(line) / 3:
        return False
    # Business rule: a cell starts with an upper-case letter.
    return line[0].isupper()


def extract_second_column(image):
    """Extract the text cells of the "designation/description" table column.

    Args:
        image: the table image (converted with ``np.array``), or ``None``.

    Returns:
        A string with one numbered cell per line (``"1. ..."``), or a French
        status message when no image is given, OCR finds nothing, there is
        too little text, the column title is not found, the column is empty,
        or no cell survives the final filtering.
    """
    if image is None:
        return "Aucune image fournie."

    img = np.array(image)

    # 0. Rotate landscape images by 90 degrees. np.rot90 returns a view, so
    #    copy it into a contiguous array before handing it to the OCR engine.
    # NOTE(review): every image wider than tall is rotated, including upright
    # landscape scans -- confirm this is the intended behaviour.
    h, w = img.shape[:2]
    if w > h:
        img = np.ascontiguousarray(np.rot90(img, 1))

    # 1. OCR
    result = ocr.predict(img)
    if not result or not result[0]:
        return "OCR : aucun texte détecté."

    blocks = _ocr_blocks(result[0])
    if len(blocks) < _MIN_BLOCKS:
        return "Pas assez de texte exploitable."

    # 2. Locate the x position of the target column from its header title.
    col_x = _find_column_x(blocks)
    if col_x is None:
        return "Titre de la colonne cible non détecté."

    # 3. Keep only the blocks horizontally close to the header.
    column_blocks = [
        (t, x, y) for t, x, y in blocks if abs(x - col_x) < _X_THRESHOLD
    ]
    if not column_blocks:
        return "Colonne détectée mais vide."

    # 4. Sort top to bottom, then 5. merge OCR lines into cells.
    column_blocks.sort(key=lambda e: e[2])
    merged = _merge_lines(column_blocks)

    # 6. Final clean-up of the text cells.
    final = [line for line in merged if _is_valid_cell(line)]
    if not final:
        return "Aucune cellule texte valide trouvée."

    # 7. Numbered result.
    return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
# =================================================
# Gradio interface (Hugging Face)
# =================================================
# The UI takes one PIL image and shows the extracted column as text.
_table_image_input = gr.Image(type="pil", label="Image du tableau")
_column_text_output = gr.Textbox(label="Contenu de la colonne 2")
_interface_description = (
    "Extraction automatique et adaptative de la deuxième colonne "
    "(Désignation, DESIGNATIONS, Description, Description des services) "
    "à partir de tableaux scannés."
)

demo = gr.Interface(
    fn=extract_second_column,
    inputs=_table_image_input,
    outputs=_column_text_output,
    title="Extraction fiable de la colonne 2",
    description=_interface_description,
)

# Listen on all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)