Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Community

kebson committed on Dec 23, 2025

Commit

2728550

verified ·

1 Parent(s): 9becf7c

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -59

app.py CHANGED Viewed

@@ -2,42 +2,44 @@ import gradio as gr
 import numpy as np
 import unicodedata
 from paddleocr import PaddleOCR
-from sklearn.cluster import KMeans
 # -------------------------------------------------
 # OCR
 # -------------------------------------------------
-ocr = PaddleOCR(
-    lang="fr",
-    use_textline_orientation=True
-)
 # -------------------------------------------------
-# Normalisation texte (casse + accents)
 # -------------------------------------------------
-def normalize(text: str) -> str:
     text = text.lower()
     text = unicodedata.normalize("NFD", text)
     text = "".join(c for c in text if unicodedata.category(c) != "Mn")
     return " ".join(text.split())
 # -------------------------------------------------
-# Titres valides de la colonne 2
 # -------------------------------------------------
-COL_TITLES = {
     "designation",
     "designations",
     "description",
     "description des services"
-}
 # -------------------------------------------------
-# Mots / lignes à ignorer
 # -------------------------------------------------
-IGNORE_KEYWORDS = {
-    "prix", "total", "ht", "htva", "tva",
-    "ttc", "general", "generale"
-}
 # -------------------------------------------------
 # Fonction principale
@@ -49,73 +51,69 @@ def extract_second_column(image):
     img = np.array(image)
     result = ocr.predict(img)
-    if not result:
         return "OCR : aucun texte détecté."
-    data = result[0]
-    texts = data.get("rec_texts", [])
-    boxes = data.get("dt_polys", [])
     blocks = []
     for text, box in zip(texts, boxes):
-        t = text.strip()
-        if len(t) < 2:
             continue
         x = np.mean([p[0] for p in box])
         y = np.mean([p[1] for p in box])
-        blocks.append((t, x, y))
     if len(blocks) < 5:
         return "Pas assez de texte exploitable."
     # -------------------------------------------------
-    # 1. Détection du X de la colonne cible via son titre
     # -------------------------------------------------
     col_x = None
     for text, x, y in blocks:
-        if normalize(text) in COL_TITLES:
             col_x = x
             break
     if col_x is None:
-        return "Titre de la colonne cible non détecté."
     # -------------------------------------------------
-    # 2. Sélection des blocs proches du X détecté
     # -------------------------------------------------
-    X_THRESHOLD = 45
-    column_blocks = [
-        (t, x, y) for t, x, y in blocks
-        if abs(x - col_x) < X_THRESHOLD
-    ]
-    if not column_blocks:
         return "Colonne détectée mais vide."
     # -------------------------------------------------
-    # 3. Tri vertical (haut → bas)
     # -------------------------------------------------
-    column_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
-    # 4. Fusion intelligente des lignes OCR
     # -------------------------------------------------
-    merged = []
     current = ""
     last_y = None
-    Y_THRESHOLD = 22
-    for text, x, y in column_blocks:
         nt = normalize(text)
         if any(k in nt for k in IGNORE_KEYWORDS):
             continue
-        if last_y is None or abs(y - last_y) > Y_THRESHOLD:
             if current:
-                merged.append(current.strip())
             current = text
         else:
             current += " " + text
@@ -123,40 +121,34 @@ def extract_second_column(image):
         last_y = y
     if current:
-        merged.append(current.strip())
     # -------------------------------------------------
-    # 5. Nettoyage final (cellules texte uniquement)
     # -------------------------------------------------
     final = []
-    for line in merged:
-        nt = normalize(line)
         if len(nt) < 4:
             continue
-        if sum(c.isdigit() for c in line) > len(line) / 2:
             continue
-        final.append(line)
     if not final:
-        return "Aucune cellule texte valide trouvée."
-    # -------------------------------------------------
-    # 6. Résultat numéroté
-    # -------------------------------------------------
-    return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
 # -------------------------------------------------
-# Interface Gradio
 # -------------------------------------------------
 demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
-    outputs=gr.Textbox(label="Contenu de la colonne 2"),
-    title="Extraction fiable de la colonne 2 (Désignation / Description)",
-    description=(
-        "Extraction robuste de la deuxième colonne des tableaux scannés "
-        "(Désignation, DESIGNATIONS, Description, Description des services)."
-    )
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import numpy as np
 import unicodedata
 from paddleocr import PaddleOCR
 # -------------------------------------------------
 # OCR
 # -------------------------------------------------
+ocr = PaddleOCR(lang="fr", use_textline_orientation=True)
 # -------------------------------------------------
+# Normalisation
 # -------------------------------------------------
+def normalize(text):
     text = text.lower()
     text = unicodedata.normalize("NFD", text)
     text = "".join(c for c in text if unicodedata.category(c) != "Mn")
     return " ".join(text.split())
 # -------------------------------------------------
+# Titres possibles colonne 2
 # -------------------------------------------------
+COL_TITLES = [
     "designation",
     "designations",
     "description",
     "description des services"
+]
+# -------------------------------------------------
+# Lignes à ignorer
+# -------------------------------------------------
+IGNORE_KEYWORDS = [
+    "prix", "total", "ht", "htva", "tva", "ttc",
+    "general", "generale"
+]
 # -------------------------------------------------
+# Détection début cellule (règle métier)
 # -------------------------------------------------
+def is_new_cell(text):
+    return text and text[0].isupper()
 # -------------------------------------------------
 # Fonction principale
     img = np.array(image)
     result = ocr.predict(img)
+    if not result or not result[0]:
         return "OCR : aucun texte détecté."
+    texts = result[0]["rec_texts"]
+    boxes = result[0]["dt_polys"]
     blocks = []
     for text, box in zip(texts, boxes):
+        text = text.strip()
+        if len(text) < 2:
             continue
         x = np.mean([p[0] for p in box])
         y = np.mean([p[1] for p in box])
+        blocks.append((text, x, y))
     if len(blocks) < 5:
         return "Pas assez de texte exploitable."
     # -------------------------------------------------
+    # 1. Détection X colonne 2 par le TITRE (robuste)
     # -------------------------------------------------
     col_x = None
     for text, x, y in blocks:
+        nt = normalize(text)
+        if any(nt.startswith(t) for t in COL_TITLES):
             col_x = x
             break
     if col_x is None:
+        return "Titre de la colonne 2 non détecté."
     # -------------------------------------------------
+    # 2. Sélection blocs proches de X
     # -------------------------------------------------
+    X_THRESHOLD = 60
+    col_blocks = [(t, x, y) for t, x, y in blocks if abs(x - col_x) < X_THRESHOLD]
+    if not col_blocks:
         return "Colonne détectée mais vide."
     # -------------------------------------------------
+    # 3. Tri vertical
     # -------------------------------------------------
+    col_blocks.sort(key=lambda e: e[2])
     # -------------------------------------------------
+    # 4. Reconstruction cellules (RÈGLE MAJUSCULE)
     # -------------------------------------------------
+    cells = []
     current = ""
     last_y = None
+    Y_THRESHOLD = 28
+    for text, x, y in col_blocks:
         nt = normalize(text)
         if any(k in nt for k in IGNORE_KEYWORDS):
             continue
+        if current == "" or is_new_cell(text) or (last_y and abs(y - last_y) > Y_THRESHOLD):
             if current:
+                cells.append(current.strip())
             current = text
         else:
             current += " " + text
         last_y = y
     if current:
+        cells.append(current.strip())
     # -------------------------------------------------
+    # 5. Nettoyage final
     # -------------------------------------------------
     final = []
+    for c in cells:
+        nt = normalize(c)
         if len(nt) < 4:
             continue
+        if sum(ch.isdigit() for ch in c) > len(c) * 0.6:
             continue
+        final.append(c)
     if not final:
+        return "Aucune cellule valide trouvée."
+    return "\n".join(f"{i+1}. {c}" for i, c in enumerate(final))
 # -------------------------------------------------
+# Gradio
 # -------------------------------------------------
 demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
+    outputs=gr.Textbox(label="Contenu colonne 2", lines=15),
+    title="Extraction fiable de la colonne 2",
+    description="Extraction robuste de la colonne 2 (Désignation / Description)"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)