Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Community

kebson committed on Dec 22, 2025

Commit

d539c06

verified ·

1 Parent(s): 1032de6

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -91

app.py CHANGED Viewed

@@ -2,138 +2,105 @@ import gradio as gr
 import numpy as np
 from paddleocr import PaddleOCR
 from sklearn.cluster import KMeans
 # -------------------------------------------------
-# OCR
 # -------------------------------------------------
-ocr = PaddleOCR(  # module-level OCR engine, created once at import
-    use_textline_orientation=True,
-    lang="fr"  # French recognition model
-)
 # -------------------------------------------------
-# EXTRACTION DESIGNATIONS
 # -------------------------------------------------
-def extract_column2_9_lines(image):
     # Run OCR on `image`, isolate one text column by clustering the x-centres
     # of the detected boxes, merge that column's lines into cells and return
     # at most nine of them as a numbered, newline-separated string.
     # Every early exit returns a French status message instead of raising.
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
     result = ocr.predict(img)
-    if not result:
-        return "OCR exécuté mais aucun texte détecté."
     # Only the first entry of the OCR result is used.
     data = result[0]
-    texts = data.get("rec_texts", [])
-    boxes = data.get("dt_polys", [])
-    # -------------------------------------------------
-    # 1. OCR COLLECTION
-    # -------------------------------------------------
-    elements = []
     for text, box in zip(texts, boxes):
         text = text.strip()
-        if len(text) < 2:
-            continue
         # Box centre = mean of the x / y coordinates of its points.
-        x_center = np.mean([p[0] for p in box])
-        y_center = np.mean([p[1] for p in box])
-        elements.append((x_center, y_center, text))
-    if len(elements) < 6:
-        return "Pas assez de texte exploitable."
-    # -------------------------------------------------
-    # 2. COLUMN CLUSTERING
-    # -------------------------------------------------
     # One-dimensional K-means on the x-centres; the cluster count is
     # len(elements) // 6 clamped to the range 3..7.
-    X = np.array([[e[0]] for e in elements])
-    n_cols = min(7, max(3, len(elements) // 6))
-    kmeans = KMeans(n_clusters=n_cols, random_state=42, n_init=10)
-    labels = kmeans.fit_predict(X)
-    columns = {}
-    for (x, y, text), label in zip(elements, labels):
-        columns.setdefault(label, []).append((x, y, text))
-    # -------------------------------------------------
-    # 3. PICK THE DESIGNATIONS COLUMN
-    # -------------------------------------------------
     # Score = total character count of the entries that contain no digit;
     # the cluster with the highest score is kept.
-    def column_score(col):
-        return sum(
-            len(t) for _, _, t in col
-            if not any(c.isdigit() for c in t)
-        )
-    col = max(columns.values(), key=column_score)
-    col.sort(key=lambda e: e[1])  # top → bottom
-    # -------------------------------------------------
-    # 4. HEADER REMOVAL
-    # -------------------------------------------------
     # Only the first entry whose upper-cased text is exactly "DESIGNATIONS"
     # is dropped; later occurrences are kept.
-    cleaned = []
-    header_removed = False
-    for x, y, text in col:
-        if not header_removed and text.upper().strip() == "DESIGNATIONS":
-            header_removed = True
             continue
-        cleaned.append((y, text))
     # -------------------------------------------------
-    # 5. CELL MERGING (IMPROVED LOGIC)
     # -------------------------------------------------
     # A new cell starts on the first entry, or when all three hold: the
     # vertical gap to the previous entry exceeds 35 (box-coordinate units),
     # the cell being built is longer than 30 characters, and the entry
     # starts with an upper-case character. Otherwise the entry is appended
     # to the current cell with a space.
-    merged = []
     current = ""
-    last_y = None
-    for y, text in cleaned:
-        new_cell = False
-        if last_y is None:
-            new_cell = True
-        elif abs(y - last_y) > 35 and len(current) > 30 and text[0].isupper():
-            new_cell = True
-        if new_cell:
-            if current:
-                merged.append(current.strip())
             current = text
         else:
             current += " " + text
-        last_y = y
     if current:
-        merged.append(current.strip())
-    # -------------------------------------------------
-    # 6. FINAL CLEAN-UP
-    # -------------------------------------------------
     # Drop cells in which digits make up more than 45 % of the characters,
     # then keep the first nine.
-    final = []
-    for line in merged:
-        if sum(c.isdigit() for c in line) > len(line) * 0.45:
-            continue
-        final.append(line)
-    final = final[:9]
-    if not final:
-        return "Aucune ligne exploitable détectée."
-    return "\n".join([f"{i+1}. {l}" for i, l in enumerate(final)])
 # -------------------------------------------------
 # INTERFACE
 # -------------------------------------------------
 demo = gr.Interface(  # Gradio UI: one image input, one text output
-    fn=extract_column2_9_lines,
     inputs=gr.Image(type="pil", label="Image du tableau"),
-    outputs=gr.Textbox(label="Colonne DESIGNATIONS (9 lignes)"),
     title="Extraction fiable de la colonne DESIGNATIONS",
-    description="Optimisé pour devis et bordereaux photographiés"
 )
 # Start the server on all network interfaces ("0.0.0.0"), port 7860.
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import numpy as np
 from paddleocr import PaddleOCR
 from sklearn.cluster import KMeans
+import re
+ocr = PaddleOCR(use_textline_orientation=True, lang="fr")  # module-level OCR engine (French model), created once at import
 # -------------------------------------------------
+# OUTILS TEXTE
 # -------------------------------------------------
def is_continuation(text: str) -> bool:
    """Return True when *text* begins with a phrase that continues a previous line.

    The text is lower-cased and stripped of surrounding whitespace before the
    comparison. Recognised openings are ``"et "``, ``"avec "``,
    ``"y compris"`` and ``"compr"``.

    Because stripping happens first, a text that consists only of ``"et"`` or
    ``"avec"`` (nothing after the word) does NOT match: the trailing space of
    the prefix is required.

    Args:
        text: One OCR text fragment.

    Returns:
        True if the normalised text starts with one of the prefixes.
    """
    # str.startswith accepts a tuple, which replaces the chain of `or` tests.
    continuation_prefixes = ("et ", "avec ", "y compris", "compr")
    return text.lower().strip().startswith(continuation_prefixes)
def has_too_many_digits(text: str, *, threshold: float = 0.4) -> bool:
    """Return True when digits make up more than *threshold* of *text*.

    Args:
        text: String to inspect. An empty string always yields False.
        threshold: Maximum tolerated digit ratio (digits / total characters).
            The comparison is strict, so a ratio exactly equal to the
            threshold is still accepted. Defaults to 0.4.

    Returns:
        True if the number of digit characters exceeds
        ``len(text) * threshold``.
    """
    digit_count = sum(ch.isdigit() for ch in text)
    return digit_count > len(text) * threshold
def looks_like_designation(text: str) -> bool:
    """Return True when *text* is a plausible designation fragment.

    A fragment is rejected when, after stripping surrounding whitespace, it

    * is shorter than 10 characters,
    * is flagged by ``has_too_many_digits``, or
    * starts (case-insensitively) with one of the tokens ``m2``, ``m3``,
      ``ml``, ``u`` or ``ff`` followed by a word boundary.

    Args:
        text: One OCR text fragment.

    Returns:
        True if none of the rejection rules applies.
    """
    # Strip first so padding neither counts toward the minimum length nor
    # hides a leading unit token from the ^-anchored pattern below.
    candidate = text.strip()
    if len(candidate) < 10:
        return False
    if has_too_many_digits(candidate):
        return False
    return re.match(r"^(m2|m3|ml|u|ff)\b", candidate.lower()) is None
 # -------------------------------------------------
+# EXTRACTION
 # -------------------------------------------------
def extract_designations(image):
    """Extract designation cells from a photographed table.

    Steps: run OCR, order the recognised fragments top to bottom by the mean
    y of their detection polygon, drop the ``DESIGNATIONS`` header, discard
    fragments rejected by ``looks_like_designation``, merge the remaining
    fragments into cells and return the first nine as a numbered list.

    Args:
        image: Image to analyse (anything ``np.array`` can convert), or None.

    Returns:
        A newline-separated string ``"1. ...\\n2. ..."``, or a French status
        message when no image was given or nothing usable was found.
    """
    if image is None:
        return "Aucune image fournie."

    result = ocr.predict(np.array(image))
    # An empty result means OCR found nothing; indexing [0] would raise
    # IndexError and surface as a runtime error in the UI.
    if not result:
        return "Aucune désignation détectée."

    data = result[0]
    # Tolerate missing keys instead of raising KeyError.
    texts = data.get("rec_texts", [])
    boxes = data.get("dt_polys", [])

    # Pair each stripped text with the mean y of its polygon, then order the
    # pairs top to bottom. Sorting on the key only keeps ties in OCR order.
    lines = [
        (np.mean([point[1] for point in box]), text.strip())
        for text, box in zip(texts, boxes)
    ]
    lines.sort(key=lambda item: item[0])

    # Drop the column header.
    filtered = [text for _, text in lines if text.upper() != "DESIGNATIONS"]

    # Merge fragments into cells: a fragment opens a new cell only when it is
    # not a continuation phrase, starts with an upper-case character and is
    # longer than 20 characters; otherwise it is appended to the current cell.
    cells = []
    current = ""
    for text in filtered:
        if not looks_like_designation(text):
            continue
        if not current:
            current = text
        elif is_continuation(text):
            current += " " + text
        elif text[0].isupper() and len(text) > 20:
            cells.append(current.strip())
            current = text
        else:
            current += " " + text
    if current:
        cells.append(current.strip())

    max_cells = 9  # only the first nine cells are reported
    cells = cells[:max_cells]
    if not cells:
        return "Aucune désignation détectée."
    return "\n".join(f"{i+1}. {c}" for i, c in enumerate(cells))
 # -------------------------------------------------
 # INTERFACE
 # -------------------------------------------------
 demo = gr.Interface(  # Gradio UI: one image input, one text output
+    fn=extract_designations,
     inputs=gr.Image(type="pil", label="Image du tableau"),
+    outputs=gr.Textbox(label="Colonne DESIGNATIONS"),
     title="Extraction fiable de la colonne DESIGNATIONS",
+    description="Approche textuelle robuste pour devis et bordereaux"
 )
 # Start the server on all network interfaces ("0.0.0.0"), port 7860.
 demo.launch(server_name="0.0.0.0", server_port=7860)