Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 22, 2025

Commit

fe12926

verified ·

1 Parent(s): d3dbd12

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -112

app.py CHANGED Viewed

@@ -1,147 +1,114 @@
 import gradio as gr
 import numpy as np
 from paddleocr import PaddleOCR
-import re
-# -------------------------------------------------
-# OCR
-# -------------------------------------------------
-# Module-level PaddleOCR instance, built once at import time with lang="fr";
-# extract_designations() calls its predict() method on every request.
-ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
-# -------------------------------------------------
-# RÈGLES MÉTIER
-# -------------------------------------------------
-def is_title(text):
-    """Return True when *text* contains one of the heading keywords.
-
-    The comparison is case-insensitive (the text is upper-cased first) and is
-    a plain substring test, so a keyword embedded in a longer word matches too.
-    """
-    t = text.upper()
-    return any(k in t for k in [
-        "CADRE DE DEVIS",
-        "LOT",
-        "AXE",
-        "PRIX TOTAL",
-        "TVA",
-        "TTC"
-    ])
-def is_f_start(text):
-    """Return True when *text* starts with "F" but not with "F6"."""
-    # A capital F marks the start of a cell,
-    # EXCEPT for F6.
-    return text.startswith("F") and not text.startswith("F6")
-def is_f6(text):
-    """Return True when *text* starts with "F6" (case-sensitive)."""
-    return text.startswith("F6")
-def is_continuation(text):
-    """Return True when *text* begins with one of the continuation prefixes.
-
-    The text is lower-cased and stripped before the prefix test, so the check
-    ignores case and surrounding whitespace.
-    """
-    t = text.lower().strip()
-    return (
-        t.startswith("avec")
-        or t.startswith("et ")
-        or t.startswith("y compris")
-        or t.startswith("compris")
-        or t.startswith("basse")
-        or t.startswith("franchissable")
-        or t.startswith("pour ")
-        or t.startswith("f6")
-    )
-def looks_like_text(text):
-    """Return True when *text* has at least 4 characters and is not digits only."""
-    return len(text) >= 4 and not re.match(r"^\d+$", text)
-# -------------------------------------------------
-# EXTRACTION PRINCIPALE
-# -------------------------------------------------
-def extract_designations(image):
-    """Run OCR on *image* and return the designation cells as a numbered list.
-
-    Args:
-        image: Anything ``np.array`` accepts, or None. The Gradio interface
-            defined in this file passes a PIL image.
-
-    Returns:
-        A string with one ``"<n>. <cell>"`` entry per line (at most nine), or
-        a French message when no image was given or no cell was found.
-    """
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
-    result = ocr.predict(img)[0]
-    texts = result["rec_texts"]
-    boxes = result["dt_polys"]
-    # Vertical sort: order the fragments by the mean y of their box points.
-    lines = []
     for text, box in zip(texts, boxes):
         y = np.mean([p[1] for p in box])
-        lines.append((y, text.strip()))
-    lines.sort(key=lambda x: x[0])
-    # -----------------------------
-    # AFTER "DESIGNATIONS"
-    # -----------------------------
-    # Everything up to and including the header fragment is dropped; after it,
-    # heading-like and non-text fragments are filtered out.
-    started = False
-    cleaned = []
-    for _, text in lines:
-        if text.upper() == "DESIGNATIONS":
-            started = True
-            continue
-        if not started:
-            continue
-        if is_title(text):
-            continue
-        if looks_like_text(text):
-            cleaned.append(text)
-    # -----------------------------
-    # BUILDING THE CELLS
-    # -----------------------------
-    cells = []
-    current = ""
-    for text in cleaned:
-        # CAPITAL F (≠ F6) → NEW CELL
-        if is_f_start(text):
-            if current:
-                cells.append(current.strip())
-            current = text
-            continue
-        # F6 → FORCED CONTINUATION
-        if is_f6(text):
-            current += " " + text
-            continue
-        if not current:
-            current = text
-            continue
-        if is_continuation(text):
-            current += " " + text
         else:
-            # New logical cell (strong change): the fragment starts with an
-            # upper-case character and the current cell is already longer
-            # than 25 characters.
-            if text[0].isupper() and len(current) > 25:
-                cells.append(current.strip())
-                current = text
-            else:
-                current += " " + text
     if current:
-        cells.append(current.strip())
-    # -----------------------------
-    # OUTPUT
-    # -----------------------------
-    # Only the first nine cells are reported.
-    cells = cells[:9]
-    if not cells:
-        return "Aucune désignation détectée."
-    return "\n".join(f"{i+1}. {c}" for i, c in enumerate(cells))
-# -------------------------------------------------
-# INTERFACE GRADIO
-# -------------------------------------------------
 demo = gr.Interface(
-    # One image in (handed to the callback as a PIL image), one text box out.
-    fn=extract_designations,
     inputs=gr.Image(type="pil", label="Image du tableau"),
-    outputs=gr.Textbox(label="Colonne DESIGNATIONS (V7)"),
     title="Extraction fiable de la colonne DESIGNATIONS",
-    description="Règle F majuscule respectée – F6 = continuation (cellule 7)"
 )
-# Serve on every network interface, port 7860.
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 import numpy as np
 from paddleocr import PaddleOCR
+from sklearn.cluster import KMeans
+# Module-level PaddleOCR instance, built once at import time with lang="fr";
+# extract_column2_9_lines() calls its predict() method on every request.
+ocr = PaddleOCR(
+    use_textline_orientation=True,
+    lang="fr"
+)
+# Header text (compared after upper-casing the OCR fragment) that marks the
+# top of the description column.
+HEADER_EXACT = "DESIGNATIONS"
+def extract_column2_9_lines(image):
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
+    result = ocr.predict(img)
+    if not result:
+        return "Aucun texte détecté."
+    data = result[0]
+    texts = data.get("rec_texts", [])
+    boxes = data.get("dt_polys", [])
+    elements = []
     for text, box in zip(texts, boxes):
+        text = text.strip()
+        if len(text) < 2:
+            continue
+        x = np.mean([p[0] for p in box])
         y = np.mean([p[1] for p in box])
+        elements.append((x, y, text))
+    if len(elements) < 5:
+        return "Pas assez de données OCR."
+    # --- CLUSTER COLONNES ---
+    X = np.array([[e[0]] for e in elements])
+    kmeans = KMeans(n_clusters=min(7, len(elements)//6 + 2), random_state=42, n_init=10)
+    labels = kmeans.fit_predict(X)
+    columns = {}
+    for (x, y, t), lbl in zip(elements, labels):
+        columns.setdefault(lbl, []).append((x, y, t))
+    # --- COLONNE DESCRIPTION = max texte non numérique ---
+    def score(col):
+        return sum(len(t) for _,_,t in col if not any(c.isdigit() for c in t))
+    desc_col = max(columns.values(), key=score)
+    desc_col.sort(key=lambda e: e[1])  # top -> bottom
+    # --- LOCALISER L’EN-TÊTE ---
+    header_index = None
+    for i, (_, _, t) in enumerate(desc_col):
+        if t.upper() == HEADER_EXACT:
+            header_index = i
+            break
+    if header_index is None:
+        start_index = 0
+    else:
+        start_index = header_index + 1
+    content = desc_col[start_index:]
+    # --- SEUIL ADAPTATIF ---
+    ys = [y for _,y,_ in content]
+    Y_THRESHOLD = max(22, np.median(np.diff(sorted(ys))) * 1.2) if len(ys) > 1 else 30
+    # --- FUSION ---
+    lines = []
+    current = ""
+    last_y = None
+    for _, y, text in content:
+        if last_y is None or abs(y - last_y) > Y_THRESHOLD:
+            if current:
+                lines.append(current.strip())
+            current = text
         else:
+            current += " " + text
+        last_y = y
     if current:
+        lines.append(current.strip())
+    # --- NETTOYAGE ---
+    final = []
+    for i, l in enumerate(lines):
+        if i == 0:
+            final.append(l)  # Toujours garder la 1ère vraie ligne
+            continue
+        if len(l) < 5:
+            continue
+        if sum(c.isdigit() for c in l) > len(l)/2:
+            continue
+        final.append(l)
+    final = final[:9]
+    return "\n".join([f"{i+1}. {l}" for i,l in enumerate(final)])
+# --- GRADIO ---
+# One image in (handed to the callback as a PIL image), one text box out.
 demo = gr.Interface(
+    fn=extract_column2_9_lines,
     inputs=gr.Image(type="pil", label="Image du tableau"),
+    outputs=gr.Textbox(label="Colonne DESIGNATIONS"),
     title="Extraction fiable de la colonne DESIGNATIONS",
 )
+# Serve on every network interface, port 7860.
 demo.launch(server_name="0.0.0.0", server_port=7860)