Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 23, 2025

Commit

9becf7c

verified ·

1 Parent(s): 121216c

Update app.py

Browse files

Files changed (1) hide show

app.py +149 -137

app.py CHANGED Viewed

@@ -1,150 +1,162 @@
 import gradio as gr
 import numpy as np
 from paddleocr import PaddleOCR
-from unidecode import unidecode
-# -----------------------------
-# CONFIG
-# -----------------------------
-TARGET_HEADERS = [
     "designation",
     "designations",
     "description",
     "description des services"
-]
-BLACKLIST = [
-    "prix htva", "prix tva", "prix total",
-    "prix generale", "total general", "tva"
-]
-ocr = PaddleOCR(use_angle_cls=True, lang="fr")
-# -----------------------------
-# UTILS
-# -----------------------------
-def norm(txt):
-    return unidecode(txt.lower().strip())
-def is_blacklisted(txt):
-    t = norm(txt)
-    return any(b in t for b in BLACKLIST)
-def starts_cell(txt):
-    return txt and txt[0].isupper()
-# -----------------------------
-# OCR
-# -----------------------------
-def ocr_extract(image):
-    result = ocr.ocr(image, cls=True)
-    words = []
-    for line in result[0]:
-        box = line[0]
-        text = line[1][0]
         x = np.mean([p[0] for p in box])
         y = np.mean([p[1] for p in box])
-        words.append({
-            "text": text.strip(),
-            "x": x,
-            "y": y
-        })
-    return words
-# -----------------------------
-# GROUP ROWS
-# -----------------------------
-def group_rows(words, tol=18):
-    words = sorted(words, key=lambda w: w["y"])
-    rows = []
-    for w in words:
-        added = False
-        for r in rows:
-            if abs(r[0]["y"] - w["y"]) < tol:
-                r.append(w)
-                added = True
-                break
-        if not added:
-            rows.append([w])
-    return rows
-# -----------------------------
-# COLUMN 2 DETECTION
-# -----------------------------
-def get_col2_x(rows):
-    xs = sorted(w["x"] for r in rows for w in r)
-    cols = []
-    for x in xs:
-        if not cols or abs(cols[-1][0] - x) > 45:
-            cols.append([x])
-        else:
-            cols[-1].append(x)
-    centers = [np.mean(c) for c in cols]
-    return centers[1]  # colonne 2
-# -----------------------------
-# CELL RECONSTRUCTION
-# -----------------------------
-def extract_cells(rows, col2_x):
-    ordered = []
-    for r in rows:
-        candidate = min(r, key=lambda w: abs(w["x"] - col2_x))
-        if abs(candidate["x"] - col2_x) < 65:
-            ordered.append(candidate)
-    ordered = sorted(ordered, key=lambda w: w["y"])
-    cells = []
-    buffer = ""
-    for o in ordered:
-        txt = o["text"]
-        if starts_cell(txt):
-            if buffer:
-                cells.append(buffer.strip())
-            buffer = txt
         else:
-            buffer += " " + txt
-    if buffer:
-        cells.append(buffer.strip())
-    return cells
-# -----------------------------
-# CLEAN
-# -----------------------------
-def clean_cells(cells):
-    return [c for c in cells if not is_blacklisted(c)]
-# -----------------------------
-# PIPELINE
-# -----------------------------
-def extract_column(image):
-    words = ocr_extract(image)
-    rows = group_rows(words)
-    col2_x = get_col2_x(rows)
-    cells = extract_cells(rows, col2_x)
-    cells = clean_cells(cells)
-    return "\n".join(f"{i+1}. {c}" for i, c in enumerate(cells))
-# -----------------------------
-# GRADIO
-# -----------------------------
-with gr.Blocks() as demo:
-    gr.Markdown("## Extraction fiable de la colonne 2 (PaddleOCR)")
-    img = gr.Image(type="filepath", label="Image du tableau")
-    out = gr.Textbox(label="Contenu colonne 2", lines=15)
-    btn = gr.Button("Extraire")
-    btn.click(fn=extract_column, inputs=img, outputs=out)
-demo.launch()

 import gradio as gr
 import numpy as np
+import unicodedata
 from paddleocr import PaddleOCR
+from sklearn.cluster import KMeans
+# -------------------------------------------------
+# OCR engine: module-level PaddleOCR instance used by extract_second_column
+# -------------------------------------------------
+ocr = PaddleOCR(
+    lang="fr",  # recognition language: French
+    use_textline_orientation=True  # NOTE(review): presumably the successor of the former use_angle_cls flag — confirm against the PaddleOCR docs
+)
+# -------------------------------------------------
+# Text normalization: returns a lowercase, accent-free, single-spaced copy of the input
+# -------------------------------------------------
+def normalize(text: str) -> str:
+    text = text.lower()
+    text = unicodedata.normalize("NFD", text)  # decompose accented letters into base letter + combining mark
+    text = "".join(c for c in text if unicodedata.category(c) != "Mn")  # drop the combining marks (Unicode category Mn)
+    return " ".join(text.split())  # collapse whitespace runs to one space and trim both ends
+# -------------------------------------------------
+# Valid titles for column 2
+# -------------------------------------------------
+COL_TITLES = {  # compared against normalize() output, hence lowercase and unaccented
     "designation",
     "designations",
     "description",
     "description des services"
+}
+# -------------------------------------------------
+# Words / lines to ignore
+# -------------------------------------------------
+IGNORE_KEYWORDS = {  # a column block whose normalized text matches one of these is skipped by extract_second_column
+    "prix", "total", "ht", "htva", "tva",
+    "ttc", "general", "generale"
+}
+# -------------------------------------------------
+# Fonction principale
+# -------------------------------------------------
+def extract_second_column(image):
+    if image is None:
+        return "Aucune image fournie."
+    img = np.array(image)
+    result = ocr.predict(img)
+    if not result:
+        return "OCR : aucun texte détecté."
+    data = result[0]
+    texts = data.get("rec_texts", [])
+    boxes = data.get("dt_polys", [])
+    blocks = []
+    for text, box in zip(texts, boxes):
+        t = text.strip()
+        if len(t) < 2:
+            continue
         x = np.mean([p[0] for p in box])
         y = np.mean([p[1] for p in box])
+        blocks.append((t, x, y))
+    if len(blocks) < 5:
+        return "Pas assez de texte exploitable."
+    # -------------------------------------------------
+    # 1. Détection du X de la colonne cible via son titre
+    # -------------------------------------------------
+    col_x = None
+    for text, x, y in blocks:
+        if normalize(text) in COL_TITLES:
+            col_x = x
+            break
+    if col_x is None:
+        return "Titre de la colonne cible non détecté."
+    # -------------------------------------------------
+    # 2. Sélection des blocs proches du X détecté
+    # -------------------------------------------------
+    X_THRESHOLD = 45
+    column_blocks = [
+        (t, x, y) for t, x, y in blocks
+        if abs(x - col_x) < X_THRESHOLD
+    ]
+    if not column_blocks:
+        return "Colonne détectée mais vide."
+    # -------------------------------------------------
+    # 3. Tri vertical (haut → bas)
+    # -------------------------------------------------
+    column_blocks.sort(key=lambda e: e[2])
+    # -------------------------------------------------
+    # 4. Fusion intelligente des lignes OCR
+    # -------------------------------------------------
+    merged = []
+    current = ""
+    last_y = None
+    Y_THRESHOLD = 22
+    for text, x, y in column_blocks:
+        nt = normalize(text)
+        if any(k in nt for k in IGNORE_KEYWORDS):
+            continue
+        if last_y is None or abs(y - last_y) > Y_THRESHOLD:
+            if current:
+                merged.append(current.strip())
+            current = text
         else:
+            current += " " + text
+        last_y = y
+    if current:
+        merged.append(current.strip())
+    # -------------------------------------------------
+    # 5. Nettoyage final (cellules texte uniquement)
+    # -------------------------------------------------
+    final = []
+    for line in merged:
+        nt = normalize(line)
+        if len(nt) < 4:
+            continue
+        if sum(c.isdigit() for c in line) > len(line) / 2:
+            continue
+        final.append(line)
+    if not final:
+        return "Aucune cellule texte valide trouvée."
+    # -------------------------------------------------
+    # 6. Résultat numéroté
+    # -------------------------------------------------
+    return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
+# -------------------------------------------------
+# Gradio interface
+# -------------------------------------------------
+demo = gr.Interface(
+    fn=extract_second_column,  # called with the submitted image; its return value fills the textbox
+    inputs=gr.Image(type="pil", label="Image du tableau"),  # type="pil": the image is passed to fn as a PIL image
+    outputs=gr.Textbox(label="Contenu de la colonne 2"),
+    title="Extraction fiable de la colonne 2 (Désignation / Description)",
+    description=(
+        "Extraction robuste de la deuxième colonne des tableaux scannés "
+        "(Désignation, DESIGNATIONS, Description, Description des services)."
+    )
+)
+demo.launch(server_name="0.0.0.0", server_port=7860)  # listen on all interfaces, port 7860