Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 29, 2025

Commit

68c9a14

verified ·

1 Parent(s): 8c6b76a

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -124

app.py CHANGED Viewed

@@ -1,139 +1,65 @@
 import gradio as gr
 import numpy as np
-from paddleocr import PaddleOCR
-from sklearn.cluster import KMeans
-# -----------------------------
-# OCR
-# -----------------------------
-ocr = PaddleOCR(
-    use_textline_orientation=True,
-    lang="fr"
-)
-# -----------------------------
-# Fonction principale
-# -----------------------------
-def extract_column2_9_lines(image):
-    if image is None:
-        return "Aucune image fournie."
-    img = np.array(image)
-    result = ocr.predict(img)
-    if not result or len(result) == 0:
-        return "OCR exécuté mais aucun texte détecté."
-    data = result[0]
-    texts = data.get("rec_texts", [])
-    boxes = data.get("dt_polys", [])
-    if not texts:
-        return "Aucun texte exploitable détecté."
-    # -----------------------------
-    # 1. Collecte OCR
-    # -----------------------------
-    elements = []
-    for text, box in zip(texts, boxes):
-        text = text.strip()
-        if len(text) < 3:
-            continue
-        x_center = np.mean([p[0] for p in box])
-        y_center = np.mean([p[1] for p in box])
-        elements.append((x_center, y_center, text))
-    if len(elements) < 5:
-        return "Pas assez de texte détecté."
-    # -----------------------------
-    # 2. Clustering horizontal ADAPTATIF
-    # -----------------------------
-    X = np.array([[e[0]] for e in elements])
-    n_clusters = min(8, max(3, len(elements) // 8))
-    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
-    labels = kmeans.fit_predict(X)
-    columns = {}
-    for (x, y, text), label in zip(elements, labels):
-        columns.setdefault(label, []).append((x, y, text))
-    # -----------------------------
-    # 3. Choisir la colonne "Description"
-    # => la plus riche en texte non numérique
-    # -----------------------------
-    def column_score(col):
-        score = 0
-        for _, _, t in col:
-            if not any(char.isdigit() for char in t):
-                score += len(t)
-        return score
-    best_column = max(columns.values(), key=column_score)
-    # Tri vertical
-    best_column.sort(key=lambda e: e[1])
-    # -----------------------------
-    # 4. Fusion intelligente des lignes
-    # -----------------------------
-    merged_lines = []
-    current_text = ""
-    last_y = None
-    Y_THRESHOLD = 22
-    blacklist = (
-        "DESIGNATION", "UNITE", "QUANT", "PRIX", "TOTAL",
-        "LOT", "BORDEREAU", "DATE", "NB", "TTC", "HT"
     )
-    for _, y, text in best_column:
-        if text.upper().startswith(blacklist):
-            continue
-        if last_y is None or abs(y - last_y) > Y_THRESHOLD:
-            if current_text:
-                merged_lines.append(current_text.strip())
-            current_text = text
-        else:
-            current_text += " " + text
-        last_y = y
-    if current_text:
-        merged_lines.append(current_text.strip())
-    # -----------------------------
-    # 5. Nettoyage final
-    # -----------------------------
-    cleaned = []
-    for line in merged_lines:
-        if len(line) < 5:
-            continue
-        if sum(c.isdigit() for c in line) > len(line) / 2:
             continue
-        cleaned.append(line)
-    final_lines = cleaned[:9]
-    if not final_lines:
-        return "Colonne détectée mais contenu non exploitable."
-    # Numérotation demandée
-    return "\n".join([f"{i+1}. {l}" for i, l in enumerate(final_lines)])
-# -----------------------------
-# Interface Gradio
-# -----------------------------
 demo = gr.Interface(
-    fn=extract_column2_9_lines,
-    inputs=gr.Image(type="pil", label="Image du tableau"),
-    outputs=gr.Textbox(label="Colonne Description (9 lignes)"),
-    title="Extraction robuste de la colonne Description",
-    description="Optimisé pour tableaux photographiés (devis, factures, bordereaux)"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
+from PIL import Image
+import cv2
+import pytesseract
 import numpy as np
+pytesseract.pytesseract.tesseract_cmd = "tesseract"
# Layout heuristics, all expressed in pixels of the input image.
_HEADER_LEFT_MARGIN = 10    # slack accepted to the left of the header word
_COLUMN_EXTRA_WIDTH = 350   # how far the column reaches past the header word
_HEADER_BOTTOM_GAP = 10     # rows must start at least this far below the header
_LINE_TOLERANCE = 15        # vertical spread allowed between words of one line

# A row containing one of these whole words is dropped from the output.
_SKIPPED_KEYWORDS = frozenset({"vat", "gross", "net", "each"})


def _group_into_lines(words, tolerance=_LINE_TOLERANCE):
    """Cluster word boxes into lines (top to bottom), each sorted left to right."""
    lines = []
    anchor_y = None
    for word in sorted(words, key=lambda w: (w["y"], w["x"])):
        # A line is anchored on its topmost word. Fixed buckets (y // 15)
        # would split a single printed line whenever the word tops happen to
        # straddle a bucket boundary.
        if anchor_y is None or word["y"] - anchor_y >= tolerance:
            lines.append([])
            anchor_y = word["y"]
        lines[-1].append(word)
    return [sorted(line, key=lambda w: w["x"]) for line in lines]


def _has_skipped_keyword(line):
    """Return True when *line* contains a skipped keyword as a whole word."""
    # Whole-word comparison: a plain substring test would also reject
    # descriptions such as "cabinet" ("net") or "bleach" ("each").
    tokens = "".join(ch if ch.isalnum() else " " for ch in line.lower()).split()
    return not _SKIPPED_KEYWORDS.isdisjoint(tokens)


def extract_descriptions(image: "Image.Image | None"):
    """Extract the text found under the "Description" header of a table image.

    The image is OCR'd word by word with Tesseract. The word "description"
    (case-insensitive) is used as the column header; words located below it
    and inside a fixed horizontal window are grouped into lines.

    Args:
        image: PIL image of the document, or None when no image was supplied.

    Returns:
        The kept lines joined with newlines (possibly an empty string), or a
        French user-facing message when the image is missing or the header
        word is not found.
    """
    if image is None:
        # Without this guard a missing image crashes in the array conversion.
        return "Aucune image fournie."

    # Force three channels so the colour conversion never sees a grayscale
    # or palette image.
    img = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
    data = pytesseract.image_to_data(
        img,
        output_type=pytesseract.Output.DICT,
        config="--psm 6",
    )

    # Keep only non-empty OCR tokens together with their bounding boxes.
    stripped_texts = (text.strip() for text in data["text"])
    words = [
        {"text": text, "x": x, "y": y, "w": w, "h": h}
        for text, x, y, w, h in zip(
            stripped_texts,
            data["left"],
            data["top"],
            data["width"],
            data["height"],
        )
        if text
    ]

    header = next((w for w in words if w["text"].lower() == "description"), None)
    if header is None:
        return "Colonne 'Description' non détectée"

    # Window of the column: anchored on the header word, strictly below it.
    x_min = header["x"] - _HEADER_LEFT_MARGIN
    x_max = header["x"] + header["w"] + _COLUMN_EXTRA_WIDTH
    y_min = header["y"] + header["h"] + _HEADER_BOTTOM_GAP
    col_words = [
        w for w in words
        if x_min <= w["x"] <= x_max and w["y"] > y_min
    ]

    results = []
    for line_words in _group_into_lines(col_words):
        line = " ".join(w["text"] for w in line_words)
        if _has_skipped_keyword(line):
            continue
        results.append(line)
    return "\n".join(results)
 demo = gr.Interface(
+    fn=extract_descriptions,
+    inputs=gr.Image(type="pil"),
+    outputs=gr.Textbox(lines=20),
+    title="Extraction colonne Description – Factures"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)