Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 29, 2025

Commit

a6d8f18

verified ·

1 Parent(s): 4921bf2

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -17

app.py CHANGED Viewed

@@ -4,11 +4,22 @@ import cv2
 import pytesseract
 import numpy as np
-pytesseract.pytesseract.tesseract_cmd = "tesseract"
 def extract_descriptions(image: Image.Image):
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     data = pytesseract.image_to_data(
         img,
         output_type=pytesseract.Output.DICT,
@@ -24,42 +35,88 @@ def extract_descriptions(image: Image.Image):
                 "x": data["left"][i],
                 "y": data["top"][i],
                 "w": data["width"][i],
-                "h": data["height"][i],
             })
-    header = next((w for w in words if w["text"].lower() == "description"), None)
-    if not header:
-        return "Colonne 'Description' non détectée"
     x_min = header["x"] - 10
     x_max = header["x"] + header["w"] + 350
     y_min = header["y"] + header["h"] + 10
-    col_words = [
         w for w in words
         if x_min <= w["x"] <= x_max and w["y"] > y_min
     ]
     lines = {}
-    for w in col_words:
         key = w["y"] // 15
         lines.setdefault(key, []).append(w)
-    results = []
-    for k in sorted(lines):
-        line = " ".join(w["text"] for w in sorted(lines[k], key=lambda x: x["x"]))
-        if any(x in line.lower() for x in ["vat", "gross", "net", "each"]):
             continue
-        results.append(line)
-    return "\n".join(results)
 demo = gr.Interface(
     fn=extract_descriptions,
-    inputs=gr.Image(type="pil"),
-    outputs=gr.Textbox(lines=20),
-    title="Extraction colonne Description – Factures"
 )
-demo.launch(server_name="0.0.0.0", server_port=7860)

 import pytesseract
 import numpy as np
+# 🔴 IMPORTANT : chemin ABSOLU vers tesseract (HF Docker)
+pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 def extract_descriptions(image: Image.Image):
+    """
+    Extrait uniquement le contenu de la colonne 'Description'
+    depuis une image de facture (tableau).
+    """
+    if image is None:
+        return "Aucune image fournie."
+    # Conversion PIL -> OpenCV
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+    # OCR avec positions
     data = pytesseract.image_to_data(
         img,
         output_type=pytesseract.Output.DICT,
                 "x": data["left"][i],
                 "y": data["top"][i],
                 "w": data["width"][i],
+                "h": data["height"][i]
             })
+    # 1️⃣ Détection de l'en-tête "Description"
+    header = next(
+        (w for w in words if w["text"].lower() == "description"),
+        None
+    )
+    if header is None:
+        return "❌ Colonne 'Description' non détectée."
+    # 2️⃣ Définition de la zone de la colonne Description
     x_min = header["x"] - 10
     x_max = header["x"] + header["w"] + 350
     y_min = header["y"] + header["h"] + 10
+    # 3️⃣ Filtrage des mots dans cette colonne
+    column_words = [
         w for w in words
         if x_min <= w["x"] <= x_max and w["y"] > y_min
     ]
+    # 4️⃣ Regroupement par lignes (Y proche)
     lines = {}
+    for w in column_words:
         key = w["y"] // 15
         lines.setdefault(key, []).append(w)
+    extracted_lines = []
+    for key in sorted(lines.keys()):
+        line_words = sorted(lines[key], key=lambda x: x["x"])
+        line_text = " ".join(w["text"] for w in line_words)
+        # Filtrage des éléments non désirés
+        if any(k in line_text.lower() for k in ["vat", "gross", "net", "each"]):
+            continue
+        if line_text.replace(".", "").replace(",", "").isdigit():
             continue
+        extracted_lines.append(line_text)
+    # 5️⃣ Fusion des cellules multilignes
+    final_descriptions = []
+    buffer = ""
+    for line in extracted_lines:
+        # Détection de début de nouvelle ligne de cellule (ex: "1.")
+        if line[:2].replace(".", "").isdigit():
+            if buffer:
+                final_descriptions.append(buffer.strip())
+            buffer = line.split(".", 1)[-1].strip()
+        else:
+            buffer += " " + line
+    if buffer:
+        final_descriptions.append(buffer.strip())
+    # Résultat final
+    if not final_descriptions:
+        return "⚠️ Aucun contenu détecté dans la colonne Description."
+    return "\n".join(final_descriptions)
+# =========================
+# Interface Gradio
+# =========================
 demo = gr.Interface(
     fn=extract_descriptions,
+    inputs=gr.Image(type="pil", label="Image de facture"),
+    outputs=gr.Textbox(lines=20, label="Descriptions extraites"),
+    title="Extraction de la colonne Description (Factures)",
+    description=(
+        "Charge une image de facture contenant un tableau "
+        "et récupère uniquement le contenu de la colonne 'Description', "
+        "cellule par cellule."
+    )
 )
+demo.launch(
+    server_name="0.0.0.0",
+    server_port=7860
+)