Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 29, 2025

Commit

f7bee90

verified ·

1 Parent(s): 8d4694b

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -44

app.py CHANGED Viewed

@@ -1,43 +1,38 @@
 import gradio as gr
 import cv2
 import numpy as np
-from PIL import Image
 from paddleocr import PaddleOCR
-ocr = PaddleOCR(
-    use_angle_cls=True,
-    lang="en"
-)
-def extract_descriptions(image: Image.Image):
     if image is None:
-        return "Aucune image fournie."
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     result = ocr.ocr(img)
     words = []
-    for line in result[0]:
-        text, score, box = None, None, None
-        # Parsing défensif PaddleOCR
-        if isinstance(line, (list, tuple)):
-            if len(line) >= 2 and isinstance(line[1], (list, tuple)):
-                box = line[0]
-                text = line[1][0]
-                score = line[1][1]
-            elif len(line) >= 3:
-                box = line[0]
-                text = line[1]
-                score = line[2]
-        if box is None or text is None:
             continue
-        # 🔒 Sécurisation du score
         try:
             score = float(score)
         except:
@@ -57,18 +52,18 @@ def extract_descriptions(image: Image.Image):
             "h": max(ys) - min(ys),
         })
-    # 1️⃣ Détecter la colonne Description
     header = next(
         (w for w in words if "description" in w["text"].lower()),
         None
     )
     if header is None:
-        return "❌ Colonne 'Description' non détectée."
-    # 2️⃣ Zone de la colonne
-    x_min = header["x"] - 15
-    x_max = header["x"] + header["w"] + 380
     y_min = header["y"] + header["h"] + 10
     column_words = [
@@ -76,49 +71,70 @@ def extract_descriptions(image: Image.Image):
         if x_min <= w["x"] <= x_max and w["y"] > y_min
     ]
-    # 3️⃣ Regroupement par lignes
     lines = {}
     for w in column_words:
-        key = int(w["y"] // 18)
         lines.setdefault(key, []).append(w)
-    raw_lines = []
-    for k in sorted(lines):
         line = " ".join(
             w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
         )
         low = line.lower()
-        if any(x in low for x in ["vat", "gross", "net", "total", "each"]):
             continue
         if line.replace(".", "").replace(",", "").isdigit():
             continue
-        raw_lines.append(line)
-    # 4️⃣ Fusion multilignes
-    final = []
     buffer = ""
-    for line in raw_lines:
         if line[:2].replace(".", "").isdigit():
             if buffer:
-                final.append(buffer.strip())
             buffer = line.split(".", 1)[-1].strip()
         else:
             buffer += " " + line
     if buffer:
-        final.append(buffer.strip())
-    return "\n".join(final) if final else "⚠️ Aucun texte extrait."
 demo = gr.Interface(
-    fn=extract_descriptions,
-    inputs=gr.Image(type="pil"),
-    outputs=gr.Textbox(lines=20),
-    title="Extraction colonne Description – PaddleOCR (Production Safe)"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 import cv2
 import numpy as np
 from paddleocr import PaddleOCR
+from PIL import Image
+# ✅ Most broadly compatible configuration (CPU / Hugging Face)
+ocr = PaddleOCR(lang="en")
+def extract_description_column(image: Image.Image):
     if image is None:
+        return "❌ Aucune image fournie."
+    # Conversion image
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+    # OCR
     result = ocr.ocr(img)
+    if not result or not result[0]:
+        return "❌ Aucun texte détecté."
     words = []
+    # 1️⃣ Collecte OCR
+    for item in result[0]:
+        try:
+            box = item[0]
+            text = item[1][0]
+            score = item[1][1]
+        except Exception:
             continue
+        # Sécurisation du score
         try:
             score = float(score)
         except:
             "h": max(ys) - min(ys),
         })
+    # 2️⃣ Détection header "Description"
     header = next(
         (w for w in words if "description" in w["text"].lower()),
         None
     )
     if header is None:
+        return "❌ Colonne 'Description' introuvable."
+    # 3️⃣ Zone colonne Description (adaptée facture)
+    x_min = header["x"] - 10
+    x_max = header["x"] + header["w"] + 450
     y_min = header["y"] + header["h"] + 10
     column_words = [
         if x_min <= w["x"] <= x_max and w["y"] > y_min
     ]
+    if not column_words:
+        return "⚠️ Aucun contenu détecté sous la colonne Description."
+    # 4️⃣ Regroupement par lignes visuelles
     lines = {}
     for w in column_words:
+        key = int(w["y"] // 20)
         lines.setdefault(key, []).append(w)
+    ordered_lines = []
+    for k in sorted(lines.keys()):
         line = " ".join(
             w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
         )
+        ordered_lines.append(line)
+    # 5️⃣ Nettoyage (prix, VAT, etc.)
+    cleaned = []
+    for line in ordered_lines:
         low = line.lower()
+        if any(x in low for x in ["vat", "net", "gross", "each", "%"]):
             continue
         if line.replace(".", "").replace(",", "").isdigit():
             continue
+        cleaned.append(line)
+    # 6️⃣ Fusion multilignes (cellules)
+    final_cells = []
     buffer = ""
+    for line in cleaned:
         if line[:2].replace(".", "").isdigit():
             if buffer:
+                final_cells.append(buffer.strip())
             buffer = line.split(".", 1)[-1].strip()
         else:
             buffer += " " + line
     if buffer:
+        final_cells.append(buffer.strip())
+    # Format affichage
+    output = ""
+    for i, cell in enumerate(final_cells, 1):
+        output += f"{i}. {cell}\n\n"
+    return output.strip()
+# 🎛️ Gradio interface: takes a PIL image and shows the text returned by extract_description_column in a textbox
 demo = gr.Interface(
+    fn=extract_description_column,
+    inputs=gr.Image(type="pil", label="Image de facture / tableau"),
+    outputs=gr.Textbox(lines=18, label="Contenu de la colonne Description"),
+    title="Extraction de la colonne Description (PaddleOCR)",
+    description=(
+        "Upload une image de facture contenant un tableau.\n"
+        "L'application extrait automatiquement tous les éléments "
+        "de la colonne 'Description', cellule par cellule."
+    ),
+    allow_flagging="never"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)  # bind to all network interfaces on port 7860