Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 30, 2025

Commit

cef6308

verified ·

1 Parent(s): e76b20c

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -76

app.py CHANGED Viewed

@@ -1,3 +1,7 @@
 import gradio as gr
 import cv2
 import numpy as np
@@ -5,7 +9,11 @@ from paddleocr import PaddleOCR
 from PIL import Image
-ocr = PaddleOCR(lang="en")
 def extract_description_column(image: Image.Image):
@@ -14,99 +22,76 @@ def extract_description_column(image: Image.Image):
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
-    result = ocr.ocr(img, cls=False)
-    if not result:
         return "❌ Aucun texte détecté."
     words = []
-    # OCR → blocs normalisés
-    for line in result:
-        for item in line:
-            box, (text, score) = item
-            if not text.strip():
-                continue
-            xs = [p[0] for p in box]
-            ys = [p[1] for p in box]
-            words.append({
-                "text": text.strip(),
-                "x": min(xs),
-                "y": min(ys),
-                "w": max(xs) - min(xs),
-                "h": max(ys) - min(ys),
-            })
-    if not words:
-        return "❌ OCR vide."
-    # ----------------------------
-    # 1️⃣ Détection ligne header (celle avec No / Description / Qty)
-    # ----------------------------
-    header_y = min(
-        w["y"] for w in words
-        if any(k in w["text"].lower() for k in ["no", "qty", "description"])
-    )
-    header_words = [w for w in words if abs(w["y"] - header_y) < 15]
-    header_words = sorted(header_words, key=lambda x: x["x"])
-    if len(header_words) < 3:
-        return "❌ Header du tableau non détecté."
-    # ----------------------------
-    # 2️⃣ Colonne Description = entre No. et Qty
-    # ----------------------------
-    # No. → colonne 1
-    # Description → colonne 2
-    # Qty → colonne 3
-    x_min = header_words[1]["x"] - 10
-    x_max = header_words[2]["x"] - 10
-    # ----------------------------
-    # 3️⃣ Mots sous la colonne
-    # ----------------------------
-    column_words = [
         w for w in words
-        if x_min <= w["x"] <= x_max and w["y"] > header_y + 20
     ]
-    if not column_words:
-        return "⚠️ Aucun texte trouvé dans la colonne Description."
-    # ----------------------------
-    # 4️⃣ Regroupement par lignes visuelles
-    # ----------------------------
     lines = {}
-    for w in column_words:
-        key = int(w["y"] // 18)
         lines.setdefault(key, []).append(w)
     ordered_lines = []
-    for k in sorted(lines):
         line = " ".join(
             w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
         )
         ordered_lines.append(line)
-    # ----------------------------
-    # 5️⃣ Nettoyage (prix / VAT / unités)
-    # ----------------------------
     cleaned = []
     for line in ordered_lines:
         low = line.lower()
-        if any(x in low for x in ["vat", "each", "%"]):
-            continue
-        if line.replace(".", "").replace(",", "").isdigit():
             continue
         cleaned.append(line)
-    # ----------------------------
-    # 6️⃣ Fusion multi-lignes (cellules)
-    # ----------------------------
     cells = []
     buffer = ""
@@ -121,9 +106,7 @@ def extract_description_column(image: Image.Image):
     if buffer:
         cells.append(buffer.strip())
-    # ----------------------------
-    # Résultat final
-    # ----------------------------
     output = ""
     for i, cell in enumerate(cells, 1):
         output += f"{i}. {cell}\n\n"
@@ -135,8 +118,8 @@ demo = gr.Interface(
     fn=extract_description_column,
     inputs=gr.Image(type="pil", label="Image de facture"),
     outputs=gr.Textbox(lines=18, label="Colonne Description"),
-    title="Extraction colonne Description – PaddleOCR",
-    description="Extraction robuste de la 2ᵉ colonne (Description) des factures."
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
 import gradio as gr
 import cv2
 import numpy as np
 from PIL import Image
+ocr = PaddleOCR(
+    lang="en",
+    use_gpu=False,
+    show_log=False
+)
 def extract_description_column(image: Image.Image):
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+    result = ocr.ocr(img)
+    if not result or not result[0]:
         return "❌ Aucun texte détecté."
     words = []
+    # 1️⃣ OCR → mots avec positions
+    for item in result[0]:
+        box, (text, score) = item
+        try:
+            score = float(score)
+        except:
+            score = 1.0
+        if score < 0.4 or not text.strip():
+            continue
+        xs = [p[0] for p in box]
+        ys = [p[1] for p in box]
+        words.append({
+            "text": text.strip(),
+            "x": min(xs),
+            "y": min(ys),
+            "w": max(xs) - min(xs),
+            "h": max(ys) - min(ys),
+        })
+    # 2️⃣ Détection colonnes No / Qty / UM
+    no_col = [w for w in words if w["text"].lower().startswith("no")]
+    qty_col = [w for w in words if "qty" in w["text"].lower()]
+    if not no_col or not qty_col:
+        return "❌ Structure de tableau non reconnue."
+    x_left = min(w["x"] for w in no_col) + 40
+    x_right = min(w["x"] for w in qty_col) - 10
+    y_start = min(w["y"] for w in no_col) + 40
+    # 3️⃣ Extraction zone Description
+    desc_words = [
         w for w in words
+        if x_left <= w["x"] <= x_right and w["y"] > y_start
     ]
+    if not desc_words:
+        return "⚠️ Aucun texte détecté dans la colonne Description."
+    # 4️⃣ Regroupement par lignes
     lines = {}
+    for w in desc_words:
+        key = int(w["y"] // 25)
         lines.setdefault(key, []).append(w)
     ordered_lines = []
+    for k in sorted(lines.keys()):
         line = " ".join(
             w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
         )
         ordered_lines.append(line)
+    # 5️⃣ Nettoyage
     cleaned = []
     for line in ordered_lines:
         low = line.lower()
+        if any(x in low for x in ["each", "vat", "net", "gross", "%"]):
             continue
         cleaned.append(line)
+    # 6️⃣ Fusion cellules multilignes
     cells = []
     buffer = ""
     if buffer:
         cells.append(buffer.strip())
+    # 7️⃣ Format sortie
     output = ""
     for i, cell in enumerate(cells, 1):
         output += f"{i}. {cell}\n\n"
     fn=extract_description_column,
     inputs=gr.Image(type="pil", label="Image de facture"),
     outputs=gr.Textbox(lines=18, label="Colonne Description"),
+    title="Extraction colonne Description – Factures",
+    description="Extraction automatique et robuste de la colonne Description"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)