Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Community

kebson committed on Dec 29, 2025

Commit

4acaff2

verified ·

1 Parent(s): 3540c20

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -35

app.py CHANGED Viewed

@@ -4,11 +4,10 @@ import numpy as np
 from PIL import Image
 from paddleocr import PaddleOCR
-# Initialisation OCR (CPU, stable HF)
 ocr = PaddleOCR(
     use_angle_cls=True,
-    lang="en"
 )
@@ -18,27 +17,42 @@ def extract_descriptions(image: Image.Image):
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
-    # OCR Paddle
     result = ocr.ocr(img)
     words = []
     for line in result[0]:
-        box, (text, score) = line
-        if score < 0.5:
             continue
-        x_coords = [p[0] for p in box]
-        y_coords = [p[1] for p in box]
         words.append({
             "text": text.strip(),
-            "x": min(x_coords),
-            "y": min(y_coords),
-            "w": max(x_coords) - min(x_coords),
-            "h": max(y_coords) - min(y_coords)
         })
-    # 1️⃣ Détecter l'en-tête "Description"
     header = next(
         (w for w in words if "description" in w["text"].lower()),
         None
@@ -57,7 +71,7 @@ def extract_descriptions(image: Image.Image):
         if x_min <= w["x"] <= x_max and w["y"] > y_min
     ]
-    # 3️⃣ Grouper par lignes
     lines = {}
     for w in column_words:
         key = int(w["y"] // 18)
@@ -69,7 +83,6 @@ def extract_descriptions(image: Image.Image):
             w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
         )
-        # Filtrage facture
         low = line.lower()
         if any(x in low for x in ["vat", "gross", "net", "total", "each"]):
             continue
@@ -78,7 +91,7 @@ def extract_descriptions(image: Image.Image):
         raw_lines.append(line)
-    # 4️⃣ Fusion cellules multilignes
     final = []
     buffer = ""
@@ -93,28 +106,14 @@ def extract_descriptions(image: Image.Image):
     if buffer:
         final.append(buffer.strip())
-    if not final:
-        return "⚠️ Aucun texte extrait."
-    return "\n".join(final)
-# =========================
-# Interface Gradio
-# =========================
 demo = gr.Interface(
     fn=extract_descriptions,
-    inputs=gr.Image(type="pil", label="Image de facture"),
-    outputs=gr.Textbox(lines=20, label="Descriptions extraites"),
-    title="Extraction colonne Description – PaddleOCR",
-    description=(
-        "OCR robuste basé sur PaddleOCR. "
-        "Extraction automatique des cellules de la colonne Description."
-    )
 )
-demo.launch(
-    server_name="0.0.0.0",
-    server_port=7860
-)

 from PIL import Image
 from paddleocr import PaddleOCR
 ocr = PaddleOCR(
     use_angle_cls=True,
+    lang="en",
+    use_gpu=False
 )
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     result = ocr.ocr(img)
     words = []
+    # 🔴 PARSING ROBUSTE PaddleOCR
     for line in result[0]:
+        # Cas 1 : [box, (text, score)]
+        if len(line) >= 2 and isinstance(line[1], (list, tuple)):
+            box = line[0]
+            text = line[1][0]
+            score = line[1][1]
+        # Cas 2 : [box, text, score]
+        elif len(line) >= 3:
+            box = line[0]
+            text = line[1]
+            score = line[2]
+        else:
+            continue
+        if score < 0.5 or not text.strip():
             continue
+        xs = [p[0] for p in box]
+        ys = [p[1] for p in box]
         words.append({
             "text": text.strip(),
+            "x": min(xs),
+            "y": min(ys),
+            "w": max(xs) - min(xs),
+            "h": max(ys) - min(ys),
         })
+    # 1️⃣ Détecter la colonne Description
     header = next(
         (w for w in words if "description" in w["text"].lower()),
         None
         if x_min <= w["x"] <= x_max and w["y"] > y_min
     ]
+    # 3️⃣ Regroupement par lignes
     lines = {}
     for w in column_words:
         key = int(w["y"] // 18)
             w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
         )
         low = line.lower()
         if any(x in low for x in ["vat", "gross", "net", "total", "each"]):
             continue
         raw_lines.append(line)
+    # 4️⃣ Fusion multilignes
     final = []
     buffer = ""
     if buffer:
         final.append(buffer.strip())
+    return "\n".join(final) if final else "⚠️ Aucun texte extrait."
 demo = gr.Interface(
     fn=extract_descriptions,
+    inputs=gr.Image(type="pil"),
+    outputs=gr.Textbox(lines=20),
+    title="Extraction colonne Description – PaddleOCR (Stable)"
 )
+demo.launch(server_name="0.0.0.0", server_port=7860)