Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Community

kebson committed on Dec 30, 2025

Commit

c794a72

verified ·

1 Parent(s): 3cdcef1

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -67

app.py CHANGED Viewed

@@ -3,9 +3,12 @@ import cv2
 import numpy as np
 from paddleocr import PaddleOCR
 from PIL import Image
-# ✅ Configuration la plus compatible (CPU / Hugging Face)
 ocr = PaddleOCR(lang="en")
@@ -13,128 +16,131 @@ def extract_description_column(image: Image.Image):
     if image is None:
         return "❌ Aucune image fournie."
-    # Conversion image
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
-    # OCR
-    result = ocr.ocr(img)
-    if not result or not result[0]:
         return "❌ Aucun texte détecté."
     words = []
-    # 1️⃣ Collecte OCR
-    for item in result[0]:
-        try:
-            box = item[0]
-            text = item[1][0]
-            score = item[1][1]
-        except Exception:
-            continue
-        # Sécurisation du score
-        try:
-            score = float(score)
-        except:
-            score = 1.0
-        if score < 0.5 or not str(text).strip():
-            continue
-        xs = [p[0] for p in box]
-        ys = [p[1] for p in box]
-        words.append({
-            "text": str(text).strip(),
-            "x": min(xs),
-            "y": min(ys),
-            "w": max(xs) - min(xs),
-            "h": max(ys) - min(ys),
-        })
-    # 2️⃣ Détection header "Description"
-    header = next(
-        (w for w in words if "description" in w["text"].lower()),
-        None
-    )
-    if header is None:
-        return "❌ Colonne 'Description' introuvable."
-    # 3️⃣ Zone colonne Description (adaptée facture)
-    x_min = header["x"] - 10
-    x_max = header["x"] + header["w"] + 450
-    y_min = header["y"] + header["h"] + 10
     column_words = [
         w for w in words
-        if x_min <= w["x"] <= x_max and w["y"] > y_min
     ]
     if not column_words:
-        return "⚠️ Aucun contenu détecté sous la colonne Description."
     # 4️⃣ Regroupement par lignes visuelles
     lines = {}
     for w in column_words:
-        key = int(w["y"] // 20)
         lines.setdefault(key, []).append(w)
     ordered_lines = []
-    for k in sorted(lines.keys()):
         line = " ".join(
             w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
         )
         ordered_lines.append(line)
-    # 5️⃣ Nettoyage (prix, VAT, etc.)
     cleaned = []
     for line in ordered_lines:
         low = line.lower()
-        if any(x in low for x in ["vat", "net", "gross", "each", "%"]):
             continue
         if line.replace(".", "").replace(",", "").isdigit():
             continue
         cleaned.append(line)
-    # 6️⃣ Fusion multilignes (cellules)
-    final_cells = []
     buffer = ""
     for line in cleaned:
         if line[:2].replace(".", "").isdigit():
             if buffer:
-                final_cells.append(buffer.strip())
             buffer = line.split(".", 1)[-1].strip()
         else:
             buffer += " " + line
     if buffer:
-        final_cells.append(buffer.strip())
-    # Format affichage
     output = ""
-    for i, cell in enumerate(final_cells, 1):
         output += f"{i}. {cell}\n\n"
     return output.strip()
-# 🎛️ Interface Gradio
 demo = gr.Interface(
     fn=extract_description_column,
-    inputs=gr.Image(type="pil", label="Image de facture / tableau"),
-    outputs=gr.Textbox(lines=18, label="Contenu de la colonne Description"),
-    title="Extraction de la colonne Description (PaddleOCR)",
-    description=(
-        "Upload une image de facture contenant un tableau.\n"
-        "L'application extrait automatiquement tous les éléments "
-        "de la colonne 'Description', cellule par cellule."
-    ),
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import numpy as np
 from paddleocr import PaddleOCR
 from PIL import Image
+import os
+# Sécurité HF
+os.environ["OMP_NUM_THREADS"] = "1"
+os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
 ocr = PaddleOCR(lang="en")
     if image is None:
         return "❌ Aucune image fournie."
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+    result = ocr.ocr(img, cls=False)
+    if not result:
         return "❌ Aucun texte détecté."
     words = []
+    # OCR → blocs normalisés
+    for line in result:
+        for item in line:
+            box, (text, score) = item
+            if not text.strip():
+                continue
+            xs = [p[0] for p in box]
+            ys = [p[1] for p in box]
+            words.append({
+                "text": text.strip(),
+                "x": min(xs),
+                "y": min(ys),
+                "w": max(xs) - min(xs),
+                "h": max(ys) - min(ys),
+            })
+    if not words:
+        return "❌ OCR vide."
+    # ----------------------------
+    # 1️⃣ Détection ligne header (celle avec No / Description / Qty)
+    # ----------------------------
+    header_y = min(
+        w["y"] for w in words
+        if any(k in w["text"].lower() for k in ["no", "qty", "description"])
+    )
+    header_words = [w for w in words if abs(w["y"] - header_y) < 15]
+    header_words = sorted(header_words, key=lambda x: x["x"])
+    if len(header_words) < 3:
+        return "❌ Header du tableau non détecté."
+    # ----------------------------
+    # 2️⃣ Colonne Description = entre No. et Qty
+    # ----------------------------
+    # No. → colonne 1
+    # Description → colonne 2
+    # Qty → colonne 3
+    x_min = header_words[1]["x"] - 10
+    x_max = header_words[2]["x"] - 10
+    # ----------------------------
+    # 3️⃣ Mots sous la colonne
+    # ----------------------------
     column_words = [
         w for w in words
+        if x_min <= w["x"] <= x_max and w["y"] > header_y + 20
     ]
     if not column_words:
+        return "⚠️ Aucun texte trouvé dans la colonne Description."
+    # ----------------------------
     # 4️⃣ Regroupement par lignes visuelles
+    # ----------------------------
     lines = {}
     for w in column_words:
+        key = int(w["y"] // 18)
         lines.setdefault(key, []).append(w)
     ordered_lines = []
+    for k in sorted(lines):
         line = " ".join(
             w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
         )
         ordered_lines.append(line)
+    # ----------------------------
+    # 5️⃣ Nettoyage (prix / VAT / unités)
+    # ----------------------------
     cleaned = []
     for line in ordered_lines:
         low = line.lower()
+        if any(x in low for x in ["vat", "each", "%"]):
             continue
         if line.replace(".", "").replace(",", "").isdigit():
             continue
         cleaned.append(line)
+    # ----------------------------
+    # 6️⃣ Fusion multi-lignes (cellules)
+    # ----------------------------
+    cells = []
     buffer = ""
     for line in cleaned:
         if line[:2].replace(".", "").isdigit():
             if buffer:
+                cells.append(buffer.strip())
             buffer = line.split(".", 1)[-1].strip()
         else:
             buffer += " " + line
     if buffer:
+        cells.append(buffer.strip())
+    # ----------------------------
+    # Résultat final
+    # ----------------------------
     output = ""
+    for i, cell in enumerate(cells, 1):
         output += f"{i}. {cell}\n\n"
     return output.strip()
 demo = gr.Interface(
     fn=extract_description_column,
+    inputs=gr.Image(type="pil", label="Image de facture"),
+    outputs=gr.Textbox(lines=18, label="Colonne Description"),
+    title="Extraction colonne Description – PaddleOCR",
+    description="Extraction robuste de la 2ᵉ colonne (Description) des factures."
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)