Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 30, 2025

Commit

9bbca4f

verified ·

1 Parent(s): ec72508

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -29

app.py CHANGED Viewed

@@ -9,9 +9,7 @@ from paddleocr import PaddleOCR
 from PIL import Image
-ocr = PaddleOCR(
-    lang="en"
-)
 def extract_description_column(image: Image.Image):
@@ -19,14 +17,14 @@ def extract_description_column(image: Image.Image):
         return "❌ Aucune image fournie."
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     result = ocr.ocr(img)
     if not result or not result[0]:
         return "❌ Aucun texte détecté."
     words = []
-    # 1️⃣ OCR → mots avec positions
     for item in result[0]:
         box, (text, score) = item
         try:
@@ -48,29 +46,44 @@ def extract_description_column(image: Image.Image):
             "h": max(ys) - min(ys),
         })
-    # 2️⃣ Détection colonnes No / Qty / UM
-    no_col = [w for w in words if w["text"].lower().startswith("no")]
-    qty_col = [w for w in words if "qty" in w["text"].lower()]
-    if not no_col or not qty_col:
-        return "❌ Structure de tableau non reconnue."
-    x_left = min(w["x"] for w in no_col) + 40
-    x_right = min(w["x"] for w in qty_col) - 10
-    y_start = min(w["y"] for w in no_col) + 40
-    # 3️⃣ Extraction zone Description
-    desc_words = [
-        w for w in words
-        if x_left <= w["x"] <= x_right and w["y"] > y_start
-    ]
-    if not desc_words:
-        return "⚠️ Aucun texte détecté dans la colonne Description."
-    # 4️⃣ Regroupement par lignes
     lines = {}
-    for w in desc_words:
         key = int(w["y"] // 25)
         lines.setdefault(key, []).append(w)
@@ -81,15 +94,15 @@ def extract_description_column(image: Image.Image):
         )
         ordered_lines.append(line)
-    # 5️⃣ Nettoyage
     cleaned = []
     for line in ordered_lines:
         low = line.lower()
-        if any(x in low for x in ["each", "vat", "net", "gross", "%"]):
             continue
         cleaned.append(line)
-    # 6️⃣ Fusion cellules multilignes
     cells = []
     buffer = ""
@@ -104,7 +117,7 @@ def extract_description_column(image: Image.Image):
     if buffer:
         cells.append(buffer.strip())
-    # 7️⃣ Format sortie
     output = ""
     for i, cell in enumerate(cells, 1):
         output += f"{i}. {cell}\n\n"
@@ -115,9 +128,9 @@ def extract_description_column(image: Image.Image):
 demo = gr.Interface(
     fn=extract_description_column,
     inputs=gr.Image(type="pil", label="Image de facture"),
-    outputs=gr.Textbox(lines=18, label="Colonne Description"),
-    title="Extraction colonne Description – Factures",
-    description="Extraction automatique et robuste de la colonne Description"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

 from PIL import Image
+ocr = PaddleOCR(lang="en", use_gpu=False, show_log=False)
 def extract_description_column(image: Image.Image):
         return "❌ Aucune image fournie."
     img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     result = ocr.ocr(img)
     if not result or not result[0]:
         return "❌ Aucun texte détecté."
     words = []
+    # 1️⃣ OCR words
     for item in result[0]:
         box, (text, score) = item
         try:
             "h": max(ys) - min(ys),
         })
+    # 2️⃣ Trouver le début du tableau ("ITEMS")
+    table_start_y = None
+    for w in words:
+        if "item" in w["text"].lower():
+            table_start_y = w["y"]
+            break
+    if table_start_y is None:
+        table_start_y = 0  # fallback
+    table_words = [w for w in words if w["y"] > table_start_y + 30]
+    # 3️⃣ Regrouper par colonnes X
+    columns = {}
+    for w in table_words:
+        col_key = int(w["x"] // 50)
+        columns.setdefault(col_key, []).append(w)
+    # 4️⃣ Identifier la colonne Description
+    best_col = None
+    best_score = 0
+    for col in columns.values():
+        text_len = sum(len(w["text"]) for w in col)
+        numeric_ratio = sum(any(c.isdigit() for c in w["text"]) for w in col) / max(len(col), 1)
+        score = text_len * (1 - numeric_ratio)
+        if score > best_score:
+            best_score = score
+            best_col = col
+    if best_col is None:
+        return "❌ Impossible d’identifier la colonne Description."
+    # 5️⃣ Regrouper par lignes
     lines = {}
+    for w in best_col:
         key = int(w["y"] // 25)
         lines.setdefault(key, []).append(w)
         )
         ordered_lines.append(line)
+    # 6️⃣ Nettoyage
     cleaned = []
     for line in ordered_lines:
         low = line.lower()
+        if any(x in low for x in ["vat", "net", "gross", "each", "%"]):
             continue
         cleaned.append(line)
+    # 7️⃣ Fusion multilignes
     cells = []
     buffer = ""
     if buffer:
         cells.append(buffer.strip())
+    # 8️⃣ Sortie
     output = ""
     for i, cell in enumerate(cells, 1):
         output += f"{i}. {cell}\n\n"
 demo = gr.Interface(
     fn=extract_description_column,
     inputs=gr.Image(type="pil", label="Image de facture"),
+    outputs=gr.Textbox(lines=20, label="Colonne Description"),
+    title="Extraction robuste de la colonne Description",
+    description="Fonctionne sans dépendre des headers OCR"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)