kebson committed on
Commit
fe5b596
·
verified ·
1 Parent(s): 3a9c77b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -103
app.py CHANGED
@@ -1,136 +1,139 @@
1
- import os
2
- os.environ["OMP_NUM_THREADS"] = "1"
3
- os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
4
-
5
  import gradio as gr
 
6
  import cv2
 
7
  import numpy as np
8
- from paddleocr import PaddleOCR
9
  from PIL import Image
 
10
 
 
 
 
11
 
12
- ocr = PaddleOCR(lang="en")
13
-
14
 
15
- def extract_description_column(image: Image.Image):
16
- if image is None:
17
- return "❌ Aucune image fournie."
 
 
 
18
 
19
- img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
20
- result = ocr.ocr(img)
 
 
 
 
21
 
22
- if not result or not result[0]:
23
- return "❌ Aucun texte détecté."
 
24
 
25
- words = []
 
 
 
26
 
27
- # 1️⃣ OCR words
28
- for item in result[0]:
29
- box, (text, score) = item
30
- try:
31
- score = float(score)
32
- except:
33
- score = 1.0
34
 
35
- if score < 0.4 or not text.strip():
36
- continue
37
 
38
- xs = [p[0] for p in box]
39
- ys = [p[1] for p in box]
40
 
41
- words.append({
42
- "text": text.strip(),
43
- "x": min(xs),
44
- "y": min(ys),
45
- "w": max(xs) - min(xs),
46
- "h": max(ys) - min(ys),
47
- })
48
 
49
- # 2️⃣ Trouver le début du tableau ("ITEMS")
50
- table_start_y = None
51
- for w in words:
52
- if "item" in w["text"].lower():
53
- table_start_y = w["y"]
54
- break
55
 
56
- if table_start_y is None:
57
- table_start_y = 0 # fallback
 
 
58
 
59
- table_words = [w for w in words if w["y"] > table_start_y + 30]
 
60
 
61
- # 3️⃣ Regrouper par colonnes X
62
- columns = {}
63
- for w in table_words:
64
- col_key = int(w["x"] // 50)
65
- columns.setdefault(col_key, []).append(w)
66
 
67
- # 4️⃣ Identifier la colonne Description
68
- best_col = None
69
- best_score = 0
70
 
71
- for col in columns.values():
72
- text_len = sum(len(w["text"]) for w in col)
73
- numeric_ratio = sum(any(c.isdigit() for c in w["text"]) for w in col) / max(len(col), 1)
 
 
74
 
75
- score = text_len * (1 - numeric_ratio)
 
 
 
 
76
 
77
- if score > best_score:
78
- best_score = score
79
- best_col = col
80
 
81
- if best_col is None:
82
- return "❌ Impossible d’identifier la colonne Description."
83
 
84
- # 5️⃣ Regrouper par lignes
85
- lines = {}
86
- for w in best_col:
87
- key = int(w["y"] // 25)
88
- lines.setdefault(key, []).append(w)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- ordered_lines = []
91
- for k in sorted(lines.keys()):
92
- line = " ".join(
93
- w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
94
  )
95
- ordered_lines.append(line)
96
-
97
- # 6️⃣ Nettoyage
98
- cleaned = []
99
- for line in ordered_lines:
100
- low = line.lower()
101
- if any(x in low for x in ["vat", "net", "gross", "each", "%"]):
102
- continue
103
- cleaned.append(line)
104
-
105
- # 7️⃣ Fusion multilignes
106
- cells = []
107
- buffer = ""
108
-
109
- for line in cleaned:
110
- if line[:2].replace(".", "").isdigit():
111
- if buffer:
112
- cells.append(buffer.strip())
113
- buffer = line.split(".", 1)[-1].strip()
114
- else:
115
- buffer += " " + line
116
-
117
- if buffer:
118
- cells.append(buffer.strip())
119
-
120
- # 8️⃣ Sortie
121
- output = ""
122
- for i, cell in enumerate(cells, 1):
123
- output += f"{i}. {cell}\n\n"
124
 
125
- return output.strip()
126
 
 
 
 
127
 
128
  demo = gr.Interface(
129
- fn=extract_description_column,
130
  inputs=gr.Image(type="pil", label="Image de facture"),
131
- outputs=gr.Textbox(lines=20, label="Colonne Description"),
132
- title="Extraction robuste de la colonne Description",
133
- description="Fonctionne sans dépendre des headers OCR"
 
 
 
134
  )
135
 
136
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
1
  import gradio as gr
2
+ import torch
3
  import cv2
4
+ import pytesseract
5
  import numpy as np
 
6
  from PIL import Image
7
+ from transformers import DetrImageProcessor, TableTransformerForObjectDetection
8
 
9
# ===============================
# Model loading
# ===============================

# Run inference on GPU when available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Detection model: locates table bounding boxes in the full page image.
det_processor = DetrImageProcessor.from_pretrained(
    "microsoft/table-transformer-detection"
)
det_model = TableTransformerForObjectDetection.from_pretrained(
    "microsoft/table-transformer-detection"
).to(DEVICE)

# Structure model: recognizes the internal layout (rows/columns) of a
# cropped table image — used by extract_description on the table crop.
struct_processor = DetrImageProcessor.from_pretrained(
    "microsoft/table-transformer-structure-recognition"
)
struct_model = TableTransformerForObjectDetection.from_pretrained(
    "microsoft/table-transformer-structure-recognition"
).to(DEVICE)
28
 
29
+ # ===============================
30
+ # OCR cellule
31
+ # ===============================
32
 
33
def ocr_cell(image):
    """Run Tesseract OCR on a single table-cell crop and return the stripped text.

    `image` is a BGR/RGB ndarray crop of one cell; it is converted to
    grayscale before OCR.
    """
    # --psm 6: treat the crop as a single uniform block of text,
    # which matches the content of an individual table cell.
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(grayscale, config="--psm 6").strip()
37
 
38
+ # ===============================
39
+ # Fonction principale
40
+ # ===============================
 
 
 
 
41
 
42
def extract_description(image_pil):
    """Extract the 'Description' column from an invoice table image.

    Pipeline:
      1. Detect the table bounding box with the Table Transformer
         detection model.
      2. Detect the table's rows and columns with the structure model.
      3. Build cell boxes by intersecting rows with columns, OCR each
         cell column by column.
      4. Pick the column whose header contains "description".

    Args:
        image_pil: PIL.Image of the invoice page (assumed RGB — TODO confirm).

    Returns:
        A (description_text, debug_text) tuple of strings. On failure the
        first element is an error message starting with "❌".
    """
    image = np.array(image_pil)
    h, w = image.shape[:2]  # robust to grayscale/RGBA inputs (no 3-channel unpack)

    # ---- Table detection ----
    inputs = det_processor(images=image_pil, return_tensors="pt").to(DEVICE)
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        outputs = det_model(**inputs)

    results = det_processor.post_process_object_detection(
        outputs,
        threshold=0.8,
        target_sizes=[(h, w)]
    )[0]

    tables = [
        box for box, label in zip(results["boxes"], results["labels"])
        if det_model.config.id2label[label.item()] == "table"
    ]

    if not tables:
        return "❌ Aucun tableau détecté", ""

    # Crop the first detected table out of the page.
    x0, y0, x1, y1 = tables[0].int().tolist()
    table_img = image[y0:y1, x0:x1]

    # ---- Table structure ----
    inputs = struct_processor(
        images=Image.fromarray(table_img), return_tensors="pt"
    ).to(DEVICE)
    with torch.no_grad():
        outputs = struct_model(**inputs)

    results = struct_processor.post_process_object_detection(
        outputs,
        threshold=0.7,
        target_sizes=[table_img.shape[:2]]
    )[0]

    # BUG FIX: the structure model never emits a "table cell" label — its
    # label set is "table row" / "table column" / header variants — so the
    # previous `label_name == "table cell"` filter always produced zero
    # cells. Build cells by intersecting detected rows with columns instead.
    rows, cols = [], []
    for box, label in zip(results["boxes"], results["labels"]):
        label_name = struct_model.config.id2label[label.item()]
        if label_name == "table row":
            rows.append(box.int().tolist())
        elif label_name == "table column":
            cols.append(box.int().tolist())

    if not rows or not cols:
        return "❌ Aucune cellule détectée", ""

    # Reading order: rows top-to-bottom, columns left-to-right.
    rows.sort(key=lambda b: b[1])
    cols.sort(key=lambda b: b[0])

    # ---- OCR per column (each column = one list of cell texts) ----
    column_texts = []
    for cx0, _, cx1, _ in cols:
        col_text = []
        for _, ry0, _, ry1 in rows:
            # Clamp to the crop; model boxes can slightly overshoot it.
            cell_img = table_img[max(ry0, 0):ry1, max(cx0, 0):cx1]
            if cell_img.size == 0:  # degenerate intersection — keep row alignment
                col_text.append("")
            else:
                col_text.append(ocr_cell(cell_img))
        column_texts.append(col_text)

    # ---- Identify the Description column by its header cell ----
    desc_col = None
    for col in column_texts:
        header = col[0].lower() if col else ""
        if "description" in header:
            desc_col = col
            break

    if desc_col is None:
        # Return every column's content so the user can debug the layout.
        return "❌ Colonne 'Description' non trouvée", "\n\n".join(
            [" | ".join(col) for col in column_texts]
        )

    # First output: column body (header stripped); second: full column for debug.
    return "\n".join(desc_col[1:]), "\n\n".join(desc_col)
123
 
124
# ===============================
# Gradio interface
# ===============================

# Two outputs: the extracted Description column, and a debug view of the
# raw column the extractor selected (or all columns on failure).
demo = gr.Interface(
    fn=extract_description,
    inputs=gr.Image(type="pil", label="Image de facture"),
    outputs=[
        gr.Textbox(label="📋 Colonne Description"),
        gr.Textbox(label="🛠 Debug colonne détectée")
    ],
    title="Extraction de la colonne Description (Table Transformer)",
    description="Détection automatique de la colonne Description dans les tableaux de factures"
)

demo.launch()