Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Community

kebson committed on Dec 30, 2025

Commit

30b2704

verified ·

1 Parent(s): e6d8b93

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -134

app.py CHANGED Viewed

@@ -1,144 +1,55 @@
 import gradio as gr
-import torch
-import cv2
-import pytesseract
-import numpy as np
 from PIL import Image
-from transformers import DetrImageProcessor, TableTransformerForObjectDetection
 # ===============================
-# Chargement des modèles
 # ===============================
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# Détection de tableau
-det_processor = DetrImageProcessor.from_pretrained(
-    "microsoft/table-transformer-detection"
-)
-det_model = TableTransformerForObjectDetection.from_pretrained(
-    "microsoft/table-transformer-detection"
-).to(DEVICE)
-# Structure (cellules)
-struct_processor = DetrImageProcessor.from_pretrained(
-    "microsoft/table-transformer-structure-recognition"
-)
-struct_model = TableTransformerForObjectDetection.from_pretrained(
-    "microsoft/table-transformer-structure-recognition"
-).to(DEVICE)
-# ===============================
-# OCR cellule
-# ===============================
-def ocr_cell(image):
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    text = pytesseract.image_to_string(gray, config="--psm 6")
-    return text.strip()
 # ===============================
-# Fonction principale
 # ===============================
 def extract_description(image_pil):
-    # Convertir PIL -> np.array
-    image = np.array(image_pil)
-    h, w, _ = image.shape
-    # ---- Détection du tableau ----
-    inputs = det_processor(images=image, return_tensors="pt")
-    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
-    outputs = det_model(**inputs)
-    results = det_processor.post_process_object_detection(
-        outputs,
-        threshold=0.8,
-        target_sizes=[(h, w)]
-    )[0]
-    tables = [
-        box for box, label in zip(results["boxes"], results["labels"])
-        if det_model.config.id2label[label.item()] == "table"
-    ]
-    if not tables:
-        return "❌ Aucun tableau détecté", ""
-    # Extraire premier tableau détecté
-    table_box = tables[0].int().tolist()
-    x0, y0, x1, y1 = table_box
-    table_img = image[y0:y1, x0:x1]
-    # ---- Optionnel : vérifier visuellement le tableau ----
-    # Image.fromarray(table_img).show()
-    # ---- Redimensionner le tableau pour la structure ----
-    max_size = 1024
-    scale = max(table_img.shape[:2]) / max_size
-    new_w = int(table_img.shape[1] / scale)
-    new_h = int(table_img.shape[0] / scale)
-    table_resized = cv2.resize(table_img, (new_w, new_h))
-    # ---- Structure du tableau ----
-    inputs = struct_processor(images=table_resized, return_tensors="pt")
-    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
-    outputs = struct_model(**inputs)
-    results = struct_processor.post_process_object_detection(
-        outputs,
-        threshold=0.5,  # seuil abaissé pour capturer plus de cellules
-        target_sizes=[table_resized.shape[:2]]
-    )[0]
-    cells = []
-    for box, label in zip(results["boxes"], results["labels"]):
-        label_name = struct_model.config.id2label[label.item()]
-        if label_name == "table cell":
-            # Remettre les coordonnées à l'échelle originale
-            scale_x = table_img.shape[1] / table_resized.shape[1]
-            scale_y = table_img.shape[0] / table_resized.shape[0]
-            x0c, y0c, x1c, y1c = box.int().tolist()
-            x0c = int(x0c * scale_x)
-            x1c = int(x1c * scale_x)
-            y0c = int(y0c * scale_y)
-            y1c = int(y1c * scale_y)
-            cells.append([x0c, y0c, x1c, y1c])
-    if not cells:
-        return "❌ Aucune cellule détectée", ""
-    # ---- Grouper par colonne (X) ----
-    cells_sorted = sorted(cells, key=lambda b: (b[0] + b[2]) / 2)
-    columns = {}
-    for cell in cells_sorted:
-        cx = (cell[0] + cell[2]) // 2
-        columns.setdefault(cx // 50, []).append(cell)
-    columns = list(columns.values())
-    columns = sorted(columns, key=lambda col: np.mean([(c[0]+c[2])/2 for c in col]))
-    # ---- OCR par colonne ----
-    column_texts = []
-    for col in columns:
-        col_text = []
-        for x0, y0, x1, y1 in sorted(col, key=lambda b: b[1]):
-            cell_img = table_img[y0:y1, x0:x1]
-            text = ocr_cell(cell_img)
-            col_text.append(text)
-        column_texts.append(col_text)
-    # ---- Identifier colonne Description ----
-    desc_col = None
-    for col in column_texts:
-        header = col[0].lower() if col else ""
-        if "description" in header:
-            desc_col = col
-            break
-    if desc_col is None:
-        return "❌ Colonne 'Description' non trouvée", "\n\n".join(
-            [f"Col {i}: " + " | ".join(col) for i, col in enumerate(column_texts)]
-        )
-    return "\n".join(desc_col[1:]), "\n\n".join(desc_col)
 # ===============================
 # Interface Gradio
@@ -148,10 +59,10 @@ demo = gr.Interface(
     inputs=gr.Image(type="pil", label="Image de facture"),
     outputs=[
         gr.Textbox(label="📋 Colonne Description"),
-        gr.Textbox(label="🛠 Debug colonne détectée")
     ],
-    title="Extraction de la colonne Description (Table Transformer)",
-    description="Détection automatique de la colonne Description dans les tableaux de factures"
 )
 demo.launch()

 import gradio as gr
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 from PIL import Image
+import re
 # ===============================
+# Charger le modèle pré-entraîné TrOCR
 # ===============================
+model_name = "microsoft/trocr-base-table-finetuned"  # Spécial tables
+processor = TrOCRProcessor.from_pretrained(model_name)
+model = VisionEncoderDecoderModel.from_pretrained(model_name)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
 # ===============================
+# Fonction d'extraction de la colonne Description
 # ===============================
 def extract_description(image_pil):
+    # OCR avec TrOCR
+    pixel_values = processor(images=image_pil, return_tensors="pt").pixel_values.to(device)
+    generated_ids = model.generate(pixel_values)
+    ocr_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    # Séparer le texte en lignes
+    lines = [line.strip() for line in ocr_text.split("\n") if line.strip()]
+    # Identifier la colonne Description
+    desc_col = []
+    header_found = False
+    headers = []
+    # Détecter les headers possibles
+    if lines:
+        first_line = lines[0].lower()
+        # Split en colonnes par tabulation ou espaces multiples
+        headers = re.split(r"\t+|\s{2,}", first_line)
+        try:
+            desc_index = next(i for i, h in enumerate(headers) if "description" in h.lower())
+            header_found = True
+        except StopIteration:
+            desc_index = None
+    # Extraire les valeurs sous la colonne Description
+    if header_found:
+        for line in lines[1:]:
+            cols = re.split(r"\t+|\s{2,}", line)
+            if desc_index is not None and desc_index < len(cols):
+                desc_col.append(cols[desc_index])
+    else:
+        return "❌ Colonne 'Description' non trouvée", ocr_text
+    return "\n".join(desc_col), ocr_text
 # ===============================
 # Interface Gradio
     inputs=gr.Image(type="pil", label="Image de facture"),
     outputs=[
         gr.Textbox(label="📋 Colonne Description"),
+        gr.Textbox(label="🛠 OCR complet pour debug")
     ],
+    title="Extraction de la colonne Description (TrOCR + tables)",
+    description="Détection automatique de la colonne Description dans les factures avec TrOCR"
 )
 demo.launch()