Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 24, 2025

Commit

f46ff1b

verified ·

1 Parent(s): 7638b6f

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -150

app.py CHANGED Viewed

@@ -1,162 +1,153 @@
 import gradio as gr
 import numpy as np
-import unicodedata
 from paddleocr import PaddleOCR
-from sklearn.cluster import KMeans
-# -------------------------------------------------
-# OCR
-# -------------------------------------------------
 ocr = PaddleOCR(
     lang="fr",
-    use_textline_orientation=True
 )
-# -------------------------------------------------
-# Normalisation texte (casse + accents)
-# -------------------------------------------------
-def normalize(text: str) -> str:
-    text = text.lower()
-    text = unicodedata.normalize("NFD", text)
-    text = "".join(c for c in text if unicodedata.category(c) != "Mn")
-    return " ".join(text.split())
-# -------------------------------------------------
-# Titres valides de la colonne 2
-# -------------------------------------------------
-COL_TITLES = {
-    "designation",
-    "designations",
-    "description",
-    "description des services"
-}
-# -------------------------------------------------
-# Mots / lignes à ignorer
-# -------------------------------------------------
-IGNORE_KEYWORDS = {
-    "prix", "total", "ht", "htva", "tva",
-    "ttc", "general", "generale"
-}
-# -------------------------------------------------
-# Fonction principale
-# -------------------------------------------------
-def extract_second_column(image):
-    if image is None:
-        return "Aucune image fournie."
-    img = np.array(image)
-    result = ocr.predict(img)
-    if not result:
-        return "OCR : aucun texte détecté."
-    data = result[0]
-    texts = data.get("rec_texts", [])
-    boxes = data.get("dt_polys", [])
-    blocks = []
-    for text, box in zip(texts, boxes):
-        t = text.strip()
-        if len(t) < 2:
-            continue
-        x = np.mean([p[0] for p in box])
-        y = np.mean([p[1] for p in box])
-        blocks.append((t, x, y))
-    if len(blocks) < 5:
-        return "Pas assez de texte exploitable."
-    # -------------------------------------------------
-    # 1. Détection du X de la colonne cible via son titre
-    # -------------------------------------------------
-    col_x = None
-    for text, x, y in blocks:
-        if normalize(text) in COL_TITLES:
-            col_x = x
             break
-    if col_x is None:
-        return "Titre de la colonne cible non détecté."
-    # -------------------------------------------------
-    # 2. Sélection des blocs proches du X détecté
-    # -------------------------------------------------
-    X_THRESHOLD = 45
-    column_blocks = [
-        (t, x, y) for t, x, y in blocks
-        if abs(x - col_x) < X_THRESHOLD
-    ]
-    if not column_blocks:
-        return "Colonne détectée mais vide."
-    # -------------------------------------------------
-    # 3. Tri vertical (haut → bas)
-    # -------------------------------------------------
-    column_blocks.sort(key=lambda e: e[2])
-    # -------------------------------------------------
-    # 4. Fusion intelligente des lignes OCR
-    # -------------------------------------------------
-    merged = []
-    current = ""
-    last_y = None
-    Y_THRESHOLD = 22
-    for text, x, y in column_blocks:
-        nt = normalize(text)
-        if any(k in nt for k in IGNORE_KEYWORDS):
-            continue
-        if last_y is None or abs(y - last_y) > Y_THRESHOLD:
-            if current:
-                merged.append(current.strip())
-            current = text
-        else:
-            current += " " + text
-        last_y = y
-    if current:
-        merged.append(current.strip())
-    # -------------------------------------------------
-    # 5. Nettoyage final (cellules texte uniquement)
-    # -------------------------------------------------
-    final = []
-    for line in merged:
-        nt = normalize(line)
-        if len(nt) < 4:
-            continue
-        if sum(c.isdigit() for c in line) > len(line) / 2:
-            continue
-        final.append(line)
-    if not final:
-        return "Aucune cellule texte valide trouvée."
-    # -------------------------------------------------
-    # 6. Résultat numéroté
-    # -------------------------------------------------
-    return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
-# -------------------------------------------------
-# Interface Gradio
-# -------------------------------------------------
-demo = gr.Interface(
-    fn=extract_second_column,
-    inputs=gr.Image(type="pil", label="Image du tableau"),
-    outputs=gr.Textbox(label="Contenu de la colonne 2"),
-    title="Extraction fiable de la colonne 2 (Désignation / Description)",
-    description=(
-        "Extraction robuste de la deuxième colonne des tableaux scannés "
-        "(Désignation, DESIGNATIONS, Description, Description des services)."
     )
-)
-demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
+import cv2
 import numpy as np
+import torch
+from PIL import Image
+from transformers import TableTransformerForObjectDetection, AutoImageProcessor
 from paddleocr import PaddleOCR
+from unidecode import unidecode
+# =========================
+# Initialisation modèles
+# =========================
+device = "cpu"
+processor = AutoImageProcessor.from_pretrained(
+    "microsoft/table-transformer-detection"
+)
+model = TableTransformerForObjectDetection.from_pretrained(
+    "microsoft/table-transformer-detection"
+).to(device)
 ocr = PaddleOCR(
     lang="fr",
+    use_angle_cls=True,
+    show_log=False
 )
+# =========================
+# Utils
+# =========================
+def normalize_text(text):
+    return unidecode(text.lower().strip())
+def preprocess_image(pil_img):
+    img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    gray = cv2.adaptiveThreshold(
+        gray, 255,
+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+        cv2.THRESH_BINARY, 31, 2
+    )
+    return gray
+# =========================
+# Détection tableau
+# =========================
+def detect_table(pil_img):
+    inputs = processor(images=pil_img, return_tensors="pt")
+    outputs = model(**inputs)
+    target_sizes = torch.tensor([pil_img.size[::-1]])
+    results = processor.post_process_object_detection(
+        outputs,
+        threshold=0.7,
+        target_sizes=target_sizes
+    )[0]
+    for score, label, box in zip(
+        results["scores"],
+        results["labels"],
+        results["boxes"]
+    ):
+        if model.config.id2label[label.item()] == "table":
+            return [int(x) for x in box.tolist()]
+    return None
+# =========================
+# OCR complet image
+# =========================
+def run_ocr(img):
+    result = ocr.ocr(img, cls=True)
+    lines = []
+    for block in result:
+        for line in block:
+            bbox, (text, _) = line
+            lines.append((bbox, text))
+    return lines
+# =========================
+# Extraction colonne Désignations
+# =========================
+def extract_designations(pil_img):
+    table_box = detect_table(pil_img)
+    if table_box is None:
+        return "❌ Aucun tableau détecté", []
+    x1, y1, x2, y2 = table_box
+    img = preprocess_image(pil_img)
+    table_img = img[y1:y2, x1:x2]
+    ocr_lines = run_ocr(table_img)
+    # Regrouper lignes par hauteur (approx colonnes)
+    columns = {}
+    for bbox, text in ocr_lines:
+        x_coords = [p[0] for p in bbox]
+        x_center = int(sum(x_coords) / len(x_coords))
+        if x_center not in columns:
+            columns[x_center] = []
+        columns[x_center].append(text)
+    # Trier colonnes de gauche à droite
+    sorted_cols = sorted(columns.items(), key=lambda x: x[0])
+    designation_col = None
+    for _, texts in sorted_cols:
+        header = normalize_text(" ".join(texts[:2]))
+        if any(k in header for k in [
+            "designation", "designation des travaux",
+            "libelle", "description"
+        ]):
+            designation_col = texts[1:]  # skip header
             break
+    if designation_col is None:
+        return "❌ Colonne Désignations non trouvée", []
+    cleaned = [t for t in designation_col if len(t.strip()) > 2]
+    return "✅ Extraction réussie", cleaned
+# =========================
+# Gradio UI
+# =========================
+def process(image):
+    status, designations = extract_designations(image)
+    return status, "\n".join(designations)
+with gr.Blocks() as demo:
+    gr.Markdown("## 📄 Extraction de la colonne **Désignations**")
+    image_input = gr.Image(type="pil", label="Uploader une image")
+    status = gr.Textbox(label="Statut")
+    output = gr.Textbox(label="Désignations extraites", lines=15)
+    btn = gr.Button("Extraire")
+    btn.click(
+        process,
+        inputs=image_input,
+        outputs=[status, output]
     )
+demo.launch(server_name="0.0.0.0",server_port=7860)