kebson committed on
Commit
6f02e5b
·
verified ·
1 Parent(s): 010fbd5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -133
app.py CHANGED
@@ -1,153 +1,139 @@
1
import gradio as gr
import cv2
import numpy as np
import torch
from PIL import Image
from transformers import TableTransformerForObjectDetection, AutoImageProcessor
from paddleocr import PaddleOCR
from unidecode import unidecode

# =========================
# Model initialisation
# =========================

# CPU-only inference (no GPU assumed on the host).
device = "cpu"

processor = AutoImageProcessor.from_pretrained(
    "microsoft/table-transformer-detection"
)

model = TableTransformerForObjectDetection.from_pretrained(
    "microsoft/table-transformer-detection"
).to(device)

# French OCR with text-angle classification enabled.
ocr = PaddleOCR(
    lang="fr",
    use_angle_cls=True,
)
29
 
30
- # =========================
31
- # Utils
32
- # =========================
 
 
 
33
 
34
def normalize_text(text):
    """Return *text* lowercased, trimmed, and stripped of accents."""
    cleaned = text.lower().strip()
    return unidecode(cleaned)
36
 
37
def preprocess_image(pil_img):
    """Convert a PIL image to an adaptively-thresholded grayscale array.

    Adaptive (Gaussian, 31px window) thresholding helps OCR on unevenly
    lit photos of documents.
    """
    bgr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    return cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        31,
        2,
    )
46
-
47
- # =========================
48
- # Détection tableau
49
- # =========================
50
-
51
def detect_table(pil_img):
    """Locate the first "table" region in *pil_img*.

    Runs the table-transformer detector and returns the first detection
    labelled "table" as integer pixel coordinates [x1, y1, x2, y2], or
    None when nothing clears the 0.7 confidence threshold.
    """
    inputs = processor(images=pil_img, return_tensors="pt")
    outputs = model(**inputs)

    # post-processing expects (height, width); PIL size is (width, height)
    target_sizes = torch.tensor([pil_img.size[::-1]])
    detections = processor.post_process_object_detection(
        outputs, threshold=0.7, target_sizes=target_sizes
    )[0]

    candidates = zip(
        detections["scores"], detections["labels"], detections["boxes"]
    )
    for _score, label, box in candidates:
        if model.config.id2label[label.item()] == "table":
            return [int(coord) for coord in box.tolist()]

    return None
71
-
72
- # =========================
73
- # OCR complet image
74
- # =========================
75
-
76
def run_ocr(img):
    """Run PaddleOCR on *img* and return a list of (bbox, text) tuples.

    PaddleOCR returns one "block" per page, and a block can be None when
    nothing was detected on that page — the original code iterated such a
    block and raised TypeError. Empty/None results now yield [].
    """
    result = ocr.ocr(img, cls=True)
    lines = []
    for block in result or []:
        if not block:  # PaddleOCR emits None for pages with no text
            continue
        for bbox, (text, _confidence) in block:
            lines.append((bbox, text))
    return lines
84
-
85
- # =========================
86
- # Extraction colonne Désignations
87
- # =========================
88
-
89
def extract_designations(pil_img):
    """Extract the "Désignations" column of a table photographed in *pil_img*.

    Pipeline: detect the table region, binarise it, OCR it, group the OCR
    lines into columns by x-center, then pick the column whose header looks
    like a designation/description header.

    Returns a (status_message, designations_list) tuple; the list is empty
    on failure.
    """
    table_box = detect_table(pil_img)
    if table_box is None:
        return "❌ Aucun tableau détecté", []

    x1, y1, x2, y2 = table_box
    img = preprocess_image(pil_img)
    table_img = img[y1:y2, x1:x2]

    ocr_lines = run_ocr(table_img)

    # Group OCR lines into columns by x-center. BUG FIX: the original used
    # the exact pixel x-center as the dict key, so two cells of the same
    # column almost never grouped together; bucket within a tolerance instead.
    X_TOLERANCE = 40  # px — cells whose centers differ by less share a column
    columns = {}
    for bbox, text in ocr_lines:
        x_coords = [p[0] for p in bbox]
        x_center = int(sum(x_coords) / len(x_coords))
        key = next(
            (k for k in columns if abs(k - x_center) <= X_TOLERANCE),
            x_center,
        )
        columns.setdefault(key, []).append(text)

    # Scan columns left to right for a designation-like header.
    designation_col = None
    for _, texts in sorted(columns.items(), key=lambda kv: kv[0]):
        header = normalize_text(" ".join(texts[:2]))
        if any(k in header for k in (
            "designation", "designation des travaux",
            "libelle", "description",
        )):
            designation_col = texts[1:]  # skip the header cell itself
            break

    if designation_col is None:
        return "❌ Colonne Désignations non trouvée", []

    cleaned = [t for t in designation_col if len(t.strip()) > 2]
    return " Extraction réussie", cleaned
129
 
130
- # =========================
131
- # Gradio UI
132
- # =========================
 
 
133
 
134
def process(image):
    """Gradio callback: run the extraction and join the lines for display."""
    status, designations = extract_designations(image)
    joined = "\n".join(designations)
    return status, joined
137
 
138
# Gradio UI: image in, status + extracted designations out.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Extraction de la colonne **Désignations**")

    image_input = gr.Image(type="pil", label="Uploader une image")
    status = gr.Textbox(label="Statut")
    output = gr.Textbox(label="Désignations extraites", lines=15)

    btn = gr.Button("Extraire")
    btn.click(process, inputs=image_input, outputs=[status, output])

demo.queue().launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import numpy as np
from paddleocr import PaddleOCR
from sklearn.cluster import KMeans

# -----------------------------
# OCR
# -----------------------------
# French OCR engine with text-line orientation detection enabled.
ocr = PaddleOCR(
    use_textline_orientation=True,
    lang="fr",
)
13
 
14
+ # -----------------------------
15
+ # Fonction principale
16
+ # -----------------------------
17
+ def extract_column2_9_lines(image):
18
+ if image is None:
19
+ return "Aucune image fournie."
20
 
21
+ img = np.array(image)
22
+ result = ocr.predict(img)
23
 
24
+ if not result or len(result) == 0:
25
+ return "OCR exécuté mais aucun texte détecté."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ data = result[0]
28
+ texts = data.get("rec_texts", [])
29
+ boxes = data.get("dt_polys", [])
30
 
31
+ if not texts:
32
+ return "Aucun texte exploitable détecté."
33
 
34
+ # -----------------------------
35
+ # 1. Collecte OCR
36
+ # -----------------------------
37
+ elements = []
38
+ for text, box in zip(texts, boxes):
39
+ text = text.strip()
40
+ if len(text) < 3:
41
+ continue
42
 
43
+ x_center = np.mean([p[0] for p in box])
44
+ y_center = np.mean([p[1] for p in box])
 
 
 
 
 
 
 
45
 
46
+ elements.append((x_center, y_center, text))
 
47
 
48
+ if len(elements) < 5:
49
+ return "Pas assez de texte détecté."
50
 
51
+ # -----------------------------
52
+ # 2. Clustering horizontal ADAPTATIF
53
+ # -----------------------------
54
+ X = np.array([[e[0]] for e in elements])
55
+ n_clusters = min(8, max(3, len(elements) // 8))
56
 
57
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
58
+ labels = kmeans.fit_predict(X)
 
59
 
60
+ columns = {}
61
+ for (x, y, text), label in zip(elements, labels):
62
+ columns.setdefault(label, []).append((x, y, text))
63
+
64
+ # -----------------------------
65
+ # 3. Choisir la colonne "Description"
66
+ # => la plus riche en texte non numérique
67
+ # -----------------------------
68
+ def column_score(col):
69
+ score = 0
70
+ for _, _, t in col:
71
+ if not any(char.isdigit() for char in t):
72
+ score += len(t)
73
+ return score
74
+
75
+ best_column = max(columns.values(), key=column_score)
76
+
77
+ # Tri vertical
78
+ best_column.sort(key=lambda e: e[1])
79
+
80
+ # -----------------------------
81
+ # 4. Fusion intelligente des lignes
82
+ # -----------------------------
83
+ merged_lines = []
84
+ current_text = ""
85
+ last_y = None
86
+ Y_THRESHOLD = 22
87
+
88
+ blacklist = (
89
+ "DESIGNATION", "UNITE", "QUANT", "PRIX", "TOTAL",
90
+ "LOT", "BORDEREAU", "DATE", "NB", "TTC", "HT"
91
  )
92
 
93
+ for _, y, text in best_column:
94
+ if text.upper().startswith(blacklist):
95
+ continue
96
+
97
+ if last_y is None or abs(y - last_y) > Y_THRESHOLD:
98
+ if current_text:
99
+ merged_lines.append(current_text.strip())
100
+ current_text = text
101
+ else:
102
+ current_text += " " + text
103
+
104
+ last_y = y
105
+
106
+ if current_text:
107
+ merged_lines.append(current_text.strip())
108
+
109
+ # -----------------------------
110
+ # 5. Nettoyage final
111
+ # -----------------------------
112
+ cleaned = []
113
+ for line in merged_lines:
114
+ if len(line) < 5:
115
+ continue
116
+ if sum(c.isdigit() for c in line) > len(line) / 2:
117
+ continue
118
+ cleaned.append(line)
119
+
120
+ final_lines = cleaned[:9]
121
+
122
+ if not final_lines:
123
+ return "Colonne détectée mais contenu non exploitable."
124
+
125
+ # Numérotation demandée
126
+ return "\n".join([f"{i+1}. {l}" for i, l in enumerate(final_lines)])
127
+
128
# -----------------------------
# Interface Gradio
# -----------------------------
demo = gr.Interface(
    fn=extract_column2_9_lines,
    inputs=gr.Image(type="pil", label="Image du tableau"),
    outputs=gr.Textbox(label="Colonne Description (9 lignes)"),
    title="Extraction robuste de la colonne Description",
    description="Optimisé pour tableaux photographiés (devis, factures, bordereaux)",
)

demo.launch(server_name="0.0.0.0", server_port=7860)