Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 24, 2025

Commit

a1e0d1a

verified ·

1 Parent(s): 7e77d30

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -92

app.py CHANGED Viewed

@@ -1,139 +1,162 @@
 import gradio as gr
 import numpy as np
 from paddleocr import PaddleOCR
 from sklearn.cluster import KMeans
-# -----------------------------
 # OCR
-# -----------------------------
 ocr = PaddleOCR(
-    use_textline_orientation=True,
-    lang="fr"
 )
-# -----------------------------
 # Fonction principale
-# -----------------------------
-def extract_column2_9_lines(image):
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
     result = ocr.predict(img)
-    if not result or len(result) == 0:
-        return "OCR exécuté mais aucun texte détecté."
     data = result[0]
     texts = data.get("rec_texts", [])
     boxes = data.get("dt_polys", [])
-    if not texts:
-        return "Aucun texte exploitable détecté."
-    # -----------------------------
-    # 1. Collecte OCR
-    # -----------------------------
-    elements = []
     for text, box in zip(texts, boxes):
-        text = text.strip()
-        if len(text) < 3:
             continue
-        x_center = np.mean([p[0] for p in box])
-        y_center = np.mean([p[1] for p in box])
-        elements.append((x_center, y_center, text))
-    if len(elements) < 5:
-        return "Pas assez de texte détecté."
-    # -----------------------------
-    # 2. Clustering horizontal ADAPTATIF
-    # -----------------------------
-    X = np.array([[e[0]] for e in elements])
-    n_clusters = min(8, max(3, len(elements) // 8))
-    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
-    labels = kmeans.fit_predict(X)
-    columns = {}
-    for (x, y, text), label in zip(elements, labels):
-        columns.setdefault(label, []).append((x, y, text))
-    # -----------------------------
-    # 3. Choisir la colonne "Description"
-    # => la plus riche en texte non numérique
-    # -----------------------------
-    def column_score(col):
-        score = 0
-        for _, _, t in col:
-            if not any(char.isdigit() for char in t):
-                score += len(t)
-        return score
-    best_column = max(columns.values(), key=column_score)
-    # Tri vertical
-    best_column.sort(key=lambda e: e[1])
-    # -----------------------------
-    # 4. Fusion intelligente des lignes
-    # -----------------------------
-    merged_lines = []
-    current_text = ""
     last_y = None
     Y_THRESHOLD = 22
-    blacklist = (
-        "DESIGNATION", "UNITE", "QUANT", "PRIX", "TOTAL",
-        "LOT", "BORDEREAU", "DATE", "NB", "TTC", "HT"
-    )
-    for _, y, text in best_column:
-        if text.upper().startswith(blacklist):
             continue
         if last_y is None or abs(y - last_y) > Y_THRESHOLD:
-            if current_text:
-                merged_lines.append(current_text.strip())
-            current_text = text
         else:
-            current_text += " " + text
         last_y = y
-    if current_text:
-        merged_lines.append(current_text.strip())
-    # -----------------------------
-    # 5. Nettoyage final
-    # -----------------------------
-    cleaned = []
-    for line in merged_lines:
-        if len(line) < 5:
             continue
         if sum(c.isdigit() for c in line) > len(line) / 2:
             continue
-        cleaned.append(line)
-    final_lines = cleaned[:9]
-    if not final_lines:
-        return "Colonne détectée mais contenu non exploitable."
-    # Numérotation demandée
-    return "\n".join([f"{i+1}. {l}" for i, l in enumerate(final_lines)])
-# -----------------------------
 # Interface Gradio
-# -----------------------------
 demo = gr.Interface(
-    fn=extract_column2_9_lines,
     inputs=gr.Image(type="pil", label="Image du tableau"),
-    outputs=gr.Textbox(label="Colonne Description (9 lignes)"),
-    title="Extraction robuste de la colonne Description",
-    description="Optimisé pour tableaux photographiés (devis, factures, bordereaux)"
 )
-demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 import numpy as np
+import unicodedata
 from paddleocr import PaddleOCR
 from sklearn.cluster import KMeans
+# -------------------------------------------------
 # OCR
+# -------------------------------------------------
+# Module-level OCR engine, created once at import; extract_second_column
+# calls ocr.predict() on it for every submitted image.
 ocr = PaddleOCR(
+    lang="fr",
+    use_textline_orientation=True
 )
+# -------------------------------------------------
+# Normalisation texte (casse + accents)
+# -------------------------------------------------
+def normalize(text: str) -> str:
+    """Return *text* lower-cased, stripped of accents, with whitespace collapsed.
+
+    Used to compare OCR output with the accent-free, lower-case entries of
+    ``COL_TITLES`` and ``IGNORE_KEYWORDS``.
+    """
+    text = text.lower()
+    # NFD decomposes an accented letter into its base letter followed by
+    # combining marks, so the marks can be removed on their own.
+    text = unicodedata.normalize("NFD", text)
+    # "Mn" is the Unicode category of non-spacing (combining) marks.
+    text = "".join(c for c in text if unicodedata.category(c) != "Mn")
+    # split()/join collapses every whitespace run and trims both ends.
+    return " ".join(text.split())
+# -------------------------------------------------
+# Titres valides de la colonne 2
+# -------------------------------------------------
+# Accepted header texts of the target column, written the way normalize()
+# returns them (lower-case, no accents): extract_second_column looks up
+# normalize(text) in this set.
+COL_TITLES = {
+    "designation",
+    "designations",
+    "description",
+    "description des services"
+}
+# -------------------------------------------------
+# Mots / lignes à ignorer
+# -------------------------------------------------
+# Keywords (lower-case, no accents, as normalize() returns them) that make
+# extract_second_column skip an OCR block of the selected column.
+IGNORE_KEYWORDS = {
+    "prix", "total", "ht", "htva", "tva",
+    "ttc", "general", "generale"
+}
+# -------------------------------------------------
 # Fonction principale
+# -------------------------------------------------
+def extract_second_column(image):
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
     result = ocr.predict(img)
+    if not result:
+        return "OCR : aucun texte détecté."
     data = result[0]
     texts = data.get("rec_texts", [])
     boxes = data.get("dt_polys", [])
+    blocks = []
     for text, box in zip(texts, boxes):
+        t = text.strip()
+        if len(t) < 2:
             continue
+        x = np.mean([p[0] for p in box])
+        y = np.mean([p[1] for p in box])
+        blocks.append((t, x, y))
+    if len(blocks) < 5:
+        return "Pas assez de texte exploitable."
+    # -------------------------------------------------
+    # 1. Détection du X de la colonne cible via son titre
+    # -------------------------------------------------
+    col_x = None
+    for text, x, y in blocks:
+        if normalize(text) in COL_TITLES:
+            col_x = x
+            break
+    if col_x is None:
+        return "Titre de la colonne cible non détecté."
+    # -------------------------------------------------
+    # 2. Sélection des blocs proches du X détecté
+    # -------------------------------------------------
+    X_THRESHOLD = 45
+    column_blocks = [
+        (t, x, y) for t, x, y in blocks
+        if abs(x - col_x) < X_THRESHOLD
+    ]
+    if not column_blocks:
+        return "Colonne détectée mais vide."
+    # -------------------------------------------------
+    # 3. Tri vertical (haut → bas)
+    # -------------------------------------------------
+    column_blocks.sort(key=lambda e: e[2])
+    # -------------------------------------------------
+    # 4. Fusion intelligente des lignes OCR
+    # -------------------------------------------------
+    merged = []
+    current = ""
     last_y = None
     Y_THRESHOLD = 22
+    for text, x, y in column_blocks:
+        nt = normalize(text)
+        if any(k in nt for k in IGNORE_KEYWORDS):
             continue
         if last_y is None or abs(y - last_y) > Y_THRESHOLD:
+            if current:
+                merged.append(current.strip())
+            current = text
         else:
+            current += " " + text
         last_y = y
+    if current:
+        merged.append(current.strip())
+    # -------------------------------------------------
+    # 5. Nettoyage final (cellules texte uniquement)
+    # -------------------------------------------------
+    final = []
+    for line in merged:
+        nt = normalize(line)
+        if len(nt) < 4:
             continue
         if sum(c.isdigit() for c in line) > len(line) / 2:
             continue
+        final.append(line)
+    if not final:
+        return "Aucune cellule texte valide trouvée."
+    # -------------------------------------------------
+    # 6. Résultat numéroté
+    # -------------------------------------------------
+    return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
+# -------------------------------------------------
 # Interface Gradio
+# -------------------------------------------------
 demo = gr.Interface(
+    fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
+    outputs=gr.Textbox(label="Contenu de la colonne 2"),
+    title="Extraction fiable de la colonne 2 (Désignation / Description)",
+    description=(
+        "Extraction robuste de la deuxième colonne des tableaux scannés "
+        "(Désignation, DESIGNATIONS, Description, Description des services)."
+    )
 )
+demo.launch(server_name="0.0.0.0", server_port=7860))