Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Community

kebson committed on Dec 23, 2025

Commit

5d96033

verified ·

1 Parent(s): ce6a96f

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -67

app.py CHANGED Viewed

@@ -1,19 +1,47 @@
 import gradio as gr
 import numpy as np
 from paddleocr import PaddleOCR
 from sklearn.cluster import KMeans
-# -----------------------------
 # OCR
-# -----------------------------
 ocr = PaddleOCR(
-    use_textline_orientation=True,
-    lang="fr"
 )
-# -----------------------------
-# Extraction de la 2e colonne
-# -----------------------------
 def extract_second_column(image):
     if image is None:
         return "Aucune image fournie."
@@ -21,66 +49,70 @@ def extract_second_column(image):
     img = np.array(image)
     result = ocr.predict(img)
-    if not result or len(result[0]["rec_texts"]) == 0:
-        return "OCR exécuté mais aucun texte détecté."
     data = result[0]
-    texts = data["rec_texts"]
-    boxes = data["dt_polys"]
-    elements = []
     for text, box in zip(texts, boxes):
-        text = text.strip()
-        if len(text) < 3:
             continue
-        x_center = np.mean([p[0] for p in box])
-        y_center = np.mean([p[1] for p in box])
-        elements.append((x_center, y_center, text))
-    if len(elements) < 5:
-        return "Pas assez de texte détecté."
-    # -----------------------------
-    # 1. Regroupement en colonnes (par X)
-    # -----------------------------
-    X = np.array([[e[0]] for e in elements])
-    # Nombre de colonnes estimé automatiquement
-    n_cols = min(6, max(2, len(elements) // 6))
-    kmeans = KMeans(n_clusters=n_cols, random_state=42, n_init=10)
-    labels = kmeans.fit_predict(X)
-    columns = {}
-    for (x, y, text), label in zip(elements, labels):
-        columns.setdefault(label, []).append((x, y, text))
-    # Trier les colonnes de gauche à droite
-    sorted_columns = sorted(
-        columns.values(),
-        key=lambda col: np.mean([e[0] for e in col])
-    )
-    if len(sorted_columns) < 2:
-        return "Impossible de détecter la 2e colonne."
-    # -----------------------------
-    # 2. Sélection de la 2e colonne
-    # -----------------------------
-    col = sorted_columns[1]
-    col.sort(key=lambda e: e[1])  # top → bottom
-    # -----------------------------
-    # 3. Fusion verticale (cellules)
-    # -----------------------------
     merged = []
     current = ""
     last_y = None
     Y_THRESHOLD = 22
-    for _, y, text in col:
         if last_y is None or abs(y - last_y) > Y_THRESHOLD:
             if current:
                 merged.append(current.strip())
@@ -93,28 +125,38 @@ def extract_second_column(image):
     if current:
         merged.append(current.strip())
-    # -----------------------------
-    # 4. Nettoyage léger
-    # -----------------------------
-    final = [
-        line for line in merged
-        if len(line) > 5
-    ]
     if not final:
-        return "Colonne détectée mais vide."
-    return "\n".join(f"{i+1}. {l}" for i, l in enumerate(final))
-# -----------------------------
 # Interface Gradio
-# -----------------------------
 demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
-    outputs=gr.Textbox(label="Contenu de la 2e colonne"),
-    title="Extraction de la 2e colonne du tableau",
-    description="La colonne cible est toujours la deuxième (texte uniquement)"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 import numpy as np
+import unicodedata
 from paddleocr import PaddleOCR
 from sklearn.cluster import KMeans
+# -------------------------------------------------
 # OCR
+# -------------------------------------------------
 ocr = PaddleOCR(
+    lang="fr",
+    use_textline_orientation=True
 )
+# -------------------------------------------------
+# Normalisation texte (casse + accents)
+# -------------------------------------------------
+def normalize(text: str) -> str:
+    text = text.lower()
+    text = unicodedata.normalize("NFD", text)
+    text = "".join(c for c in text if unicodedata.category(c) != "Mn")
+    return " ".join(text.split())
+# -------------------------------------------------
+# Titres valides de la colonne 2
+# -------------------------------------------------
+COL_TITLES = {
+    "designation",
+    "designations",
+    "description",
+    "description des services"
+}
+# -------------------------------------------------
+# Mots / lignes à ignorer
+# -------------------------------------------------
+IGNORE_KEYWORDS = {
+    "prix", "total", "ht", "htva", "tva",
+    "ttc", "general", "generale"
+}
+# -------------------------------------------------
+# Fonction principale
+# -------------------------------------------------
 def extract_second_column(image):
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
     result = ocr.predict(img)
+    if not result:
+        return "OCR : aucun texte détecté."
     data = result[0]
+    texts = data.get("rec_texts", [])
+    boxes = data.get("dt_polys", [])
+    blocks = []
     for text, box in zip(texts, boxes):
+        t = text.strip()
+        if len(t) < 2:
             continue
+        x = np.mean([p[0] for p in box])
+        y = np.mean([p[1] for p in box])
+        blocks.append((t, x, y))
+    if len(blocks) < 5:
+        return "Pas assez de texte exploitable."
+    # -------------------------------------------------
+    # 1. Détection du X de la colonne cible via son titre
+    # -------------------------------------------------
+    col_x = None
+    for text, x, y in blocks:
+        if normalize(text) in COL_TITLES:
+            col_x = x
+            break
+    if col_x is None:
+        return "Titre de la colonne cible non détecté."
+    # -------------------------------------------------
+    # 2. Sélection des blocs proches du X détecté
+    # -------------------------------------------------
+    X_THRESHOLD = 45
+    column_blocks = [
+        (t, x, y) for t, x, y in blocks
+        if abs(x - col_x) < X_THRESHOLD
+    ]
+    if not column_blocks:
+        return "Colonne détectée mais vide."
+    # -------------------------------------------------
+    # 3. Tri vertical (haut → bas)
+    # -------------------------------------------------
+    column_blocks.sort(key=lambda e: e[2])
+    # -------------------------------------------------
+    # 4. Fusion intelligente des lignes OCR
+    # -------------------------------------------------
     merged = []
     current = ""
     last_y = None
     Y_THRESHOLD = 22
+    for text, x, y in column_blocks:
+        nt = normalize(text)
+        if any(k in nt for k in IGNORE_KEYWORDS):
+            continue
         if last_y is None or abs(y - last_y) > Y_THRESHOLD:
             if current:
                 merged.append(current.strip())
     if current:
         merged.append(current.strip())
+    # -------------------------------------------------
+    # 5. Nettoyage final (cellules texte uniquement)
+    # -------------------------------------------------
+    final = []
+    for line in merged:
+        nt = normalize(line)
+        if len(nt) < 4:
+            continue
+        if sum(c.isdigit() for c in line) > len(line) / 2:
+            continue
+        final.append(line)
     if not final:
+        return "Aucune cellule texte valide trouvée."
+    # -------------------------------------------------
+    # 6. Résultat numéroté
+    # -------------------------------------------------
+    return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
+# -------------------------------------------------
 # Interface Gradio
+# -------------------------------------------------
 demo = gr.Interface(
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
+    outputs=gr.Textbox(label="Contenu de la colonne 2"),
+    title="Extraction fiable de la colonne 2 (Désignation / Description)",
+    description=(
+        "Extraction robuste de la deuxième colonne des tableaux scannés "
+        "(Désignation, DESIGNATIONS, Description, Description des services)."
+    )
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)