Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 24, 2025

Commit

7e77d30

verified ·

1 Parent(s): e175021

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -114

app.py CHANGED Viewed

@@ -1,150 +1,139 @@
 import gradio as gr
 import numpy as np
-import unicodedata
 from paddleocr import PaddleOCR
-# -------------------------------------------------
-# OCR (CONFIG STABLE POUR HUGGING FACE)
-# -------------------------------------------------
 ocr = PaddleOCR(
-    lang="fr",
-    use_angle_cls=False,            # ⛔ désactivation orientation
-    show_log=False                  # silence logs
 )
-# -------------------------------------------------
-# Normalisation texte
-# -------------------------------------------------
-def normalize(text: str) -> str:
-    text = text.lower()
-    text = unicodedata.normalize("NFD", text)
-    text = "".join(c for c in text if unicodedata.category(c) != "Mn")
-    return " ".join(text.split())
-# -------------------------------------------------
-# Titres colonne 2
-# -------------------------------------------------
-COL_TITLES = {
-    "designation",
-    "designations",
-    "description",
-    "description des services"
-}
-# -------------------------------------------------
-# Mots à ignorer
-# -------------------------------------------------
-IGNORE_KEYWORDS = {
-    "prix", "total", "ht", "htva", "tva",
-    "ttc", "general", "generale"
-}
-# -------------------------------------------------
-# Extraction colonne 2
-# -------------------------------------------------
-def extract_second_column(image):
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
-    result = ocr.ocr(img, cls=False)
-    if not result or not result[0]:
-        return "OCR : aucun texte détecté."
-    blocks = []
-    for line in result[0]:
-        text = line[1][0].strip()
-        box = line[0]
-        if len(text) < 2:
             continue
-        x = np.mean([p[0] for p in box])
-        y = np.mean([p[1] for p in box])
-        blocks.append((text, x, y))
-    # -------------------------------------------------
-    # 1. Trouver le titre
-    # -------------------------------------------------
-    col_x, title_y = None, None
-    for text, x, y in blocks:
-        if normalize(text) in COL_TITLES:
-            col_x, title_y = x, y
-            break
-    if col_x is None:
-        return "Titre de la colonne non détecté."
-    # -------------------------------------------------
-    # 2. Filtrage par X + sous le titre
-    # -------------------------------------------------
-    column_blocks = [
-        (t, x, y) for t, x, y in blocks
-        if abs(x - col_x) < 50 and y > title_y + 15
-    ]
-    column_blocks.sort(key=lambda e: e[2])
-    # -------------------------------------------------
-    # 3. Fusion contrôlée
-    # -------------------------------------------------
-    merged = []
-    current = ""
     last_y = None
-    for text, x, y in column_blocks:
-        nt = normalize(text)
-        if any(k in nt for k in IGNORE_KEYWORDS):
             continue
-        new_cell = (
-            last_y is None
-            or abs(y - last_y) > 35
-            or text[0].isupper()
-        )
-        if new_cell:
-            if current:
-                merged.append(current.strip())
-            current = text
         else:
-            current += " " + text
         last_y = y
-    if current:
-        merged.append(current.strip())
-    # -------------------------------------------------
-    # 4. Nettoyage final
-    # -------------------------------------------------
-    final = []
-    for line in merged:
-        if not line[0].isupper():
             continue
-        if sum(c.isdigit() for c in line) > len(line) * 0.4:
             continue
-        final.append(line)
-    if not final:
-        return "Aucune cellule valide trouvée."
-    return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
-# -------------------------------------------------
 # Interface Gradio
-# -------------------------------------------------
 demo = gr.Interface(
-    fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
-    outputs=gr.Textbox(label="Contenu colonne 2"),
-    title="Extraction colonne Désignation / Description"
 )
-demo.launch(
-    server_name="0.0.0.0",
-    server_port=7860,
-    ssr_mode=False
-)

 import gradio as gr
 import numpy as np
 from paddleocr import PaddleOCR
+from sklearn.cluster import KMeans
+# -----------------------------
+# OCR
+# -----------------------------
 ocr = PaddleOCR(
+    use_textline_orientation=True,
+    lang="fr"
 )
+# -----------------------------
+# Fonction principale
+# -----------------------------
def _strip_accents(text):
    """Return *text* with combining diacritics removed ("Désignation" -> "Designation")."""
    import unicodedata  # local import keeps this block self-contained

    decomposed = unicodedata.normalize("NFD", text)
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")


def extract_column2_9_lines(image):
    """Extract up to nine numbered text lines from the table's description column.

    Pipeline:
      1. Run OCR on the image and keep every text fragment of 3+ characters
         together with the centre of its bounding polygon.
      2. Cluster the fragments on their x centre (KMeans) to approximate the
         table columns.
      3. Pick the cluster holding the most digit-free text.
      4. Walk that cluster top to bottom, merging fragments whose y centres
         are within ``Y_THRESHOLD`` pixels of the previous fragment and
         skipping fragments that start with a known header keyword.
      5. Drop merged lines that are too short or mostly digits, keep the
         first nine and number them.

    Args:
        image: Picture of the table (anything ``np.array`` accepts, e.g. a
            PIL image), or ``None``.

    Returns:
        str: The numbered lines joined with newlines, or a French
        user-facing message explaining why nothing could be extracted.
    """
    if image is None:
        return "Aucune image fournie."

    img = np.array(image)
    result = ocr.predict(img)
    if not result:
        return "OCR exécuté mais aucun texte détecté."

    data = result[0]
    texts = data.get("rec_texts", [])
    # Per the PaddleOCR 3.x result schema, "rec_polys" is the polygon list
    # aligned one-to-one with "rec_texts"; "dt_polys" also contains detections
    # whose text was discarded, so zipping it with the texts can pair a text
    # with the wrong box. Fall back to "dt_polys" only if "rec_polys" is absent.
    boxes = data.get("rec_polys")
    if boxes is None:
        boxes = data.get("dt_polys", [])
    if not texts:
        return "Aucun texte exploitable détecté."

    # -----------------------------
    # 1. Collect OCR fragments as (x_center, y_center, text)
    # -----------------------------
    elements = []
    for text, box in zip(texts, boxes):
        text = text.strip()
        if len(text) < 3:
            continue
        x_center = np.mean([p[0] for p in box])
        y_center = np.mean([p[1] for p in box])
        elements.append((x_center, y_center, text))

    if len(elements) < 5:
        return "Pas assez de texte détecté."

    # -----------------------------
    # 2. Adaptive horizontal clustering
    # -----------------------------
    X = np.array([[e[0]] for e in elements])
    n_clusters = min(8, max(3, len(elements) // 8))
    # Never ask for more clusters than there are distinct x positions.
    n_clusters = min(n_clusters, len(np.unique(X)))

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X)

    columns = {}
    for element, label in zip(elements, labels):
        columns.setdefault(label, []).append(element)

    # -----------------------------
    # 3. Choose the "Description" column:
    #    the one with the most characters in digit-free fragments
    # -----------------------------
    def column_score(col):
        return sum(
            len(t) for _, _, t in col
            if not any(char.isdigit() for char in t)
        )

    best_column = max(columns.values(), key=column_score)

    # Top-to-bottom order
    best_column.sort(key=lambda e: e[1])

    # -----------------------------
    # 4. Merge fragments belonging to the same visual line
    # -----------------------------
    merged_lines = []
    current_text = ""
    last_y = None
    Y_THRESHOLD = 22

    blacklist = (
        "DESIGNATION", "UNITE", "QUANT", "PRIX", "TOTAL",
        "LOT", "BORDEREAU", "DATE", "NB", "TTC", "HT"
    )

    for _, y, text in best_column:
        # Compare without accents so "Désignation" / "Unité" headers are
        # recognised as well as their unaccented spellings.
        if _strip_accents(text).upper().startswith(blacklist):
            continue

        if last_y is None or abs(y - last_y) > Y_THRESHOLD:
            if current_text:
                merged_lines.append(current_text.strip())
            current_text = text
        else:
            current_text += " " + text

        last_y = y

    if current_text:
        merged_lines.append(current_text.strip())

    # -----------------------------
    # 5. Final clean-up: drop very short or mostly numeric lines
    # -----------------------------
    cleaned = [
        line for line in merged_lines
        if len(line) >= 5
        and sum(c.isdigit() for c in line) <= len(line) / 2
    ]

    final_lines = cleaned[:9]
    if not final_lines:
        return "Colonne détectée mais contenu non exploitable."

    # Numbered output, one entry per line
    return "\n".join(f"{i}. {line}" for i, line in enumerate(final_lines, start=1))
+# -----------------------------
 # Interface Gradio
+# -----------------------------
 demo = gr.Interface(
+    fn=extract_column2_9_lines,
     inputs=gr.Image(type="pil", label="Image du tableau"),
+    outputs=gr.Textbox(label="Colonne Description (9 lignes)"),
+    title="Extraction robuste de la colonne Description",
+    description="Optimisé pour tableaux photographiés (devis, factures, bordereaux)"
 )
+demo.launch(server_name="0.0.0.0", server_port=7860)