Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 22, 2025

Commit

d932601

verified ·

1 Parent(s): d539c06

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -21

app.py CHANGED Viewed

@@ -1,14 +1,25 @@
 import gradio as gr
 import numpy as np
 from paddleocr import PaddleOCR
-from sklearn.cluster import KMeans
 import re
 ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
 # -------------------------------------------------
-# OUTILS TEXTE
 # -------------------------------------------------
 def is_continuation(text):
     t = text.lower().strip()
     return (
@@ -16,17 +27,19 @@ def is_continuation(text):
         or t.startswith("avec ")
         or t.startswith("y compris")
         or t.startswith("compr")
     )
 def has_too_many_digits(text):
     return sum(c.isdigit() for c in text) > len(text) * 0.4
 def looks_like_designation(text):
-    if len(text) < 10:
         return False
     if has_too_many_digits(text):
         return False
-    if re.match(r"^(m2|m3|ml|u|ff)\b", text.lower()):
         return False
     return True
@@ -38,35 +51,45 @@ def extract_designations(image):
         return "Aucune image fournie."
     img = np.array(image)
-    result = ocr.predict(img)
-    data = result[0]
-    texts = data["rec_texts"]
-    boxes = data["dt_polys"]
     lines = []
     for text, box in zip(texts, boxes):
-        text = text.strip()
         y = np.mean([p[1] for p in box])
-        lines.append((y, text))
-    # Tri vertical
     lines.sort(key=lambda x: x[0])
-    # Suppression en-tête
-    filtered = []
-    for y, text in lines:
-        if text.upper().strip() == "DESIGNATIONS":
             continue
-        filtered.append(text)
     # -------------------------------------------------
-    # FUSION INTELLIGENTE
     # -------------------------------------------------
     cells = []
     current = ""
-    for text in filtered:
         if not looks_like_designation(text):
             continue
@@ -76,7 +99,7 @@ def extract_designations(image):
         if is_continuation(text):
             current += " " + text
-        elif text[0].isupper() and len(text) > 20:
             cells.append(current.strip())
             current = text
         else:
@@ -85,6 +108,7 @@ def extract_designations(image):
     if current:
         cells.append(current.strip())
     cells = cells[:9]
     if not cells:
@@ -99,8 +123,8 @@ demo = gr.Interface(
     fn=extract_designations,
     inputs=gr.Image(type="pil", label="Image du tableau"),
     outputs=gr.Textbox(label="Colonne DESIGNATIONS"),
-    title="Extraction fiable de la colonne DESIGNATIONS",
-    description="Approche textuelle robuste pour devis et bordereaux"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 import numpy as np
 from paddleocr import PaddleOCR
 import re
 ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
 # -------------------------------------------------
+# FILTRES MÉTIER
 # -------------------------------------------------
+def is_title(text):
+    t = text.upper()
+    keywords = [
+        "CADRE DE DEVIS",
+        "LOT",
+        "AXE",
+        "PRIX TOTAL",
+        "TVA",
+        "TTC"
+    ]
+    return any(k in t for k in keywords)
 def is_continuation(text):
     t = text.lower().strip()
     return (
         or t.startswith("avec ")
         or t.startswith("y compris")
         or t.startswith("compr")
+        or t.startswith("pour ")
+        or t.startswith("épaisseur")
     )
 def has_too_many_digits(text):
     return sum(c.isdigit() for c in text) > len(text) * 0.4
 def looks_like_designation(text):
+    if len(text) < 8:
         return False
     if has_too_many_digits(text):
         return False
+    if re.match(r"^(m2|m3|ml|ff|u)\b", text.lower()):
         return False
     return True
         return "Aucune image fournie."
     img = np.array(image)
+    result = ocr.predict(img)[0]
+    texts = result["rec_texts"]
+    boxes = result["dt_polys"]
     lines = []
     for text, box in zip(texts, boxes):
         y = np.mean([p[1] for p in box])
+        lines.append((y, text.strip()))
+    # tri vertical
     lines.sort(key=lambda x: x[0])
+    # -------------------------------------------------
+    # ON COMMENCE APRÈS "DESIGNATIONS"
+    # -------------------------------------------------
+    started = False
+    cleaned = []
+    for _, text in lines:
+        if text.upper() == "DESIGNATIONS":
+            started = True
             continue
+        if not started:
+            continue
+        if is_title(text):
+            continue
+        cleaned.append(text)
     # -------------------------------------------------
+    # RECONSTRUCTION DES CELLULES
     # -------------------------------------------------
     cells = []
     current = ""
+    for text in cleaned:
         if not looks_like_designation(text):
             continue
         if is_continuation(text):
             current += " " + text
+        elif text[0].isupper():
             cells.append(current.strip())
             current = text
         else:
     if current:
         cells.append(current.strip())
+    # Limite à 9 lignes (LOT 1)
     cells = cells[:9]
     if not cells:
     fn=extract_designations,
     inputs=gr.Image(type="pil", label="Image du tableau"),
     outputs=gr.Textbox(label="Colonne DESIGNATIONS"),
+    title="Extraction fiable de la colonne DESIGNATIONS (V3)",
+    description="Filtrage métier + reconstruction intelligente des cellules"
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)