Spaces:

kebson
/

paddleocr-table-extraction

Runtime error

App Files Files Community

kebson committed on Dec 18, 2025

Commit

35f4e3c

verified ·

1 Parent(s): 4385c86

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -46

app.py CHANGED Viewed

@@ -1,77 +1,128 @@
 import os
 import pandas as pd
-from paddleocr import PaddleOCR
 import gradio as gr
def extract_second_column(image_path, ocr, column_width=120):
    """Return the texts of the second column detected in an image.

    Detected boxes are grouped into vertical bands of ``column_width``
    pixels according to the horizontal center of each box. The second
    non-empty band (counting from the left) is treated as column 2 and
    its texts are returned from top to bottom.

    Args:
        image_path: Path of the image handed to ``ocr.ocr``.
        ocr: Object exposing ``ocr(image_path, cls=True)``. The code reads
            the first element of its result as a list of entries indexed
            as ``entry[0]`` (box points) and ``entry[1][0]`` (text).
        column_width: Width in pixels of one column band. Defaults to the
            previously hard-coded value of 120.

    Returns:
        A list of strings, empty when nothing is detected or when fewer
        than two bands contain text.

    Raises:
        ValueError: If ``column_width`` is not strictly positive.
    """
    if column_width <= 0:
        raise ValueError("column_width must be strictly positive")

    result = ocr.ocr(image_path, cls=True)
    if not result or not result[0]:
        return []

    # Pre-sort left to right: the vertical sort below is stable, so boxes
    # sharing the same top edge keep this left-to-right order.
    boxes = sorted(result[0], key=lambda b: min(p[0] for p in b[0]))

    columns = {}
    for box in boxes:
        points = box[0]
        # Average over the actual number of points instead of assuming 4.
        x_center = sum(p[0] for p in points) / len(points)
        columns.setdefault(int(x_center // column_width), []).append(box)

    if len(columns) < 2:
        return []

    second_key = sorted(columns)[1]
    second_col = sorted(
        columns[second_key],
        key=lambda b: min(p[1] for p in b[0]),
    )
    return [b[1][0] for b in second_col]
def run_ocr():
    """Run OCR on every image of ``images/`` and export column 2 as CSV.

    Files are visited in sorted name order and only ``.jpg``, ``.jpeg``
    and ``.png`` files (case-insensitive) are processed. One CSV row is
    written per extracted value, with the columns ``image`` and
    ``colonne_2``.

    Returns:
        The path of the written CSV file inside ``/data``.
    """
    images_dir = "images"
    output_dir = "/data"
    os.makedirs(output_dir, exist_ok=True)

    # The OCR engine is created here, on demand, and not at startup.
    engine = PaddleOCR(
        use_angle_cls=True,
        lang="fr",
        show_log=False,
        cpu_threads=1,
    )

    image_names = [
        name
        for name in sorted(os.listdir(images_dir))
        if name.lower().endswith((".jpg", ".jpeg", ".png"))
    ]

    rows = []
    for name in image_names:
        values = extract_second_column(os.path.join(images_dir, name), engine)
        rows.extend({"image": name, "colonne_2": value} for value in values)

    csv_path = os.path.join(output_dir, "resultats_colonne_2.csv")
    # Semicolon separator and a UTF-8 BOM, exactly as before.
    pd.DataFrame(rows).to_csv(
        csv_path,
        index=False,
        sep=";",
        encoding="utf-8-sig",
    )
    return csv_path
# Build the input-less interface around run_ocr, then serve it on all
# network interfaces on port 7860. "/data" is passed as an allowed path.
_interface = gr.Interface(
    fn=run_ocr,
    inputs=[],
    outputs=gr.File(label="Télécharger le CSV"),
    title="Extraction OCR – Colonne 2",
    description="Cliquez pour lancer l'OCR (première exécution plus lente)",
)
_interface.launch(
    server_name="0.0.0.0",
    server_port=7860,
    allowed_paths=["/data"],
)

 import os
+import re
 import pandas as pd
 import gradio as gr
+from paddleocr import PaddleOCR
# =============================
# OCR initialization
# =============================
# Module-level engine, created once when the module is imported.
# NOTE(review): `show_log` here and the `cls=True` argument passed to
# `ocr.ocr(...)` look like PaddleOCR 2.x API; confirm that the installed
# paddleocr version still accepts them.
ocr = PaddleOCR(use_angle_cls=True, lang="fr", show_log=False)
# =============================
# Utility functions
# =============================
def is_textual(text):
    """Return True when *text* contains at least one letter.

    Strings made only of digits, punctuation, symbols or whitespace
    (amounts, for instance) are rejected.

    The check relies on ``str.isalpha`` rather than on the character range
    ``À-ÿ``: that range also contains the signs ``×`` and ``÷``, which are
    not letters, and leaves out letters such as ``œ``.

    Args:
        text: The string to inspect.

    Returns:
        ``True`` if any character of *text* is alphabetic, else ``False``.
    """
    return any(char.isalpha() for char in text)
def extract_second_column_text(image_path, min_offset=50):
    """Extract the textual OCR entries located right of the first column.

    The leftmost detected box defines the first column. Every entry whose
    horizontal center lies more than ``min_offset`` pixels to the right of
    it is kept, provided ``is_textual`` accepts its text.

    Args:
        image_path: Path of the image handed to the module-level ``ocr``.
        min_offset: Minimum horizontal distance, in pixels, between the
            leftmost center and a kept entry. Defaults to the previously
            hard-coded value of 50.

    Returns:
        A list of stripped strings ordered by increasing horizontal
        center; empty when nothing is detected.
    """
    result = ocr.ocr(image_path, cls=True)
    if not result or not result[0]:
        return []

    # Collect (horizontal center, text) pairs for every detected entry.
    elements = []
    for line in result[0]:
        points = line[0]
        # Average over the actual number of points instead of assuming 4.
        x_center = sum(p[0] for p in points) / len(points)
        elements.append((x_center, line[1][0].strip()))

    # Stable sort from left to right; the first entry is the leftmost.
    # NOTE(review): the output keeps this left-to-right order, it is not
    # re-sorted top to bottom; confirm this is the intended row order.
    elements.sort(key=lambda element: element[0])
    left_edge = elements[0][0]

    return [
        text
        for x_center, text in elements
        if x_center > left_edge + min_offset and is_textual(text)
    ]
# =============================
# Main function
# =============================
def run_extraction():
    """Run the extraction on every image of ``images/`` and write a CSV.

    Files are visited in sorted name order and only ``.jpg``, ``.jpeg``
    and ``.png`` files (case-insensitive) are processed. One CSV row is
    written per extracted text, with the columns ``image`` and
    ``colonne_2``.

    Returns:
        A ``(status, path)`` tuple. ``status`` is the message to display;
        ``path`` is the CSV location, or ``None`` when ``images`` is not
        a directory or when no text was extracted.
    """
    images_dir = "images"
    # isdir, not exists: a regular file named "images" must be rejected
    # here, otherwise os.listdir below raises NotADirectoryError.
    if not os.path.isdir(images_dir):
        return "❌ Dossier 'images' introuvable", None

    image_names = [
        name
        for name in sorted(os.listdir(images_dir))
        if name.lower().endswith((".jpg", ".jpeg", ".png"))
    ]
    all_results = [
        {"image": name, "colonne_2": text}
        for name in image_names
        for text in extract_second_column_text(os.path.join(images_dir, name))
    ]
    if not all_results:
        return "⚠️ Aucun texte détecté", None

    output_path = "/tmp/resultats_colonne_2.csv"
    # Semicolon separator and a UTF-8 BOM, exactly as before.
    pd.DataFrame(all_results).to_csv(
        output_path,
        index=False,
        sep=";",
        encoding="utf-8-sig",
    )
    return "✅ Extraction terminée avec succès", output_path
# =============================
# Gradio interface
# =============================
# Components are created in display order: heading, button, status box,
# then the file output.
with gr.Blocks(title="Extraction OCR – Colonne 2") as demo:
    gr.Markdown("## 📄 Extraction de la 2ᵉ colonne (texte uniquement)")
    run_btn = gr.Button("🔍 Lancer l'extraction")
    status = gr.Textbox(label="Statut")
    file_out = gr.File(label="Télécharger le CSV")

    # run_extraction takes no input; its two return values are routed to
    # the status box and to the file component, in that order.
    run_btn.click(fn=run_extraction, outputs=[status, file_out])

demo.launch()