Spaces:

kebson
/

paddleocr-table-extraction

Runtime error

App Files Community

kebson committed on Dec 18, 2025

Commit

1d03c47

verified ·

1 Parent(s): 8931404

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -75

app.py CHANGED Viewed

@@ -1,109 +1,102 @@
 import os
-import re
-import gradio as gr
 import pandas as pd
-from paddleocr import PaddleOCR, PPStructure
-# =========================
-# Initialisation OCR
-# =========================
-ocr = PaddleOCR(use_angle_cls=True, lang="fr")
-table_engine = PPStructure(show_log=False)
-# =========================
-# Fonction principale
-# =========================
-def process_images(images):
-    all_rows = []
-    for img in images:
-        image_name = os.path.basename(img)
-        # Analyse de la structure du document
-        result = table_engine(img)
-        for block in result:
-            if block["type"] == "table":
-                html = block["res"]["html"]
-                # Lire le tableau HTML avec pandas
-                try:
-                    tables = pd.read_html(html)
-                except:
-                    continue
-                for table in tables:
-                    if table.shape[1] < 2:
-                        continue
-                    # Colonne 2 (index 1)
-                    col2 = table.iloc[:, 1]
-                    for cell in col2:
-                        if pd.isna(cell):
-                            continue
-                        text = str(cell).strip()
-                        # 🔴 FILTRE : on garde seulement les textes
-                        # (au moins une lettre)
-                        if not re.search(r"[A-Za-zÀ-ÿ]", text):
-                            continue
-                        all_rows.append({
-                            "image": image_name,
-                            "colonne_2": text
-                        })
-    # =========================
-    # Sauvegarde CSV
-    # =========================
-    df = pd.DataFrame(all_rows)
-    output_csv = "/app/resultats_colonne_2_textes.csv"
-    df.to_csv(output_csv, index=False, encoding="utf-8")
-    # =========================
-    # Sauvegarde TXT (Bloc-notes)
-    # =========================
-    output_txt = "/app/resultats_colonne_2.txt"
-    with open(output_txt, "w", encoding="utf-8") as f:
-        current_image = None
-        for row in all_rows:
-            if row["image"] != current_image:
-                current_image = row["image"]
-                f.write(f"\n===== {current_image} =====\n")
-            f.write(row["colonne_2"] + "\n")
-    return df, output_csv, output_txt
-# =========================
 # Interface Gradio
-# =========================
-with gr.Blocks(title="Extraction OCR – Colonne 2 (Textes)") as demo:
-    gr.Markdown("## 📄 Extraction OCR – 2ᵉ colonne (textes uniquement)")
-    images = gr.File(
         file_types=[".jpg", ".jpeg", ".png"],
-        file_count="multiple",
-        label="📤 Importer les images"
     )
-    btn = gr.Button("🚀 Générer")
-    table_output = gr.Dataframe(label="📊 Résultat (aperçu)")
-    csv_output = gr.File(label="⬇️ Télécharger CSV")
-    txt_output = gr.File(label="⬇️ Télécharger TXT")
     btn.click(
-        fn=process_images,
-        inputs=images,
         outputs=[table_output, csv_output, txt_output]
     )
 demo.launch()

 import os
+import cv2
 import pandas as pd
+import gradio as gr
+from paddleocr import PaddleOCR
+# Initialisation OCR (table + français)
+ocr = PaddleOCR(
+    use_angle_cls=True,
+    lang="fr",
+    show_log=False,
+    use_gpu=False
+)
+def extract_second_column(images):
+    results_col2 = []
+    for image_file in images:
+        image_path = image_file.name
+        ocr_result = ocr.ocr(image_path, cls=True)
+        if not ocr_result or not ocr_result[0]:
+            continue
+        # On trie les cellules par position verticale puis horizontale
+        cells = []
+        for line in ocr_result[0]:
+            box = line[0]
+            text = line[1][0]
+            x = box[0][0]
+            y = box[0][1]
+            cells.append((y, x, text))
+        cells.sort(key=lambda x: (x[0], x[1]))
+        # Reconstruction lignes
+        rows = []
+        current_row = []
+        last_y = None
+        threshold = 20
+        for y, x, text in cells:
+            if last_y is None or abs(y - last_y) < threshold:
+                current_row.append((x, text))
+            else:
+                rows.append(sorted(current_row))
+                current_row = [(x, text)]
+            last_y = y
+        if current_row:
+            rows.append(sorted(current_row))
+        # Extraction colonne 2 (index 1)
+        for row in rows:
+            if len(row) >= 2:
+                results_col2.append(row[1][1])
+    # Création des fichiers de sortie
+    os.makedirs("output", exist_ok=True)
+    df = pd.DataFrame({"colonne_2": results_col2})
+    csv_path = "output/resultats_colonne_2.csv"
+    txt_path = "output/resultats_colonne_2.txt"
+    df.to_csv(csv_path, index=False, encoding="utf-8")
+    with open(txt_path, "w", encoding="utf-8") as f:
+        for item in results_col2:
+            f.write(item + "\n")
+    return df, csv_path, txt_path
 # Interface Gradio
+with gr.Blocks(title="Extraction OCR – Colonne 2") as demo:
+    gr.Markdown("## 📄 Extraction OCR – Deuxième colonne des tableaux")
+    images_input = gr.File(
+        label="Téléverser les images (JPEG/PNG)",
         file_types=[".jpg", ".jpeg", ".png"],
+        file_count="multiple"
     )
+    btn = gr.Button("Extraire la colonne 2")
+    table_output = gr.Dataframe(label="Résultat – Colonne 2")
+    csv_output = gr.File(label="Télécharger CSV")
+    txt_output = gr.File(label="Télécharger TXT")
     btn.click(
+        fn=extract_second_column,
+        inputs=images_input,
         outputs=[table_output, csv_output, txt_output]
     )
 demo.launch()