Spaces:

kebson
/

paddleocr-table-extraction

Runtime error

App Files Files Community

kebson committed on Dec 18, 2025

Commit

d8d4939

verified ·

1 Parent(s): 00c654c

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -37

app.py CHANGED Viewed

@@ -1,71 +1,62 @@
 import os
-import cv2
 import pandas as pd
 from paddleocr import PaddleOCR
 import gradio as gr
-# Initialisation OCR (une seule fois)
-ocr = PaddleOCR(
-    use_angle_cls=True,
-    lang="fr",
-    show_log=False
-)
-def extract_second_column(image_path):
-    """
-    Extrait le texte de la 2e colonne du tableau (approche par position X)
-    """
     result = ocr.ocr(image_path, cls=True)
     if not result or not result[0]:
         return []
     boxes = result[0]
-    # Trier par position horizontale (x)
-    boxes_sorted_x = sorted(boxes, key=lambda b: min(p[0] for p in b[0]))
-    # Regrouper en colonnes (heuristique)
     columns = {}
     for box in boxes_sorted_x:
-        x_coords = [p[0] for p in box[0]]
-        x_center = sum(x_coords) / len(x_coords)
-        columns.setdefault(int(x_center // 100), []).append(box)
-    # Trier les colonnes
-    sorted_cols = sorted(columns.items(), key=lambda x: x[0])
-    # Vérifier qu'il y a au moins 2 colonnes
-    if len(sorted_cols) < 2:
         return []
     second_col = sorted_cols[1][1]
-    # Trier verticalement
-    second_col_sorted = sorted(
         second_col,
         key=lambda b: min(p[1] for p in b[0])
     )
-    texts = [b[1][0] for b in second_col_sorted]
-    return texts
-def main():
     images_dir = "images"
     output_dir = "/data"
     os.makedirs(output_dir, exist_ok=True)
     all_results = []
     for filename in sorted(os.listdir(images_dir)):
         if filename.lower().endswith((".jpg", ".jpeg", ".png")):
             image_path = os.path.join(images_dir, filename)
-            col2_values = extract_second_column(image_path)
-            for val in col2_values:
                 all_results.append({
                     "image": filename,
-                    "colonne_2": val
                 })
     df = pd.DataFrame(all_results)
@@ -74,13 +65,14 @@ def main():
     return output_path
-# Interface Gradio
 gr.Interface(
-    fn=main,
     inputs=[],
-    outputs=gr.File(label="Télécharger le fichier CSV"),
-    title="Extraction OCR – Colonne 2 des tableaux",
-    description="Cliquez sur le bouton pour lancer l'OCR et télécharger le CSV."
 ).launch()

 import os
 import pandas as pd
 from paddleocr import PaddleOCR
 import gradio as gr
def extract_second_column(image_path, ocr, column_width=120):
    """Return the texts found in the second table column of an image.

    Detections are grouped into columns by bucketing the horizontal centre
    of each detection box into bands of ``column_width`` pixels. The second
    non-empty band (left to right) is treated as "column 2" and its texts
    are returned from top to bottom.

    Args:
        image_path: Path of the image, passed as-is to ``ocr.ocr``.
        ocr: Object exposing ``ocr(image_path, cls=True)``. Its result is
            indexed as ``result[0]`` -> list of detections, where each
            detection is ``[points, (text, ...)]`` and ``points`` is a
            sequence of ``(x, y)`` pairs.
        column_width: Width in pixels of one column band. Defaults to 120,
            the value previously hard-coded.

    Returns:
        List of text strings of the second column, ordered by the top edge
        of their box (ties broken by left edge). Empty list when nothing is
        detected or fewer than two column bands are populated.

    Raises:
        ValueError: If ``column_width`` is not strictly positive.
    """
    if column_width <= 0:
        raise ValueError("column_width must be positive")

    result = ocr.ocr(image_path, cls=True)
    if not result or not result[0]:
        return []

    columns = {}
    for box in result[0]:
        points = box[0]
        # Average over the actual number of points rather than assuming a
        # quadrilateral, so boxes with another point count are still
        # bucketed by their true centre.
        x_center = sum(p[0] for p in points) / len(points)
        columns.setdefault(int(x_center // column_width), []).append(box)

    if len(columns) < 2:
        return []

    # Band indices sorted left to right; index 1 is the second column.
    second_col = columns[sorted(columns)[1]]

    # Top-to-bottom order; the left edge is the tie-break, which reproduces
    # the ordering the former "sort by x, then stable sort by y" produced.
    second_col.sort(
        key=lambda b: (min(p[1] for p in b[0]), min(p[0] for p in b[0]))
    )
    return [b[1][0] for b in second_col]
+def run_ocr():
     # Runs OCR on every .jpg/.jpeg/.png file found in the "images" directory,
     # collects the values returned by extract_second_column for each file, and
     # returns output_path.
     # NOTE(review): the lines between the DataFrame creation and the return
     # (where output_path is assigned) are elided in this diff view; presumably
     # the DataFrame is written under output_dir there - confirm in app.py.
     images_dir = "images"
     output_dir = "/data"
     # Create the output directory if it does not exist yet.
     os.makedirs(output_dir, exist_ok=True)
+    # ⚠️ OCR INITIALIZED HERE (NOT AT STARTUP)
     # A new PaddleOCR instance is therefore built on every call of run_ocr.
+    ocr = PaddleOCR(
+        use_angle_cls=True,
+        lang="fr",
+        show_log=False,
+        cpu_threads=1
+    )
     all_results = []
     # sorted() makes the processing order of the files deterministic.
     for filename in sorted(os.listdir(images_dir)):
         if filename.lower().endswith((".jpg", ".jpeg", ".png")):
             image_path = os.path.join(images_dir, filename)
+            values = extract_second_column(image_path, ocr)
             # One result row per extracted value, tagged with its source file.
+            for v in values:
                 all_results.append({
                     "image": filename,
+                    "colonne_2": v
                 })
     df = pd.DataFrame(all_results)
     return output_path
 # Gradio interface with no input components: run_ocr is the handler and its
 # return value is exposed through a File output component. launch() is called
 # at module level, so the app starts as soon as this file is executed.
 gr.Interface(
+    fn=run_ocr,
     inputs=[],
+    outputs=gr.File(label="Télécharger le CSV"),
+    title="Extraction OCR – Colonne 2",
+    description="Cliquez pour lancer l'OCR (première exécution plus lente)"
 ).launch()