Spaces:

kebson
/

paddleocr-table-extraction

Runtime error

App Files Files Community

kebson committed on Dec 17, 2025

Commit

00c654c

verified ·

1 Parent(s): 7d75986

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -30

app.py CHANGED Viewed

@@ -2,63 +2,85 @@ import os
 import cv2
 import pandas as pd
 from paddleocr import PaddleOCR
# OCR initialisation (CPU)
ocr = PaddleOCR(use_angle_cls=True, lang="fr")


def extract_second_column(image_path):
    """Return the text of every line the OCR engine recognises in an image.

    Despite its name, this version does not isolate the second table
    column: it returns all recognised text lines, in the order the engine
    reports them.

    Args:
        image_path: Path of the image handed to ``ocr.ocr``.

    Returns:
        A list of recognised strings; empty when the engine reports no
        result for the image.
    """
    result = ocr.ocr(image_path, cls=True)
    # The first page entry can be empty/None when nothing is detected;
    # iterating over it would raise a TypeError.
    if not result or not result[0]:
        return []
    # Each entry is indexed as entry[1][0] to reach its text.
    return [line[1][0] for line in result[0]]
def main():
    """Run OCR on every image in ``images/`` and save the results as CSV.

    Each recognised value becomes one row with the columns ``image`` (file
    name) and ``colonne_2`` (recognised text). The CSV is written into the
    persistent ``/data`` directory that this function creates.

    Raises:
        FileNotFoundError: If the ``images`` folder does not exist.
    """
    images_dir = "images"
    # Hugging Face persistent storage directory
    output_dir = "/data"
    os.makedirs(output_dir, exist_ok=True)

    if not os.path.exists(images_dir):
        raise FileNotFoundError(
            f"Le dossier '{images_dir}' est introuvable. "
            "Vérifiez qu'il est bien copié dans le conteneur Docker."
        )

    all_results = []
    for filename in sorted(os.listdir(images_dir)):
        if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
            continue
        image_path = os.path.join(images_dir, filename)
        all_results.extend(
            {"image": filename, "colonne_2": value}
            for value in extract_second_column(image_path)
        )

    # Explicit columns keep the CSV header even when nothing was extracted.
    df = pd.DataFrame(all_results, columns=["image", "colonne_2"])
    # Write into the persistent directory created above; previously the
    # file went to the working directory and /data stayed unused.
    output_path = os.path.join(output_dir, "resultats_colonne_2.csv")
    df.to_csv(output_path, index=False)

    print(f"✅ Extraction terminée:{output_path}")
    print("Fichiers presents :", os.listdir(output_dir))


if __name__ == "__main__":
    main()

 import cv2
 import pandas as pd
 from paddleocr import PaddleOCR
+import gradio as gr
# OCR initialisation (done once, at import time)
# NOTE(review): the `show_log` constructor argument and the `cls=` keyword
# used below may not be accepted by every PaddleOCR release - confirm
# against the version installed in the Space.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang="fr",
    show_log=False
)


def _group_boxes_by_band(boxes, column_width):
    """Group OCR boxes into vertical bands ``column_width`` pixels wide."""
    bands = {}
    # Visit boxes from left to right (by their smallest x coordinate) so
    # that boxes inside a band keep that order for later stable sorts.
    for box in sorted(boxes, key=lambda b: min(pt[0] for pt in b[0])):
        xs = [pt[0] for pt in box[0]]
        x_center = sum(xs) / len(xs)
        bands.setdefault(int(x_center // column_width), []).append(box)
    return bands


def extract_second_column(image_path, column_width=100):
    """Extract the text of a table's second column (X-position heuristic).

    Every detected box is assigned to a vertical band according to the
    mean x coordinate of its corner points. Bands are ordered from left to
    right, and the second occupied band is treated as the second column.

    Args:
        image_path: Path of the image handed to ``ocr.ocr``.
        column_width: Width in pixels of one band. Defaults to 100, the
            value previously hard-coded.

    Returns:
        The texts of the second band ordered from top to bottom, or an
        empty list when nothing is detected or fewer than two bands are
        occupied.

    Raises:
        ValueError: If ``column_width`` is not strictly positive.
    """
    if column_width <= 0:
        raise ValueError("column_width must be strictly positive")

    result = ocr.ocr(image_path, cls=True)
    if not result or not result[0]:
        return []

    bands = _group_boxes_by_band(result[0], column_width)
    # At least two occupied bands are needed to have a second column.
    if len(bands) < 2:
        return []

    second_band = bands[sorted(bands)[1]]
    # Order top to bottom using each box's smallest y coordinate.
    top_to_bottom = sorted(
        second_band,
        key=lambda b: min(pt[1] for pt in b[0]),
    )
    return [box[1][0] for box in top_to_bottom]
def main(images_dir="images", output_dir="/data"):
    """Run OCR on every image of a folder and write the results to a CSV.

    Files ending in ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive) are
    processed in sorted name order. Each value returned by
    ``extract_second_column`` becomes one row with the columns ``image``
    (file name) and ``colonne_2`` (extracted text).

    Args:
        images_dir: Folder containing the images. Defaults to ``"images"``.
        output_dir: Folder receiving the CSV; created if missing.
            Defaults to ``"/data"``.

    Returns:
        The path of the written ``resultats_colonne_2.csv`` file.

    Raises:
        FileNotFoundError: If ``images_dir`` does not exist.
    """
    # Fail early with an actionable message instead of the bare error
    # os.listdir would raise.
    if not os.path.isdir(images_dir):
        raise FileNotFoundError(
            f"Le dossier '{images_dir}' est introuvable. "
            "Vérifiez qu'il est bien copié dans le conteneur Docker."
        )
    os.makedirs(output_dir, exist_ok=True)

    all_results = []
    for filename in sorted(os.listdir(images_dir)):
        if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
            continue
        image_path = os.path.join(images_dir, filename)
        all_results.extend(
            {"image": filename, "colonne_2": val}
            for val in extract_second_column(image_path)
        )

    # Explicit columns keep the CSV header even when nothing was extracted.
    df = pd.DataFrame(all_results, columns=["image", "colonne_2"])
    output_path = os.path.join(output_dir, "resultats_colonne_2.csv")
    df.to_csv(output_path, index=False)
    return output_path
# Gradio interface: a single button runs main() and offers the resulting
# CSV file for download.
gr.Interface(
    fn=main,
    inputs=[],
    outputs=gr.File(label="Télécharger le fichier CSV"),
    title="Extraction OCR – Colonne 2 des tableaux",
    description="Cliquez sur le bouton pour lancer l'OCR et télécharger le CSV."
).launch(
    # main() returns a file stored under /data, which is outside the
    # working and temp directories; Gradio only serves such files when
    # their folder is explicitly allowed.
    allowed_paths=["/data"],
)