Spaces:

kebson
/

paddleocr-table-extraction

Runtime error

kebson committed on Dec 17, 2025

Commit

e981ea9

verified ·

1 Parent(s): 254a470

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,74 +1,46 @@
 import os
-import numpy as np
 import pandas as pd
 from paddleocr import PaddleOCR
-IMAGE_DIR = "images"
-OUTPUT_FILE = "resultats_colonne_2.csv"
-ocr = PaddleOCR(lang="fr", use_angle_cls=True)
 def extract_second_column(image_path):
-result = ocr.ocr(image_path, cls=True)
-if result is None or len(result[0]) == 0:
-return []
-elements = []
-for line in result[0]:
-box = line[0]
-text = line[1][0]
-x = sum(p[0] for p in box) / 4
-y = sum(p[1] for p in box) / 4
-elements.append((x, y, text))
-elements.sort(key=lambda e: e[1])
-rows = []
-current = []
-threshold = 25
-for e in elements:
-if not current:
-current.append(e)
-elif abs(e[1] - current[-1][1]) < threshold:
-current.append(e)
-else:
-rows.append(current)
-current = [e]
-rows.append(current)
-col2 = []
-for row in rows:
-row_sorted = sorted(row, key=lambda e: e[0])
-if len(row_sorted) >= 2:
-col2.append(row_sorted[1][2])
-return col2
 def main():
-data = []
-if not os.path.exists(IMAGE_DIR):
-print("❌ Dossier 'images' introuvable")
-return
-for img in os.listdir(IMAGE_DIR):
-if img.lower().endswith(".jpeg"):
-values = extract_second_column(os.path.join(IMAGE_DIR, img))
-for v in values:
-data.append({
-"image": img,
-"colonne_2": v
-})
-df = pd.DataFrame(data)
-df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
-print("✅ Extraction terminée")
-print(df.head())
 if __name__ == "__main__":
-main()

 import os
+import cv2
 import pandas as pd
 from paddleocr import PaddleOCR
+# Shared PaddleOCR engine, built once at import time: French language model
+# with the text-angle classifier enabled.
+# NOTE(review): the original comment says this runs on CPU; no device is
+# selected explicitly here, so confirm against the PaddleOCR defaults.
+ocr = PaddleOCR(use_angle_cls=True, lang="fr")
 def extract_second_column(image_path):
+    """
+    Extrait le texte de la 2e colonne d'un tableau dans une image
+    """
+    result = ocr.ocr(image_path, cls=True)
+    column_2_text = []
+    for line in result[0]:
+        text = line[1][0]
+        column_2_text.append(text)
+    return column_2_text
 def main():
+    images_dir = "images"
+    all_results = []
+    for filename in sorted(os.listdir(images_dir)):
+        if filename.lower().endswith((".jpg", ".jpeg", ".png")):
+            image_path = os.path.join(images_dir, filename)
+            col2 = extract_second_column(image_path)
+            for value in col2:
+                all_results.append({
+                    "image": filename,
+                    "colonne_2": value
+                })
+    df = pd.DataFrame(all_results)
+    df.to_csv("resultats_colonne_2.csv", index=False)
+    print("✅ Extraction terminée")
 if __name__ == "__main__":
+    # Run the extraction only when executed as a script, not when imported.
+    main()