Spaces:

kebson
/

paddleocr-table-extraction

Runtime error

kebson committed on Dec 17, 2025

Commit

a3d2b53

verified ·

1 Parent(s): 0f50d7c

Create app.py

Files changed (1) hide show

app.py ADDED Viewed

import os

import numpy as np
import pandas as pd
from paddleocr import PaddleOCR
# Directory scanned for input images (relative path, resolved against the
# current working directory).
IMAGE_DIR = "images"
# CSV file that receives the extracted second-column values.
OUTPUT_FILE = "resultats_colonne_2.csv"

# Single OCR engine shared by every call; it is constructed once, when the
# module is loaded. lang="fr" selects the French models and
# use_angle_cls=True asks PaddleOCR for its text-angle classifier.
# NOTE(review): `use_angle_cls` is the PaddleOCR 2.x spelling of this option;
# confirm it is still accepted by the installed PaddleOCR release.
ocr = PaddleOCR(lang="fr", use_angle_cls=True)
def extract_second_column(image_path, threshold=25, engine=None):
    """Return the second-from-left text of every row detected in an image.

    The image is run through OCR, each recognised text box is reduced to the
    centre of its bounding polygon, the boxes are grouped into rows by
    vertical proximity, and for every row holding at least two boxes the
    text of the second box from the left is collected.

    Args:
        image_path: Image to analyse; passed unchanged to ``engine.ocr``.
        threshold: Maximum vertical distance between the centres of two
            consecutive boxes (in OCR coordinate units, presumably pixels)
            for them to belong to the same row.
        engine: Object exposing ``ocr(image_path, cls=True)``. Defaults to
            the module-level ``ocr`` instance.

    Returns:
        A list of strings ordered from the top row to the bottom row; empty
        when the OCR engine reports no text.
    """
    if engine is None:
        engine = ocr
    # NOTE(review): `cls=True` matches the PaddleOCR 2.x `ocr()` signature;
    # confirm the installed release still accepts this keyword.
    result = engine.ocr(image_path, cls=True)

    # Both the outer result and its first page can be None or empty when
    # nothing is detected; check each level before indexing / calling len().
    if result is None or len(result) == 0:
        return []
    page = result[0]
    if page is None or len(page) == 0:
        return []

    # Reduce every detection to (centre_x, centre_y, text).
    elements = []
    for line in page:
        box = line[0]
        text = line[1][0]
        corners = len(box)
        x = sum(p[0] for p in box) / corners
        y = sum(p[1] for p in box) / corners
        elements.append((x, y, text))
    elements.sort(key=lambda e: e[1])

    # Walk top to bottom; a box joins the current row when it is vertically
    # close to the previous box of that row, otherwise it opens a new row.
    rows = []
    current = []
    for e in elements:
        same_row = not current or abs(e[1] - current[-1][1]) < threshold
        if not same_row:
            rows.append(current)
            current = []
        current.append(e)
    if current:
        rows.append(current)

    # Within each row order boxes left to right and keep the second one.
    col2 = []
    for row in rows:
        row_sorted = sorted(row, key=lambda e: e[0])
        if len(row_sorted) >= 2:
            col2.append(row_sorted[1][2])
    return col2
def main(image_dir=None, output_file=None, extensions=(".jpeg",)):
    """Extract the second column of every image in a folder into a CSV file.

    Each matching image is passed to ``extract_second_column``; every value
    it returns becomes one CSV row with the columns ``image`` (file name) and
    ``colonne_2`` (extracted text).

    Args:
        image_dir: Folder to scan. Defaults to the module-level ``IMAGE_DIR``.
        output_file: CSV path to write. Defaults to the module-level
            ``OUTPUT_FILE``.
        extensions: File-name suffix, or iterable of suffixes, to process
            (compared case-insensitively). Only ``.jpeg`` by default; pass
            e.g. ``(".jpeg", ".jpg", ".png")`` to include other formats.

    Returns:
        None. When ``image_dir`` is not an existing directory a message is
        printed and no file is written.
    """
    if image_dir is None:
        image_dir = IMAGE_DIR
    if output_file is None:
        output_file = OUTPUT_FILE
    # str.endswith needs a str or a tuple of str; normalise either form.
    if isinstance(extensions, str):
        extensions = (extensions,)
    extensions = tuple(ext.lower() for ext in extensions)

    # isdir rather than exists: a regular file with that name would
    # otherwise make os.listdir raise.
    if not os.path.isdir(image_dir):
        print(f"❌ Dossier '{image_dir}' introuvable")
        return

    data = []
    # os.listdir returns entries in arbitrary order; sort so the CSV rows
    # come out in the same order on every run.
    for img in sorted(os.listdir(image_dir)):
        if not img.lower().endswith(extensions):
            continue
        values = extract_second_column(os.path.join(image_dir, img))
        data.extend({"image": img, "colonne_2": v} for v in values)

    # Explicit columns keep the CSV header present even when nothing was
    # extracted.
    df = pd.DataFrame(data, columns=["image", "colonne_2"])
    df.to_csv(output_file, index=False, encoding="utf-8")
    print("✅ Extraction terminée")
    print(df.head())


if __name__ == "__main__":
    main()