kebson committed on
Commit
a3d2b53
·
verified ·
1 Parent(s): 0f50d7c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -0
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ from paddleocr import PaddleOCR
5
+
6
# Directory that main() scans for input images.
+ IMAGE_DIR = "images"
7
# Path of the CSV file that main() writes the extracted values to.
+ OUTPUT_FILE = "resultats_colonne_2.csv"
8
+
9
# Module-level OCR engine shared by every extract_second_column() call; it is
# constructed once at import time with lang="fr" and use_angle_cls=True.
+ ocr = PaddleOCR(lang="fr", use_angle_cls=True)
10
+
11
def extract_second_column(image_path, threshold=25):
    """Return the second-from-left text of every visual row found in an image.

    The image is passed to the module-level ``ocr`` engine. Each detected
    element is reduced to the centroid of its bounding polygon, elements are
    grouped into rows by vertical proximity, and within each row the element
    with the second-smallest x centroid is kept.

    Args:
        image_path: Path of the image handed to ``ocr.ocr``.
        threshold: Maximum vertical distance (in the units of the OCR box
            coordinates) between two consecutive elements, sorted by y, for
            them to be considered part of the same row. Defaults to 25, the
            value that used to be hard-coded.

    Returns:
        A list of strings, one per row that contains at least two elements,
        ordered from top to bottom. Empty when no text was detected.
    """
    result = ocr.ocr(image_path, cls=True)

    # Depending on the PaddleOCR release, an image without any detected text
    # can come back as None, [], [None] or [[]]. The former
    # ``len(result[0]) == 0`` check raised on [] (IndexError) and on [None]
    # (TypeError), so every empty shape is handled by truthiness instead.
    if not result or not result[0]:
        return []

    # Each OCR line is (box, (text, ...)): line[0] is the polygon, line[1][0]
    # the recognised text.
    elements = []
    for line in result[0]:
        x, y = _box_center(line[0])
        elements.append((x, y, line[1][0]))

    # Top-to-bottom order is required by the gap-based row grouping.
    elements.sort(key=lambda e: e[1])

    col2 = []
    for row in _group_rows(elements, threshold):
        # Left-to-right order inside the row; index 1 is the second column.
        row_sorted = sorted(row, key=lambda e: e[0])
        if len(row_sorted) >= 2:
            col2.append(row_sorted[1][2])

    return col2


def _box_center(box):
    """Return the (x, y) centroid of a polygon given as a sequence of points."""
    count = len(box)
    return sum(p[0] for p in box) / count, sum(p[1] for p in box) / count


def _group_rows(elements, threshold):
    """Split y-sorted (x, y, text) tuples into rows at vertical gaps >= threshold."""
    rows = []
    for element in elements:
        # Compare against the previously placed element (not the row's first
        # one), so a row may drift vertically as long as each step is small.
        if rows and abs(element[1] - rows[-1][-1][1]) < threshold:
            rows[-1].append(element)
        else:
            rows.append([element])
    return rows
50
+
51
def main():
    """Extract the second column of every JPEG in IMAGE_DIR into OUTPUT_FILE.

    Each image is processed with ``extract_second_column``; every returned
    value becomes one CSV row with the columns ``image`` (file name) and
    ``colonne_2`` (extracted text). If IMAGE_DIR is not an existing
    directory, an error message is printed and nothing is written.
    """
    # isdir rather than exists: a regular file with that name would pass an
    # existence check and then make os.listdir raise.
    if not os.path.isdir(IMAGE_DIR):
        print(f"❌ Dossier '{IMAGE_DIR}' introuvable")
        return

    data = []
    # os.listdir returns names in arbitrary order; sorting keeps the CSV row
    # order reproducible from one run to the next.
    for img in sorted(os.listdir(IMAGE_DIR)):
        # JPEG files are commonly named either .jpeg or .jpg; accept both.
        if not img.lower().endswith((".jpeg", ".jpg")):
            continue
        values = extract_second_column(os.path.join(IMAGE_DIR, img))
        data.extend({"image": img, "colonne_2": v} for v in values)

    # Explicit columns so the CSV still gets its header row when no value was
    # extracted (an empty list alone yields a column-less DataFrame).
    df = pd.DataFrame(data, columns=["image", "colonne_2"])
    df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")

    print("✅ Extraction terminée")
    print(df.head())
72
+
73
# Run the extraction only when this file is executed as a script.
+ if __name__ == "__main__":
74
+ main()