kebson committed on
Commit
e981ea9
·
verified ·
1 Parent(s): 254a470

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -56
app.py CHANGED
@@ -1,74 +1,46 @@
1
  import os
2
- import numpy as np
3
  import pandas as pd
4
  from paddleocr import PaddleOCR
5
 
6
- IMAGE_DIR = "images"
7
- OUTPUT_FILE = "resultats_colonne_2.csv"
8
-
9
- ocr = PaddleOCR(lang="fr", use_angle_cls=True)
10
 
11
  def extract_second_column(image_path):
12
- result = ocr.ocr(image_path, cls=True)
13
-
14
- if result is None or len(result[0]) == 0:
15
- return []
16
-
17
- elements = []
18
- for line in result[0]:
19
- box = line[0]
20
- text = line[1][0]
21
-
22
- x = sum(p[0] for p in box) / 4
23
- y = sum(p[1] for p in box) / 4
24
 
25
- elements.append((x, y, text))
26
 
27
- elements.sort(key=lambda e: e[1])
 
 
28
 
29
- rows = []
30
- current = []
31
- threshold = 25
32
 
33
- for e in elements:
34
- if not current:
35
- current.append(e)
36
- elif abs(e[1] - current[-1][1]) < threshold:
37
- current.append(e)
38
- else:
39
- rows.append(current)
40
- current = [e]
41
- rows.append(current)
42
-
43
- col2 = []
44
- for row in rows:
45
- row_sorted = sorted(row, key=lambda e: e[0])
46
- if len(row_sorted) >= 2:
47
- col2.append(row_sorted[1][2])
48
-
49
- return col2
50
 
51
  def main():
52
- data = []
 
53
 
54
- if not os.path.exists(IMAGE_DIR):
55
- print(" Dossier 'images' introuvable")
56
- return
 
57
 
58
- for img in os.listdir(IMAGE_DIR):
59
- if img.lower().endswith(".jpeg"):
60
- values = extract_second_column(os.path.join(IMAGE_DIR, img))
61
- for v in values:
62
- data.append({
63
- "image": img,
64
- "colonne_2": v
65
- })
66
 
67
- df = pd.DataFrame(data)
68
- df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
 
69
 
70
- print("✅ Extraction terminée")
71
- print(df.head())
72
 
73
  if __name__ == "__main__":
74
- main()
 
 
1
  import os
2
+ import cv2
3
  import pandas as pd
4
  from paddleocr import PaddleOCR
5
 
6
+ # Initialisation OCR (CPU)
7
+ ocr = PaddleOCR(use_angle_cls=True, lang="fr")
 
 
8
 
9
  def extract_second_column(image_path):
10
+ """
11
+ Extrait le texte de la 2e colonne d'un tableau dans une image
12
+ """
13
+ result = ocr.ocr(image_path, cls=True)
 
 
 
 
 
 
 
 
14
 
15
+ column_2_text = []
16
 
17
+ for line in result[0]:
18
+ text = line[1][0]
19
+ column_2_text.append(text)
20
 
21
+ return column_2_text
 
 
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  def main():
25
+ images_dir = "images"
26
+ all_results = []
27
 
28
+ for filename in sorted(os.listdir(images_dir)):
29
+ if filename.lower().endswith((".jpg", ".jpeg", ".png")):
30
+ image_path = os.path.join(images_dir, filename)
31
+ col2 = extract_second_column(image_path)
32
 
33
+ for value in col2:
34
+ all_results.append({
35
+ "image": filename,
36
+ "colonne_2": value
37
+ })
 
 
 
38
 
39
+ df = pd.DataFrame(all_results)
40
+ df.to_csv("resultats_colonne_2.csv", index=False)
41
+ print("✅ Extraction terminée")
42
 
 
 
43
 
44
  if __name__ == "__main__":
45
+ main()
46
+