kebson committed on
Commit
a6c6224
·
verified ·
1 Parent(s): 0a0c26d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -73
app.py CHANGED
@@ -1,102 +1,70 @@
1
  import os
2
- import re
3
  import cv2
 
4
  import pandas as pd
5
  import gradio as gr
6
- from paddleocr import PaddleOCR
7
 
8
# ===============================
# OCR ENGINE SETUP (DONE ONLY ONCE)
# ===============================
# English model, CPU only, no angle classifier, library logging silenced.
ocr = PaddleOCR(lang="en", use_gpu=False, use_angle_cls=False, show_log=False)
17
 
18
# ===============================
# MAIN FUNCTION
# ===============================
def extract_second_column():
    """Run OCR on one image from ``images/`` and export its second column.

    The first image (by sorted file name) with a .jpg/.jpeg/.png extension is
    read, OCR detections are grouped into rows by vertical position, and the
    second-from-left entry of each row is kept when it contains no digit.

    Returns:
        On success, a tuple ``(df, csv_path, txt_path)`` where ``df`` is a
        one-column DataFrame and the two paths point at the exported files.
        On failure, a tuple ``(message, None, None)`` where ``message`` is a
        user-facing string.
    """
    image_dir = "images"

    # A missing directory is treated the same as an empty one, so the caller
    # gets the "no image" message instead of an unhandled exception.
    try:
        entries = os.listdir(image_dir)
    except (FileNotFoundError, NotADirectoryError):
        entries = []

    # os.listdir order is arbitrary; sort so the selected image is stable.
    image_files = sorted(
        name
        for name in entries
        if name.lower().endswith((".jpg", ".jpeg", ".png"))
    )
    if not image_files:
        return "❌ Aucune image trouvée", None, None

    image_path = os.path.join(image_dir, image_files[0])
    img = cv2.imread(image_path)
    if img is None:
        return "❌ Impossible de lire l'image", None, None

    result = ocr.ocr(img, cls=False)
    # NOTE(review): PaddleOCR can hand back an empty result or [None] when
    # nothing is detected; normalise both to "no detections".
    detections = result[0] if result and result[0] else []

    # Collect (y_center, x_center, text) for every detection; the centre is
    # the mean of the four corner points of the bounding box.
    rows = []
    for line in detections:
        box = line[0]
        text = line[1][0]
        x_center = sum(p[0] for p in box) / 4
        y_center = sum(p[1] for p in box) / 4
        rows.append((y_center, x_center, text))

    # Order top to bottom so rows are emitted in reading order.
    rows.sort(key=lambda row: row[0])

    # Group detections into rows using 25-pixel vertical buckets.
    lines = {}
    for y, x, text in rows:
        lines.setdefault(round(y / 25), []).append((x, text))

    second_column_texts = []
    for line in lines.values():
        line.sort(key=lambda item: item[0])
        if len(line) >= 2:
            candidate = line[1][1]
            # Keep text-only entries (anything containing a digit is dropped).
            if not re.search(r"\d", candidate):
                second_column_texts.append(candidate)

    if not second_column_texts:
        return "⚠️ Aucun texte valide trouvé", None, None

    # ===============================
    # OUTPUTS
    # ===============================
    df = pd.DataFrame({"Colonne 2 (Texte)": second_column_texts})

    txt_path = "/tmp/resultats_colonne_2.txt"
    csv_path = "/tmp/resultats_colonne_2.csv"

    df.to_csv(csv_path, index=False)
    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(t + "\n" for t in second_column_texts)

    return df, csv_path, txt_path
84
 
85
 
86
# ===============================
# GRADIO INTERFACE
# ===============================
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Extraction OCR – Colonne 2 (Texte uniquement)")

    # One trigger button and three outputs: the table plus two downloads.
    btn = gr.Button("🔍 Extraire la colonne 2")
    table = gr.Dataframe()
    csv_file = gr.File(label="📥 Télécharger CSV")
    txt_file = gr.File(label="📥 Télécharger TXT (Bloc-notes)")

    # The callback takes no inputs; its 3-tuple maps onto the outputs in order.
    btn.click(fn=extract_second_column, outputs=[table, csv_file, txt_file])

demo.launch()
 
1
  import os
 
2
  import cv2
3
+ import easyocr
4
  import pandas as pd
5
  import gradio as gr
6
+ from PIL import Image
7
 
8
# Module-level EasyOCR reader (English + French recognition), CPU only.
reader = easyocr.Reader(lang_list=["en", "fr"], gpu=False)
 
 
 
 
 
 
 
10
 
 
 
 
 
 
 
11
 
12
def _select_second_column(results):
    """Pick the texts lying right of the median left edge, top to bottom.

    ``results`` is a sequence of ``(bbox, text, confidence)`` triples where
    ``bbox[0]`` is the ``(x, y)`` of the first corner of the box.
    """
    if not results:
        # Nothing detected: there is no median to compute.
        return []

    left_edges = sorted(bbox[0][0] for bbox, _text, _conf in results)
    median_x = left_edges[len(left_edges) // 2]

    picked = [
        (bbox[0][1], bbox[0][0], text.strip())
        for bbox, text, _conf in results
        if bbox[0][0] > median_x and text.strip()
    ]
    # Emit in reading order (by y, then x) so the column's rows stay in
    # sequence instead of being shuffled by small horizontal offsets.
    picked.sort(key=lambda item: (item[0], item[1]))
    return [text for _y, _x, text in picked]


def extract_second_column(image):
    """OCR an image and naively extract the text of its second column.

    Detections whose first-corner x coordinate is strictly greater than the
    median of all such coordinates are treated as the second column.

    Args:
        image: RGB image array to read, or ``None`` when no image is given.

    Returns:
        List of stripped, non-empty strings ordered top to bottom. Empty when
        ``image`` is ``None`` or when OCR detects nothing.
    """
    if image is None:
        # Happens when the callback fires before an image was provided.
        return []

    # The reader is given a BGR array, so convert from the RGB input first.
    img = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    results = reader.readtext(img)
    return _select_second_column(results)
36
 
 
 
37
 
38
def process_image(image):
    """Extract column 2 from an image and export it as CSV and TXT files.

    Args:
        image: value forwarded unchanged to ``extract_second_column``.

    Returns:
        Tuple ``(df, csv_path, txt_path)``: a one-column DataFrame holding the
        extracted texts, and the paths of the two files written for download.
    """
    import tempfile  # local import keeps this block self-contained

    texts = extract_second_column(image)
    df = pd.DataFrame({"Colonne 2 (Texte)": texts})

    # Each call writes into its own sub-directory so that simultaneous
    # requests cannot overwrite each other's files. The file names are kept
    # stable because they are what the user sees when downloading.
    os.makedirs("/tmp/results", exist_ok=True)
    out_dir = tempfile.mkdtemp(dir="/tmp/results")
    csv_path = os.path.join(out_dir, "colonne_2.csv")
    txt_path = os.path.join(out_dir, "colonne_2.txt")

    df.to_csv(csv_path, index=False, encoding="utf-8")

    # Write the TXT as plain lines: going through to_csv would wrap any text
    # containing a comma or a quote in CSV quoting.
    with open(txt_path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{text}\n" for text in texts)

    return df, csv_path, txt_path
52
 
53
 
54
# Gradio UI: image upload, trigger button, result table and two downloads.
with gr.Blocks(title="Extraction OCR – Colonne 2") as demo:
    gr.Markdown("## 📄 Extraction OCR – Colonne 2 (EasyOCR)")

    image_input = gr.Image(type="numpy", label="Télécharger une image")
    btn = gr.Button("Extraire la colonne 2")

    df_output = gr.Dataframe(label="Résultat")
    csv_file = gr.File(label="Télécharger CSV")
    txt_file = gr.File(label="Télécharger TXT")

    # The callback's 3-tuple maps onto the outputs in the order listed.
    btn.click(
        fn=process_image,
        inputs=[image_input],
        outputs=[df_output, csv_file, txt_file],
    )

demo.launch()