kebson committed on
Commit
1d03c47
·
verified ·
1 Parent(s): 8931404

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -75
app.py CHANGED
@@ -1,109 +1,102 @@
1
  import os
2
- import re
3
- import gradio as gr
4
  import pandas as pd
5
- from paddleocr import PaddleOCR, PPStructure
6
-
7
- # =========================
8
- # Initialisation OCR
9
- # =========================
10
- ocr = PaddleOCR(use_angle_cls=True, lang="fr")
11
- table_engine = PPStructure(show_log=False)
12
-
13
- # =========================
14
- # Fonction principale
15
- # =========================
16
- def process_images(images):
17
- all_rows = []
18
 
19
- for img in images:
20
- image_name = os.path.basename(img)
 
 
 
 
 
21
 
22
- # Analyse de la structure du document
23
- result = table_engine(img)
24
 
25
- for block in result:
26
- if block["type"] == "table":
27
- html = block["res"]["html"]
28
 
29
- # Lire le tableau HTML avec pandas
30
- try:
31
- tables = pd.read_html(html)
32
- except:
33
- continue
34
 
35
- for table in tables:
36
- if table.shape[1] < 2:
37
- continue
38
 
39
- # Colonne 2 (index 1)
40
- col2 = table.iloc[:, 1]
 
 
 
 
 
 
41
 
42
- for cell in col2:
43
- if pd.isna(cell):
44
- continue
45
 
46
- text = str(cell).strip()
 
 
 
 
47
 
48
- # 🔴 FILTRE : on garde seulement les textes
49
- # (au moins une lettre)
50
- if not re.search(r"[A-Za-zÀ-ÿ]", text):
51
- continue
 
 
 
52
 
53
- all_rows.append({
54
- "image": image_name,
55
- "colonne_2": text
56
- })
57
 
58
- # =========================
59
- # Sauvegarde CSV
60
- # =========================
61
- df = pd.DataFrame(all_rows)
62
 
63
- output_csv = "/app/resultats_colonne_2_textes.csv"
64
- df.to_csv(output_csv, index=False, encoding="utf-8")
65
 
66
- # =========================
67
- # Sauvegarde TXT (Bloc-notes)
68
- # =========================
69
- output_txt = "/app/resultats_colonne_2.txt"
70
 
71
- with open(output_txt, "w", encoding="utf-8") as f:
72
- current_image = None
73
- for row in all_rows:
74
- if row["image"] != current_image:
75
- current_image = row["image"]
76
- f.write(f"\n===== {current_image} =====\n")
77
- f.write(row["colonne_2"] + "\n")
78
 
79
- return df, output_csv, output_txt
80
 
81
 
82
- # =========================
83
  # Interface Gradio
84
- # =========================
85
- with gr.Blocks(title="Extraction OCR – Colonne 2 (Textes)") as demo:
86
- gr.Markdown("## 📄 Extraction OCR – 2ᵉ colonne (textes uniquement)")
87
 
88
- images = gr.File(
 
89
  file_types=[".jpg", ".jpeg", ".png"],
90
- file_count="multiple",
91
- label="📤 Importer les images"
92
  )
93
 
94
- btn = gr.Button("🚀 Générer")
95
 
96
- table_output = gr.Dataframe(label="📊 Résultat (aperçu)")
97
- csv_output = gr.File(label="⬇️ Télécharger CSV")
98
- txt_output = gr.File(label="⬇️ Télécharger TXT")
99
 
100
  btn.click(
101
- fn=process_images,
102
- inputs=images,
103
  outputs=[table_output, csv_output, txt_output]
104
  )
105
 
106
  demo.launch()
 
 
 
107
 
108
 
109
 
 
1
  import os
2
+ import cv2
 
3
  import pandas as pd
4
+ import gradio as gr
5
+ from paddleocr import PaddleOCR
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ # Initialisation OCR (table + français)
8
+ ocr = PaddleOCR(
9
+ use_angle_cls=True,
10
+ lang="fr",
11
+ show_log=False,
12
+ use_gpu=False
13
+ )
14
 
15
+ def extract_second_column(images):
16
+ results_col2 = []
17
 
18
+ for image_file in images:
19
+ image_path = image_file.name
 
20
 
21
+ ocr_result = ocr.ocr(image_path, cls=True)
 
 
 
 
22
 
23
+ if not ocr_result or not ocr_result[0]:
24
+ continue
 
25
 
26
+ # On trie les cellules par position verticale puis horizontale
27
+ cells = []
28
+ for line in ocr_result[0]:
29
+ box = line[0]
30
+ text = line[1][0]
31
+ x = box[0][0]
32
+ y = box[0][1]
33
+ cells.append((y, x, text))
34
 
35
+ cells.sort(key=lambda x: (x[0], x[1]))
 
 
36
 
37
+ # Reconstruction lignes
38
+ rows = []
39
+ current_row = []
40
+ last_y = None
41
+ threshold = 20
42
 
43
+ for y, x, text in cells:
44
+ if last_y is None or abs(y - last_y) < threshold:
45
+ current_row.append((x, text))
46
+ else:
47
+ rows.append(sorted(current_row))
48
+ current_row = [(x, text)]
49
+ last_y = y
50
 
51
+ if current_row:
52
+ rows.append(sorted(current_row))
 
 
53
 
54
+ # Extraction colonne 2 (index 1)
55
+ for row in rows:
56
+ if len(row) >= 2:
57
+ results_col2.append(row[1][1])
58
 
59
+ # Création des fichiers de sortie
60
+ os.makedirs("output", exist_ok=True)
61
 
62
+ df = pd.DataFrame({"colonne_2": results_col2})
63
+ csv_path = "output/resultats_colonne_2.csv"
64
+ txt_path = "output/resultats_colonne_2.txt"
 
65
 
66
+ df.to_csv(csv_path, index=False, encoding="utf-8")
67
+ with open(txt_path, "w", encoding="utf-8") as f:
68
+ for item in results_col2:
69
+ f.write(item + "\n")
 
 
 
70
 
71
+ return df, csv_path, txt_path
72
 
73
 
 
74
  # Interface Gradio
75
+ with gr.Blocks(title="Extraction OCR – Colonne 2") as demo:
76
+ gr.Markdown("## 📄 Extraction OCR – Deuxième colonne des tableaux")
 
77
 
78
+ images_input = gr.File(
79
+ label="Téléverser les images (JPEG/PNG)",
80
  file_types=[".jpg", ".jpeg", ".png"],
81
+ file_count="multiple"
 
82
  )
83
 
84
+ btn = gr.Button("Extraire la colonne 2")
85
 
86
+ table_output = gr.Dataframe(label="Résultat – Colonne 2")
87
+ csv_output = gr.File(label="Télécharger CSV")
88
+ txt_output = gr.File(label="Télécharger TXT")
89
 
90
  btn.click(
91
+ fn=extract_second_column,
92
+ inputs=images_input,
93
  outputs=[table_output, csv_output, txt_output]
94
  )
95
 
96
  demo.launch()
97
+
98
+
99
+
100
 
101
 
102