kebson committed on
Commit
2728550
·
verified ·
1 Parent(s): 9becf7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -59
app.py CHANGED
@@ -2,42 +2,44 @@ import gradio as gr
2
  import numpy as np
3
  import unicodedata
4
  from paddleocr import PaddleOCR
5
- from sklearn.cluster import KMeans
6
 
7
  # -------------------------------------------------
8
  # OCR
9
  # -------------------------------------------------
10
- ocr = PaddleOCR(
11
- lang="fr",
12
- use_textline_orientation=True
13
- )
14
 
15
  # -------------------------------------------------
16
- # Normalisation texte (casse + accents)
17
  # -------------------------------------------------
18
- def normalize(text: str) -> str:
19
  text = text.lower()
20
  text = unicodedata.normalize("NFD", text)
21
  text = "".join(c for c in text if unicodedata.category(c) != "Mn")
22
  return " ".join(text.split())
23
 
24
  # -------------------------------------------------
25
- # Titres valides de la colonne 2
26
  # -------------------------------------------------
27
- COL_TITLES = {
28
  "designation",
29
  "designations",
30
  "description",
31
  "description des services"
32
- }
 
 
 
 
 
 
 
 
33
 
34
  # -------------------------------------------------
35
- # Mots / lignes à ignorer
36
  # -------------------------------------------------
37
- IGNORE_KEYWORDS = {
38
- "prix", "total", "ht", "htva", "tva",
39
- "ttc", "general", "generale"
40
- }
41
 
42
  # -------------------------------------------------
43
  # Fonction principale
@@ -49,73 +51,69 @@ def extract_second_column(image):
49
  img = np.array(image)
50
  result = ocr.predict(img)
51
 
52
- if not result:
53
  return "OCR : aucun texte détecté."
54
 
55
- data = result[0]
56
- texts = data.get("rec_texts", [])
57
- boxes = data.get("dt_polys", [])
58
 
59
  blocks = []
60
  for text, box in zip(texts, boxes):
61
- t = text.strip()
62
- if len(t) < 2:
63
  continue
64
 
65
  x = np.mean([p[0] for p in box])
66
  y = np.mean([p[1] for p in box])
67
-
68
- blocks.append((t, x, y))
69
 
70
  if len(blocks) < 5:
71
  return "Pas assez de texte exploitable."
72
 
73
  # -------------------------------------------------
74
- # 1. Détection du X de la colonne cible via son titre
75
  # -------------------------------------------------
76
  col_x = None
77
  for text, x, y in blocks:
78
- if normalize(text) in COL_TITLES:
 
79
  col_x = x
80
  break
81
 
82
  if col_x is None:
83
- return "Titre de la colonne cible non détecté."
84
 
85
  # -------------------------------------------------
86
- # 2. Sélection des blocs proches du X détecté
87
  # -------------------------------------------------
88
- X_THRESHOLD = 45
89
- column_blocks = [
90
- (t, x, y) for t, x, y in blocks
91
- if abs(x - col_x) < X_THRESHOLD
92
- ]
93
 
94
- if not column_blocks:
95
  return "Colonne détectée mais vide."
96
 
97
  # -------------------------------------------------
98
- # 3. Tri vertical (haut → bas)
99
  # -------------------------------------------------
100
- column_blocks.sort(key=lambda e: e[2])
101
 
102
  # -------------------------------------------------
103
- # 4. Fusion intelligente des lignes OCR
104
  # -------------------------------------------------
105
- merged = []
106
  current = ""
107
  last_y = None
108
- Y_THRESHOLD = 22
109
 
110
- for text, x, y in column_blocks:
111
  nt = normalize(text)
112
 
113
  if any(k in nt for k in IGNORE_KEYWORDS):
114
  continue
115
 
116
- if last_y is None or abs(y - last_y) > Y_THRESHOLD:
117
  if current:
118
- merged.append(current.strip())
119
  current = text
120
  else:
121
  current += " " + text
@@ -123,40 +121,34 @@ def extract_second_column(image):
123
  last_y = y
124
 
125
  if current:
126
- merged.append(current.strip())
127
 
128
  # -------------------------------------------------
129
- # 5. Nettoyage final (cellules texte uniquement)
130
  # -------------------------------------------------
131
  final = []
132
- for line in merged:
133
- nt = normalize(line)
134
  if len(nt) < 4:
135
  continue
136
- if sum(c.isdigit() for c in line) > len(line) / 2:
137
  continue
138
- final.append(line)
139
 
140
  if not final:
141
- return "Aucune cellule texte valide trouvée."
142
 
143
- # -------------------------------------------------
144
- # 6. Résultat numéroté
145
- # -------------------------------------------------
146
- return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
147
 
148
  # -------------------------------------------------
149
- # Interface Gradio
150
  # -------------------------------------------------
151
  demo = gr.Interface(
152
  fn=extract_second_column,
153
  inputs=gr.Image(type="pil", label="Image du tableau"),
154
- outputs=gr.Textbox(label="Contenu de la colonne 2"),
155
- title="Extraction fiable de la colonne 2 (Désignation / Description)",
156
- description=(
157
- "Extraction robuste de la deuxième colonne des tableaux scannés "
158
- "(Désignation, DESIGNATIONS, Description, Description des services)."
159
- )
160
  )
161
 
162
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
2
  import numpy as np
3
  import unicodedata
4
  from paddleocr import PaddleOCR
 
5
 
6
  # -------------------------------------------------
7
  # OCR
8
  # -------------------------------------------------
9
+ ocr = PaddleOCR(lang="fr", use_textline_orientation=True)
 
 
 
10
 
11
  # -------------------------------------------------
12
+ # Normalisation
13
  # -------------------------------------------------
14
+ def normalize(text):
15
  text = text.lower()
16
  text = unicodedata.normalize("NFD", text)
17
  text = "".join(c for c in text if unicodedata.category(c) != "Mn")
18
  return " ".join(text.split())
19
 
20
  # -------------------------------------------------
21
+ # Titres possibles colonne 2
22
  # -------------------------------------------------
23
+ COL_TITLES = [
24
  "designation",
25
  "designations",
26
  "description",
27
  "description des services"
28
+ ]
29
+
30
+ # -------------------------------------------------
31
+ # Lignes à ignorer
32
+ # -------------------------------------------------
33
+ IGNORE_KEYWORDS = [
34
+ "prix", "total", "ht", "htva", "tva", "ttc",
35
+ "general", "generale"
36
+ ]
37
 
38
  # -------------------------------------------------
39
+ # Détection début cellule (règle métier)
40
  # -------------------------------------------------
41
def is_new_cell(text: str) -> bool:
    """Return True when *text* starts with an uppercase character.

    Business rule used when rebuilding table cells: an OCR fragment whose
    first character is uppercase is treated as the start of a new cell.

    Always returns a real ``bool``. Empty or ``None`` input yields ``False``
    rather than leaking the falsy operand (``""`` / ``None``) to the caller.
    """
    return bool(text) and text[0].isupper()
 
 
43
 
44
  # -------------------------------------------------
45
  # Fonction principale
 
51
  img = np.array(image)
52
  result = ocr.predict(img)
53
 
54
+ if not result or not result[0]:
55
  return "OCR : aucun texte détecté."
56
 
57
+ texts = result[0]["rec_texts"]
58
+ boxes = result[0]["dt_polys"]
 
59
 
60
  blocks = []
61
  for text, box in zip(texts, boxes):
62
+ text = text.strip()
63
+ if len(text) < 2:
64
  continue
65
 
66
  x = np.mean([p[0] for p in box])
67
  y = np.mean([p[1] for p in box])
68
+ blocks.append((text, x, y))
 
69
 
70
  if len(blocks) < 5:
71
  return "Pas assez de texte exploitable."
72
 
73
  # -------------------------------------------------
74
+ # 1. Détection X colonne 2 par le TITRE (robuste)
75
  # -------------------------------------------------
76
  col_x = None
77
  for text, x, y in blocks:
78
+ nt = normalize(text)
79
+ if any(nt.startswith(t) for t in COL_TITLES):
80
  col_x = x
81
  break
82
 
83
  if col_x is None:
84
+ return "Titre de la colonne 2 non détecté."
85
 
86
  # -------------------------------------------------
87
+ # 2. Sélection blocs proches de X
88
  # -------------------------------------------------
89
+ X_THRESHOLD = 60
90
+ col_blocks = [(t, x, y) for t, x, y in blocks if abs(x - col_x) < X_THRESHOLD]
 
 
 
91
 
92
+ if not col_blocks:
93
  return "Colonne détectée mais vide."
94
 
95
  # -------------------------------------------------
96
+ # 3. Tri vertical
97
  # -------------------------------------------------
98
+ col_blocks.sort(key=lambda e: e[2])
99
 
100
  # -------------------------------------------------
101
+ # 4. Reconstruction cellules (RÈGLE MAJUSCULE)
102
  # -------------------------------------------------
103
+ cells = []
104
  current = ""
105
  last_y = None
106
+ Y_THRESHOLD = 28
107
 
108
+ for text, x, y in col_blocks:
109
  nt = normalize(text)
110
 
111
  if any(k in nt for k in IGNORE_KEYWORDS):
112
  continue
113
 
114
+ if current == "" or is_new_cell(text) or (last_y and abs(y - last_y) > Y_THRESHOLD):
115
  if current:
116
+ cells.append(current.strip())
117
  current = text
118
  else:
119
  current += " " + text
 
121
  last_y = y
122
 
123
  if current:
124
+ cells.append(current.strip())
125
 
126
  # -------------------------------------------------
127
+ # 5. Nettoyage final
128
  # -------------------------------------------------
129
  final = []
130
+ for c in cells:
131
+ nt = normalize(c)
132
  if len(nt) < 4:
133
  continue
134
+ if sum(ch.isdigit() for ch in c) > len(c) * 0.6:
135
  continue
136
+ final.append(c)
137
 
138
  if not final:
139
+ return "Aucune cellule valide trouvée."
140
 
141
+ return "\n".join(f"{i+1}. {c}" for i, c in enumerate(final))
 
 
 
142
 
143
  # -------------------------------------------------
144
+ # Gradio
145
  # -------------------------------------------------
146
  demo = gr.Interface(
147
  fn=extract_second_column,
148
  inputs=gr.Image(type="pil", label="Image du tableau"),
149
+ outputs=gr.Textbox(label="Contenu colonne 2", lines=15),
150
+ title="Extraction fiable de la colonne 2",
151
+ description="Extraction robuste de la colonne 2 (Désignation / Description)"
 
 
 
152
  )
153
 
154
  demo.launch(server_name="0.0.0.0", server_port=7860)