kebson committed on
Commit
a1e0d1a
·
verified ·
1 Parent(s): 7e77d30

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -92
app.py CHANGED
@@ -1,139 +1,162 @@
1
  import gradio as gr
2
  import numpy as np
 
3
  from paddleocr import PaddleOCR
4
  from sklearn.cluster import KMeans
5
 
6
- # -----------------------------
7
  # OCR
8
- # -----------------------------
9
  ocr = PaddleOCR(
10
- use_textline_orientation=True,
11
- lang="fr"
12
  )
13
 
14
- # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # Fonction principale
16
- # -----------------------------
17
- def extract_column2_9_lines(image):
18
  if image is None:
19
  return "Aucune image fournie."
20
 
21
  img = np.array(image)
22
  result = ocr.predict(img)
23
 
24
- if not result or len(result) == 0:
25
- return "OCR exécuté mais aucun texte détecté."
26
 
27
  data = result[0]
28
  texts = data.get("rec_texts", [])
29
  boxes = data.get("dt_polys", [])
30
 
31
- if not texts:
32
- return "Aucun texte exploitable détecté."
33
-
34
- # -----------------------------
35
- # 1. Collecte OCR
36
- # -----------------------------
37
- elements = []
38
  for text, box in zip(texts, boxes):
39
- text = text.strip()
40
- if len(text) < 3:
41
  continue
42
 
43
- x_center = np.mean([p[0] for p in box])
44
- y_center = np.mean([p[1] for p in box])
45
-
46
- elements.append((x_center, y_center, text))
47
-
48
- if len(elements) < 5:
49
- return "Pas assez de texte détecté."
50
-
51
- # -----------------------------
52
- # 2. Clustering horizontal ADAPTATIF
53
- # -----------------------------
54
- X = np.array([[e[0]] for e in elements])
55
- n_clusters = min(8, max(3, len(elements) // 8))
56
-
57
- kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
58
- labels = kmeans.fit_predict(X)
59
-
60
- columns = {}
61
- for (x, y, text), label in zip(elements, labels):
62
- columns.setdefault(label, []).append((x, y, text))
63
-
64
- # -----------------------------
65
- # 3. Choisir la colonne "Description"
66
- # => la plus riche en texte non numérique
67
- # -----------------------------
68
- def column_score(col):
69
- score = 0
70
- for _, _, t in col:
71
- if not any(char.isdigit() for char in t):
72
- score += len(t)
73
- return score
74
-
75
- best_column = max(columns.values(), key=column_score)
76
-
77
- # Tri vertical
78
- best_column.sort(key=lambda e: e[1])
79
-
80
- # -----------------------------
81
- # 4. Fusion intelligente des lignes
82
- # -----------------------------
83
- merged_lines = []
84
- current_text = ""
85
  last_y = None
86
  Y_THRESHOLD = 22
87
 
88
- blacklist = (
89
- "DESIGNATION", "UNITE", "QUANT", "PRIX", "TOTAL",
90
- "LOT", "BORDEREAU", "DATE", "NB", "TTC", "HT"
91
- )
92
 
93
- for _, y, text in best_column:
94
- if text.upper().startswith(blacklist):
95
  continue
96
 
97
  if last_y is None or abs(y - last_y) > Y_THRESHOLD:
98
- if current_text:
99
- merged_lines.append(current_text.strip())
100
- current_text = text
101
  else:
102
- current_text += " " + text
103
 
104
  last_y = y
105
 
106
- if current_text:
107
- merged_lines.append(current_text.strip())
108
 
109
- # -----------------------------
110
- # 5. Nettoyage final
111
- # -----------------------------
112
- cleaned = []
113
- for line in merged_lines:
114
- if len(line) < 5:
 
115
  continue
116
  if sum(c.isdigit() for c in line) > len(line) / 2:
117
  continue
118
- cleaned.append(line)
119
-
120
- final_lines = cleaned[:9]
121
 
122
- if not final_lines:
123
- return "Colonne détectée mais contenu non exploitable."
124
 
125
- # Numérotation demandée
126
- return "\n".join([f"{i+1}. {l}" for i, l in enumerate(final_lines)])
 
 
127
 
128
- # -----------------------------
129
  # Interface Gradio
130
- # -----------------------------
131
  demo = gr.Interface(
132
- fn=extract_column2_9_lines,
133
  inputs=gr.Image(type="pil", label="Image du tableau"),
134
- outputs=gr.Textbox(label="Colonne Description (9 lignes)"),
135
- title="Extraction robuste de la colonne Description",
136
- description="Optimisé pour tableaux photographiés (devis, factures, bordereaux)"
 
 
 
137
  )
138
 
139
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
  import numpy as np
3
+ import unicodedata
4
  from paddleocr import PaddleOCR
5
  from sklearn.cluster import KMeans
6
 
7
# -------------------------------------------------
# OCR
# -------------------------------------------------
# Module-level PaddleOCR instance, created once at import time and used by
# extract_second_column() for every submitted image. It is configured with
# lang="fr" and use_textline_orientation=True (presumably so rotated text
# lines are handled -- confirm against the PaddleOCR documentation).
ocr = PaddleOCR(
    lang="fr",
    use_textline_orientation=True
)
14
 
15
+ # -------------------------------------------------
16
+ # Normalisation texte (casse + accents)
17
+ # -------------------------------------------------
18
def normalize(text: str) -> str:
    """Return a case-, accent- and spacing-insensitive form of *text*.

    The result is used for matching OCR output against keyword sets, so it
    must be tolerant of the ways the same word can be encoded:

    * compatibility forms (ligatures such as "ﬁ", full-width letters) are
      unfolded via NFKD decomposition;
    * case is removed with ``str.casefold``;
    * combining marks (Unicode category ``Mn``, i.e. accents once the text
      is decomposed) are dropped;
    * any run of whitespace collapses to a single space and the ends are
      trimmed.

    Args:
        text: Raw text, typically one OCR-recognised block.

    Returns:
        The normalised string; empty if *text* is empty or only whitespace.
    """
    # NFKD rather than NFD: also unfolds compatibility characters, so that
    # e.g. a header recognised with an "ﬁ" ligature still matches.
    decomposed = unicodedata.normalize("NFKD", text).casefold()
    without_marks = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    return " ".join(without_marks.split())
23
+
24
# -------------------------------------------------
# Accepted header titles for column 2
# -------------------------------------------------
# Entries are written lowercase and without accents because
# extract_second_column() compares them with the output of normalize().
# A block must equal one of these strings exactly to be taken as the header.
COL_TITLES = {
    "designation",
    "designations",
    "description",
    "description des services"
}

# -------------------------------------------------
# Words / lines to ignore
# -------------------------------------------------
# Also lowercase and accent-free. extract_second_column() skips any block
# whose normalized text contains one of these as a substring.
IGNORE_KEYWORDS = {
    "prix", "total", "ht", "htva", "tva",
    "ttc", "general", "generale"
}
41
+
42
+ # -------------------------------------------------
43
  # Fonction principale
44
+ # -------------------------------------------------
45
def extract_second_column(image):
    """Extract the text cells found under the "Désignation/Description" header.

    The image is run through the module-level ``ocr`` engine. Each recognised
    block is reduced to its text and the centre of its polygon. The target
    column is located from the block whose normalised text is one of
    ``COL_TITLES``; blocks horizontally close to that header and located
    below it are kept, sorted top to bottom, merged when vertically close,
    and filtered to drop short or mostly-numeric lines.

    Args:
        image: The uploaded picture (anything ``np.array`` accepts, e.g. a
            PIL image), or ``None`` when nothing was submitted.

    Returns:
        A string: either the numbered lines (``"1. ...\\n2. ..."``) or a
        French status message explaining why nothing could be extracted.
    """
    if image is None:
        return "Aucune image fournie."

    result = ocr.predict(np.array(image))
    if not result:
        return "OCR : aucun texte détecté."

    # Only the first result entry is used (one image in, one entry out).
    data = result[0]
    texts = data.get("rec_texts", [])
    boxes = data.get("dt_polys", [])

    # (text, x_center, y_center) for every block with at least 2 characters.
    blocks = []
    for raw_text, box in zip(texts, boxes):
        cell = raw_text.strip()
        if len(cell) < 2:
            continue
        x_center = np.mean([point[0] for point in box])
        y_center = np.mean([point[1] for point in box])
        blocks.append((cell, x_center, y_center))

    if len(blocks) < 5:
        return "Pas assez de texte exploitable."

    # -------------------------------------------------
    # 1. Locate the target column through its header cell
    # -------------------------------------------------
    header = next(
        ((x, y) for cell, x, y in blocks if normalize(cell) in COL_TITLES),
        None,
    )
    if header is None:
        return "Titre de la colonne cible non détecté."
    col_x, header_y = header

    # -------------------------------------------------
    # 2. Keep blocks aligned with the header AND located below it
    # -------------------------------------------------
    # The header itself sits at distance 0 from col_x, so without the
    # vertical test it (and anything printed above the table at the same X)
    # would be emitted as the first "cell" of the column.
    # NOTE(review): alignment is tested on block centres; cells much wider
    # or narrower than the header may fall outside the threshold.
    X_THRESHOLD = 45
    column_blocks = sorted(
        (
            (cell, x, y)
            for cell, x, y in blocks
            if abs(x - col_x) < X_THRESHOLD and y > header_y
        ),
        key=lambda entry: entry[2],  # 3. top -> bottom
    )

    if not column_blocks:
        return "Colonne détectée mais vide."

    # -------------------------------------------------
    # 4. Merge OCR blocks that are vertically close to each other
    # -------------------------------------------------
    Y_THRESHOLD = 22
    merged = []
    current = ""
    last_y = None

    for cell, _x, y in column_blocks:
        normalized = normalize(cell)

        # NOTE(review): substring match, so a keyword embedded in a longer
        # word (e.g. "general" in "generalement") also discards the block.
        if any(keyword in normalized for keyword in IGNORE_KEYWORDS):
            continue

        if last_y is None or abs(y - last_y) > Y_THRESHOLD:
            # Vertical gap large enough: start a new output line.
            if current:
                merged.append(current.strip())
            current = cell
        else:
            current += " " + cell

        last_y = y

    if current:
        merged.append(current.strip())

    # -------------------------------------------------
    # 5. Final clean-up: drop very short and mostly-numeric lines
    # -------------------------------------------------
    final = [
        line
        for line in merged
        if len(normalize(line)) >= 4
        and sum(ch.isdigit() for ch in line) <= len(line) / 2
    ]

    if not final:
        return "Aucune cellule texte valide trouvée."

    # -------------------------------------------------
    # 6. Numbered result
    # -------------------------------------------------
    return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
147
 
148
# -------------------------------------------------
# Gradio interface
# -------------------------------------------------
# One image input (delivered to the callback as a PIL image) and one text
# output holding whatever extract_second_column() returns.
demo = gr.Interface(
    fn=extract_second_column,
    inputs=gr.Image(type="pil", label="Image du tableau"),
    outputs=gr.Textbox(label="Contenu de la colonne 2"),
    title="Extraction fiable de la colonne 2 (Désignation / Description)",
    description=(
        "Extraction robuste de la deuxième colonne des tableaux scannés "
        "(Désignation, DESIGNATIONS, Description, Description des services)."
    )
)

# Serve on all interfaces, port 7860. The original line had an extra closing
# parenthesis, which made the whole file fail to parse.
demo.launch(server_name="0.0.0.0", server_port=7860)