kebson committed on
Commit
7e77d30
·
verified ·
1 Parent(s): e175021

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -114
app.py CHANGED
@@ -1,150 +1,139 @@
1
  import gradio as gr
2
  import numpy as np
3
- import unicodedata
4
  from paddleocr import PaddleOCR
 
5
 
6
# OCR engine (configuration kept stable for Hugging Face).
# The orientation classifier is switched off and PaddleOCR logging is silenced.
ocr = PaddleOCR(
    show_log=False,
    use_angle_cls=False,
    lang="fr",
)
14
 
15
# Text normalisation
def normalize(text: str) -> str:
    """Return *text* lower-cased, without accents, with whitespace collapsed.

    The string is decomposed (NFD) so that accents become separate combining
    marks, every character of Unicode category ``Mn`` is dropped, and runs of
    whitespace are replaced by single spaces (leading/trailing ones removed).
    """
    decomposed = unicodedata.normalize("NFD", text.lower())
    without_marks = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    words = without_marks.split()
    return " ".join(words)
23
-
24
# Accepted header texts for the second column; compared against the
# normalised form of each OCR fragment.
COL_TITLES = {
    "description des services",
    "description",
    "designations",
    "designation",
}

# Keywords whose presence (in the normalised text) makes a fragment be ignored.
IGNORE_KEYWORDS = {
    "prix",
    "total",
    "ht",
    "htva",
    "tva",
    "ttc",
    "general",
    "generale",
}
41
-
42
# Extraction of column 2
def extract_second_column(image):
    """Return the numbered cells found under the column title in *image*.

    The image is passed to the module-level ``ocr`` engine.  The first OCR
    fragment whose normalised text is in ``COL_TITLES`` fixes the column
    position; fragments horizontally close to it and below it are merged into
    cells, filtered, and returned as a ``"1. ...\\n2. ..."`` string.  A
    French status message is returned instead when a step yields nothing.
    """
    if image is None:
        return "Aucune image fournie."

    ocr_pages = ocr.ocr(np.array(image), cls=False)
    if not ocr_pages or not ocr_pages[0]:
        return "OCR : aucun texte détecté."

    # Collect (text, x_center, y_center) for every fragment of 2+ characters.
    fragments = []
    for entry in ocr_pages[0]:
        label = entry[1][0].strip()
        if len(label) < 2:
            continue
        corners = entry[0]
        center_x = np.mean([pt[0] for pt in corners])
        center_y = np.mean([pt[1] for pt in corners])
        fragments.append((label, center_x, center_y))

    # 1. Locate the column title (first match wins).
    header = next(
        ((fx, fy) for label, fx, fy in fragments if normalize(label) in COL_TITLES),
        None,
    )
    if header is None:
        return "Titre de la colonne non détecté."
    col_x, title_y = header

    # 2. Keep fragments close to the title's x position and below the title,
    #    ordered from top to bottom.
    in_column = sorted(
        (
            (label, fx, fy)
            for label, fx, fy in fragments
            if abs(fx - col_x) < 50 and fy > title_y + 15
        ),
        key=lambda item: item[2],
    )

    # 3. Controlled merge: a fragment opens a new cell when it is the first
    #    one, is vertically far from the previous one, or starts with an
    #    upper-case letter; otherwise it is appended to the current cell.
    cells = []
    pending = ""
    previous_y = None
    for label, _, fy in in_column:
        normalized = normalize(label)
        if any(keyword in normalized for keyword in IGNORE_KEYWORDS):
            continue

        continues_cell = (
            previous_y is not None
            and abs(fy - previous_y) <= 35
            and not label[0].isupper()
        )
        if continues_cell:
            pending += " " + label
        else:
            if pending:
                cells.append(pending.strip())
            pending = label

        previous_y = fy

    if pending:
        cells.append(pending.strip())

    # 4. Final clean-up: keep cells that start with an upper-case letter and
    #    whose digit share does not exceed 40 %.
    kept = [
        cell
        for cell in cells
        if cell[0].isupper()
        and sum(ch.isdigit() for ch in cell) <= len(cell) * 0.4
    ]
    if not kept:
        return "Aucune cellule valide trouvée."

    return "\n".join(f"{rank}. {cell}" for rank, cell in enumerate(kept, start=1))
 
135
 
136
# Gradio interface: one image in, the extracted column text out.
demo = gr.Interface(
    title="Extraction colonne Désignation / Description",
    fn=extract_second_column,
    inputs=gr.Image(type="pil", label="Image du tableau"),
    outputs=gr.Textbox(label="Contenu colonne 2"),
)

# Listen on all interfaces, port 7860, with SSR mode disabled.
demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
 
1
  import gradio as gr
2
  import numpy as np
 
3
  from paddleocr import PaddleOCR
4
+ from sklearn.cluster import KMeans
5
 
6
# OCR engine: French model with text-line orientation handling enabled.
ocr = PaddleOCR(
    lang="fr",
    use_textline_orientation=True,
)
13
 
14
# -----------------------------
# Main function
# -----------------------------
_MIN_FRAGMENT_CHARS = 3   # OCR fragments shorter than this are discarded
_MIN_ELEMENTS = 5         # fewer usable fragments than this: give up
_Y_MERGE_THRESHOLD = 22   # max vertical gap (polygon coordinate units) to merge
_MIN_LINE_CHARS = 5       # merged lines shorter than this are dropped
_MAX_LINES = 9            # number of lines returned at most
# Fragments starting with one of these (upper-cased) prefixes are skipped.
_HEADER_PREFIXES = (
    "DESIGNATION", "UNITE", "QUANT", "PRIX", "TOTAL",
    "LOT", "BORDEREAU", "DATE", "NB", "TTC", "HT",
)


def _collect_elements(texts, polys):
    """Return ``(x_center, y_center, text)`` for each usable OCR fragment."""
    elements = []
    for raw_text, poly in zip(texts, polys):
        text = raw_text.strip()
        if len(text) < _MIN_FRAGMENT_CHARS:
            continue
        x_center = np.mean([point[0] for point in poly])
        y_center = np.mean([point[1] for point in poly])
        elements.append((x_center, y_center, text))
    return elements


def _cluster_columns(elements):
    """Group elements into columns by running KMeans on their x centers."""
    x_centers = np.array([[element[0]] for element in elements])
    # Adaptive cluster count: between 3 and 8, roughly one per 8 fragments.
    n_clusters = min(8, max(3, len(elements) // 8))
    labels = KMeans(
        n_clusters=n_clusters, random_state=42, n_init=10
    ).fit_predict(x_centers)

    columns = {}
    for element, label in zip(elements, labels):
        columns.setdefault(int(label), []).append(element)
    return list(columns.values())


def _text_score(column):
    """Return the total length of the column's fragments that hold no digit."""
    return sum(
        len(text)
        for _, _, text in column
        if not any(char.isdigit() for char in text)
    )


def _merge_fragments(column):
    """Merge vertically adjacent fragments of a y-sorted column into lines."""
    merged = []
    current = ""
    last_y = None
    for _, y, text in column:
        if text.upper().startswith(_HEADER_PREFIXES):
            continue
        if last_y is None or abs(y - last_y) > _Y_MERGE_THRESHOLD:
            # Gap too large (or first fragment): close the line in progress.
            if current:
                merged.append(current.strip())
            current = text
        else:
            current += " " + text
        last_y = y
    if current:
        merged.append(current.strip())
    return merged


def _is_usable_line(line):
    """Return True for lines that are long enough and not mostly digits."""
    if len(line) < _MIN_LINE_CHARS:
        return False
    return sum(char.isdigit() for char in line) <= len(line) / 2


def extract_column2_9_lines(image):
    """Return up to nine numbered lines of the most text-rich column.

    The image is run through the module-level ``ocr`` engine, the recognised
    fragments are clustered into columns by horizontal position, the column
    with the most digit-free text is selected, and its fragments are merged
    into lines.  The result is a ``"1. ...\\n2. ..."`` string; a French
    status message is returned instead when a step yields nothing usable.

    Args:
        image: The input image (anything ``np.array`` accepts), or ``None``.

    Returns:
        The numbered lines joined by newlines, or a status message.
    """
    if image is None:
        return "Aucune image fournie."

    result = ocr.predict(np.array(image))
    if not result:
        return "OCR exécuté mais aucun texte détecté."

    data = result[0]
    texts = data.get("rec_texts", [])
    if not texts:
        return "Aucun texte exploitable détecté."

    # Pair each text with the polygon list aligned with the recognition
    # output.  "dt_polys" holds the raw detections and can be longer than
    # "rec_texts", which would shift every text onto the wrong box; it is
    # only used when "rec_polys" is absent.
    polys = data.get("rec_polys")
    if polys is None:
        polys = data.get("dt_polys", [])

    # 1. OCR collection
    elements = _collect_elements(texts, polys)
    if len(elements) < _MIN_ELEMENTS:
        return "Pas assez de texte détecté."

    # 2. Adaptive horizontal clustering
    columns = _cluster_columns(elements)

    # 3. Pick the "Description" column: the richest in non-numeric text,
    #    then order it from top to bottom.
    best_column = max(columns, key=_text_score)
    best_column.sort(key=lambda element: element[1])

    # 4. Merge fragments into lines, 5. final clean-up and truncation.
    final_lines = [
        line for line in _merge_fragments(best_column) if _is_usable_line(line)
    ][:_MAX_LINES]
    if not final_lines:
        return "Colonne détectée mais contenu non exploitable."

    # Numbered output
    return "\n".join(
        f"{rank}. {line}" for rank, line in enumerate(final_lines, start=1)
    )
127
 
128
# Gradio interface: one image in, the extracted column text out.
demo = gr.Interface(
    title="Extraction robuste de la colonne Description",
    description="Optimisé pour tableaux photographiés (devis, factures, bordereaux)",
    fn=extract_column2_9_lines,
    inputs=gr.Image(type="pil", label="Image du tableau"),
    outputs=gr.Textbox(label="Colonne Description (9 lignes)"),
)

# Listen on all interfaces, port 7860.
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
)