kebson committed on
Commit
d539c06
·
verified ·
1 Parent(s): 1032de6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -91
app.py CHANGED
@@ -2,138 +2,105 @@ import gradio as gr
2
  import numpy as np
3
  from paddleocr import PaddleOCR
4
  from sklearn.cluster import KMeans
 
 
 
5
 
6
  # -------------------------------------------------
7
- # OCR
8
  # -------------------------------------------------
9
- ocr = PaddleOCR(
10
- use_textline_orientation=True,
11
- lang="fr"
12
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # -------------------------------------------------
15
- # EXTRACTION DESIGNATIONS
16
  # -------------------------------------------------
17
- def extract_column2_9_lines(image):
18
  if image is None:
19
  return "Aucune image fournie."
20
 
21
  img = np.array(image)
22
  result = ocr.predict(img)
23
 
24
- if not result:
25
- return "OCR exécuté mais aucun texte détecté."
26
-
27
  data = result[0]
28
- texts = data.get("rec_texts", [])
29
- boxes = data.get("dt_polys", [])
30
 
31
- # -------------------------------------------------
32
- # 1. COLLECTE OCR
33
- # -------------------------------------------------
34
- elements = []
35
  for text, box in zip(texts, boxes):
36
  text = text.strip()
37
- if len(text) < 2:
38
- continue
39
-
40
- x_center = np.mean([p[0] for p in box])
41
- y_center = np.mean([p[1] for p in box])
42
-
43
- elements.append((x_center, y_center, text))
44
 
45
- if len(elements) < 6:
46
- return "Pas assez de texte exploitable."
47
 
48
- # -------------------------------------------------
49
- # 2. CLUSTERING DES COLONNES
50
- # -------------------------------------------------
51
- X = np.array([[e[0]] for e in elements])
52
- n_cols = min(7, max(3, len(elements) // 6))
53
-
54
- kmeans = KMeans(n_clusters=n_cols, random_state=42, n_init=10)
55
- labels = kmeans.fit_predict(X)
56
-
57
- columns = {}
58
- for (x, y, text), label in zip(elements, labels):
59
- columns.setdefault(label, []).append((x, y, text))
60
-
61
- # -------------------------------------------------
62
- # 3. CHOIX COLONNE DESIGNATIONS
63
- # -------------------------------------------------
64
- def column_score(col):
65
- return sum(
66
- len(t) for _, _, t in col
67
- if not any(c.isdigit() for c in t)
68
- )
69
-
70
- col = max(columns.values(), key=column_score)
71
- col.sort(key=lambda e: e[1]) # top → bottom
72
-
73
- # -------------------------------------------------
74
- # 4. SUPPRESSION DE L’EN-TÊTE
75
- # -------------------------------------------------
76
- cleaned = []
77
- header_removed = False
78
-
79
- for x, y, text in col:
80
- if not header_removed and text.upper().strip() == "DESIGNATIONS":
81
- header_removed = True
82
  continue
83
- cleaned.append((y, text))
84
 
85
  # -------------------------------------------------
86
- # 5. FUSION CELLULES (LOGIQUE AMÉLIORÉE)
87
  # -------------------------------------------------
88
- merged = []
89
  current = ""
90
- last_y = None
91
 
92
- for y, text in cleaned:
93
- new_cell = False
 
94
 
95
- if last_y is None:
96
- new_cell = True
97
- elif abs(y - last_y) > 35 and len(current) > 30 and text[0].isupper():
98
- new_cell = True
99
 
100
- if new_cell:
101
- if current:
102
- merged.append(current.strip())
 
103
  current = text
104
  else:
105
  current += " " + text
106
 
107
- last_y = y
108
-
109
  if current:
110
- merged.append(current.strip())
111
-
112
- # -------------------------------------------------
113
- # 6. NETTOYAGE FINAL
114
- # -------------------------------------------------
115
- final = []
116
- for line in merged:
117
- if sum(c.isdigit() for c in line) > len(line) * 0.45:
118
- continue
119
- final.append(line)
120
 
121
- final = final[:9]
122
 
123
- if not final:
124
- return "Aucune ligne exploitable détectée."
125
 
126
- return "\n".join([f"{i+1}. {l}" for i, l in enumerate(final)])
127
 
128
  # -------------------------------------------------
129
  # INTERFACE
130
  # -------------------------------------------------
131
  demo = gr.Interface(
132
- fn=extract_column2_9_lines,
133
  inputs=gr.Image(type="pil", label="Image du tableau"),
134
- outputs=gr.Textbox(label="Colonne DESIGNATIONS (9 lignes)"),
135
  title="Extraction fiable de la colonne DESIGNATIONS",
136
- description="Optimisé pour devis et bordereaux photographiés"
137
  )
138
 
139
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
2
  import numpy as np
3
  from paddleocr import PaddleOCR
4
  from sklearn.cluster import KMeans
5
+ import re
6
+
7
+ ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
8
 
9
  # -------------------------------------------------
10
+ # OUTILS TEXTE
11
  # -------------------------------------------------
12
# Lower-case prefixes marking an OCR line as the continuation of the previous
# designation rather than the start of a new one.  "et " and "avec " keep their
# trailing space so that longer words sharing the same letters do not match.
_CONTINUATION_PREFIXES = ("et ", "avec ", "y compris", "compr")


def is_continuation(text):
    """Return True when *text* starts with a known continuation prefix.

    The text is lower-cased and stripped before the comparison, so the check
    ignores case and surrounding whitespace.

    Args:
        text: One recognised OCR text line.

    Returns:
        True if the normalised line starts with "et ", "avec ", "y compris"
        or "compr"; False otherwise (including for an empty string).
    """
    # str.startswith accepts a tuple, replacing the chained ``or`` tests.
    return text.lower().strip().startswith(_CONTINUATION_PREFIXES)
20
+
21
def has_too_many_digits(text, *, threshold=0.4):
    """Return True when digits make up more than *threshold* of *text*.

    Args:
        text: The string to inspect.
        threshold: Maximum tolerated ratio of digit characters to total
            characters.  The comparison is strict, so a ratio exactly equal
            to the threshold is still accepted.  Defaults to 0.4.

    Returns:
        True if the digit count strictly exceeds ``len(text) * threshold``.
        An empty string yields False because ``0 > 0`` is false.
    """
    digit_count = sum(char.isdigit() for char in text)
    return digit_count > len(text) * threshold
23
+
24
# Shortest text accepted as a designation; anything shorter is rejected.
_MIN_DESIGNATION_LENGTH = 10

# Matches a line that begins with one of the tokens m2, m3, ml, u or ff
# followed by a word boundary.  Compiled once instead of on every call.
_UNIT_PREFIX_RE = re.compile(r"^(m2|m3|ml|u|ff)\b")


def looks_like_designation(text):
    """Return True when *text* passes the designation heuristics.

    A line is rejected when any of the following holds:

    * it is shorter than 10 characters;
    * ``has_too_many_digits`` reports it as mostly numeric;
    * its lower-cased form starts with one of the tokens matched by
      ``_UNIT_PREFIX_RE``.

    Args:
        text: One recognised OCR text line.

    Returns:
        True if none of the rejection rules apply, False otherwise.
    """
    if len(text) < _MIN_DESIGNATION_LENGTH:
        return False
    if has_too_many_digits(text):
        return False
    # The pattern is lower-case only, so normalise the text before matching.
    if _UNIT_PREFIX_RE.match(text.lower()):
        return False
    return True
32
 
33
  # -------------------------------------------------
34
+ # EXTRACTION
35
  # -------------------------------------------------
36
# Header labels to drop, compared after upper-casing and stripping.  The
# accented and singular spellings are included because an exact match on
# "DESIGNATIONS" alone would let those header variants through as data.
_HEADER_LABELS = frozenset(
    {"DESIGNATION", "DESIGNATIONS", "DÉSIGNATION", "DÉSIGNATIONS"}
)

# Maximum number of merged cells returned to the caller.
_MAX_CELLS = 9

# A line must be longer than this (and start with an upper-case letter) to
# open a new cell instead of being appended to the current one.
_NEW_CELL_MIN_LENGTH = 20


def _sorted_texts(texts, boxes):
    """Return the stripped texts ordered by the mean y of their polygon."""
    located = [
        (np.mean([point[1] for point in box]), text.strip())
        for text, box in zip(texts, boxes)
    ]
    # Sort on y only; the sort is stable, so ties keep their detection order.
    located.sort(key=lambda item: item[0])
    return [text for _, text in located]


def _merge_cells(fragments):
    """Group consecutive designation-like fragments into cells."""
    cells = []
    parts = []
    for fragment in fragments:
        if not looks_like_designation(fragment):
            continue
        # looks_like_designation guarantees a non-empty fragment, so
        # indexing fragment[0] is safe here.
        starts_new_cell = (
            bool(parts)
            and not is_continuation(fragment)
            and fragment[0].isupper()
            and len(fragment) > _NEW_CELL_MIN_LENGTH
        )
        if starts_new_cell:
            cells.append(" ".join(parts).strip())
            parts = []
        parts.append(fragment)
    if parts:
        cells.append(" ".join(parts).strip())
    return cells


def extract_designations(image):
    """Run OCR on *image* and return the detected designations as text.

    The recognised lines are ordered top to bottom, header labels are
    dropped, lines that do not look like designations are skipped, and the
    remaining lines are merged into cells.  At most nine cells are returned.

    Args:
        image: The image supplied by the interface, or None.  It is converted
            with ``np.array`` before being passed to ``ocr.predict``.

    Returns:
        A string with one numbered cell per line ("1. ...", "2. ..."), or a
        French status message when no image was given or nothing usable was
        detected.
    """
    if image is None:
        return "Aucune image fournie."

    result = ocr.predict(np.array(image))
    # Guard against an empty prediction: indexing result[0] would raise
    # IndexError instead of reporting that nothing was found.
    if not result:
        return "Aucune désignation détectée."

    data = result[0]
    ordered = _sorted_texts(data["rec_texts"], data["dt_polys"])

    # Drop header labels before merging.
    body = [text for text in ordered if text.upper().strip() not in _HEADER_LABELS]

    cells = _merge_cells(body)[:_MAX_CELLS]
    if not cells:
        return "Aucune désignation détectée."

    return "\n".join(f"{index}. {cell}" for index, cell in enumerate(cells, start=1))
94
 
95
  # -------------------------------------------------
96
  # INTERFACE
97
  # -------------------------------------------------
98
  demo = gr.Interface(
99
+ fn=extract_designations,
100
  inputs=gr.Image(type="pil", label="Image du tableau"),
101
+ outputs=gr.Textbox(label="Colonne DESIGNATIONS"),
102
  title="Extraction fiable de la colonne DESIGNATIONS",
103
+ description="Approche textuelle robuste pour devis et bordereaux"
104
  )
105
 
106
  demo.launch(server_name="0.0.0.0", server_port=7860)