kebson committed on
Commit
fe12926
·
verified ·
1 Parent(s): d3dbd12

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -112
app.py CHANGED
@@ -1,147 +1,114 @@
1
  import gradio as gr
2
  import numpy as np
3
  from paddleocr import PaddleOCR
4
- import re
5
-
6
- # -------------------------------------------------
7
- # OCR
8
- # -------------------------------------------------
9
- ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
10
-
11
- # -------------------------------------------------
12
- # RÈGLES MÉTIER
13
- # -------------------------------------------------
14
- def is_title(text):
15
- t = text.upper()
16
- return any(k in t for k in [
17
- "CADRE DE DEVIS",
18
- "LOT",
19
- "AXE",
20
- "PRIX TOTAL",
21
- "TVA",
22
- "TTC"
23
- ])
24
-
25
- def is_f_start(text):
26
- # F majuscule = début cellule
27
- # SAUF F6
28
- return text.startswith("F") and not text.startswith("F6")
29
-
30
- def is_f6(text):
31
- return text.startswith("F6")
32
-
33
- def is_continuation(text):
34
- t = text.lower().strip()
35
- return (
36
- t.startswith("avec")
37
- or t.startswith("et ")
38
- or t.startswith("y compris")
39
- or t.startswith("compris")
40
- or t.startswith("basse")
41
- or t.startswith("franchissable")
42
- or t.startswith("pour ")
43
- or t.startswith("f6")
44
- )
45
-
46
- def looks_like_text(text):
47
- return len(text) >= 4 and not re.match(r"^\d+$", text)
48
-
49
- # -------------------------------------------------
50
- # EXTRACTION PRINCIPALE
51
- # -------------------------------------------------
52
- def extract_designations(image):
53
  if image is None:
54
  return "Aucune image fournie."
55
 
56
  img = np.array(image)
57
- result = ocr.predict(img)[0]
58
 
59
- texts = result["rec_texts"]
60
- boxes = result["dt_polys"]
61
 
62
- # Tri vertical
63
- lines = []
 
 
 
64
  for text, box in zip(texts, boxes):
 
 
 
 
65
  y = np.mean([p[1] for p in box])
66
- lines.append((y, text.strip()))
67
 
68
- lines.sort(key=lambda x: x[0])
 
69
 
70
- # -----------------------------
71
- # APRÈS "DESIGNATIONS"
72
- # -----------------------------
73
- started = False
74
- cleaned = []
75
 
76
- for _, text in lines:
77
- if text.upper() == "DESIGNATIONS":
78
- started = True
79
- continue
80
 
81
- if not started:
82
- continue
 
83
 
84
- if is_title(text):
85
- continue
86
 
87
- if looks_like_text(text):
88
- cleaned.append(text)
 
 
 
 
89
 
90
- # -----------------------------
91
- # CONSTRUCTION DES CELLULES
92
- # -----------------------------
93
- cells = []
94
- current = ""
95
 
96
- for text in cleaned:
97
- # F MAJUSCULE (≠ F6) → NOUVELLE CELLULE
98
- if is_f_start(text):
99
- if current:
100
- cells.append(current.strip())
101
- current = text
102
- continue
103
 
104
- # F6 CONTINUATION FORCÉE
105
- if is_f6(text):
106
- current += " " + text
107
- continue
108
 
109
- if not current:
110
- current = text
111
- continue
 
112
 
113
- if is_continuation(text):
114
- current += " " + text
 
 
 
115
  else:
116
- # Nouvelle cellule logique (changement fort)
117
- if text[0].isupper() and len(current) > 25:
118
- cells.append(current.strip())
119
- current = text
120
- else:
121
- current += " " + text
122
 
123
  if current:
124
- cells.append(current.strip())
125
 
126
- # -----------------------------
127
- # SORTIE
128
- # -----------------------------
129
- cells = cells[:9]
 
 
 
 
 
 
 
130
 
131
- if not cells:
132
- return "Aucune désignation détectée."
133
 
134
- return "\n".join(f"{i+1}. {c}" for i, c in enumerate(cells))
135
 
136
- # -------------------------------------------------
137
- # INTERFACE GRADIO
138
- # -------------------------------------------------
139
  demo = gr.Interface(
140
- fn=extract_designations,
141
  inputs=gr.Image(type="pil", label="Image du tableau"),
142
- outputs=gr.Textbox(label="Colonne DESIGNATIONS (V7)"),
143
  title="Extraction fiable de la colonne DESIGNATIONS",
144
- description="Règle F majuscule respectée – F6 = continuation (cellule 7)"
145
  )
146
 
147
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
  import numpy as np
3
  from paddleocr import PaddleOCR
4
+ from sklearn.cluster import KMeans
5
+
6
# OCR engine built once at import time and shared by every extraction call;
# it is configured with text-line orientation handling and lang="fr".
ocr = PaddleOCR(use_textline_orientation=True, lang="fr")

# Upper-case text of the column header that marks where the table rows begin.
HEADER_EXACT = "DESIGNATIONS"
12
+
13
def extract_column2_9_lines(image):
    """Extract up to nine rows of the DESIGNATIONS column from a table image.

    The image is run through the module-level ``ocr`` engine, the detected
    text fragments are grouped into columns by clustering their horizontal
    centres, and the column holding the most digit-free text is taken as the
    description column. Fragments below the header are merged into rows
    according to their vertical spacing, then short or mostly numeric rows
    are dropped.

    Args:
        image: The input picture (anything ``np.array`` accepts; the UI passes
            a PIL image), or ``None`` when nothing was uploaded.

    Returns:
        A string with one numbered row per line (at most nine), or a French
        status message when no image, no text, too little text, or no row
        could be extracted.
    """
    # Local import: only needed for the accent-insensitive header comparison.
    import unicodedata

    def _normalize(label):
        """Upper-case ``label`` and strip accents so 'Désignations' matches."""
        decomposed = unicodedata.normalize("NFKD", label)
        stripped = "".join(c for c in decomposed if not unicodedata.combining(c))
        return stripped.upper().strip()

    if image is None:
        return "Aucune image fournie."

    img = np.array(image)
    result = ocr.predict(img)

    if not result:
        return "Aucun texte détecté."

    data = result[0]
    texts = data.get("rec_texts", [])
    boxes = data.get("dt_polys", [])

    # One (x_centre, y_centre, text) triple per OCR fragment; fragments shorter
    # than two characters are treated as noise.
    elements = []
    for text, box in zip(texts, boxes):
        text = text.strip()
        if len(text) < 2:
            continue
        x = np.mean([p[0] for p in box])
        y = np.mean([p[1] for p in box])
        elements.append((x, y, text))

    if len(elements) < 5:
        return "Pas assez de données OCR."

    # --- Column clustering on the horizontal centre only ---
    X = np.array([[e[0]] for e in elements])
    kmeans = KMeans(
        n_clusters=min(7, len(elements) // 6 + 2),
        random_state=42,
        n_init=10,
    )
    labels = kmeans.fit_predict(X)

    columns = {}
    for element, lbl in zip(elements, labels):
        columns.setdefault(lbl, []).append(element)

    # --- Description column = the one with the most digit-free text ---
    def score(col):
        return sum(len(t) for _, _, t in col if not any(c.isdigit() for c in t))

    # sorted() instead of list.sort() so the cluster lists are not mutated.
    desc_col = sorted(max(columns.values(), key=score), key=lambda e: e[1])

    # --- Locate the header ---
    header = _normalize(HEADER_EXACT)
    header_index = next(
        (i for i, (_, _, t) in enumerate(desc_col) if _normalize(t) == header),
        None,
    )

    if header_index is not None:
        content = desc_col[header_index + 1:]
    else:
        # The header may have been clustered into a neighbouring column (its
        # centre differs from the rows' centres). Use its vertical position
        # so that text above the table is still excluded.
        header_ys = [y for _, y, t in elements if _normalize(t) == header]
        if header_ys:
            top = min(header_ys)
            content = [e for e in desc_col if e[1] > top]
        else:
            content = desc_col

    # --- Adaptive vertical threshold ---
    # content is already ordered top to bottom, so consecutive differences
    # are the gaps between neighbouring fragments.
    ys = [y for _, y, _ in content]
    if len(ys) > 1:
        y_threshold = max(22, np.median(np.diff(ys)) * 1.2)
    else:
        y_threshold = 30

    # --- Merge fragments that are vertically close into one row ---
    lines = []
    current = ""
    last_y = None

    for _, y, text in content:
        if last_y is None or abs(y - last_y) > y_threshold:
            if current:
                lines.append(current.strip())
            current = text
        else:
            current += " " + text
        last_y = y

    if current:
        lines.append(current.strip())

    # --- Cleanup ---
    final = []
    for i, line in enumerate(lines):
        if i == 0:
            # Always keep the first real row, whatever it looks like.
            final.append(line)
            continue
        if len(line) < 5:
            continue
        if sum(c.isdigit() for c in line) > len(line) / 2:
            continue
        final.append(line)

    final = final[:9]

    # Report an explicit message instead of handing the UI an empty string.
    if not final:
        return "Aucune désignation détectée."

    return "\n".join(f"{i + 1}. {line}" for i, line in enumerate(final))
105
 
106
# --- GRADIO ---
# Web UI wiring: one image input, one text output, backed by
# extract_column2_9_lines.
_image_input = gr.Image(type="pil", label="Image du tableau")
_text_output = gr.Textbox(label="Colonne DESIGNATIONS")

demo = gr.Interface(
    fn=extract_column2_9_lines,
    inputs=_image_input,
    outputs=_text_output,
    title="Extraction fiable de la colonne DESIGNATIONS",
)

# Listen on all interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)