kebson committed on
Commit
9becf7c
·
verified ·
1 Parent(s): 121216c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +149 -137
app.py CHANGED
@@ -1,150 +1,162 @@
1
  import gradio as gr
2
  import numpy as np
 
3
  from paddleocr import PaddleOCR
4
- from unidecode import unidecode
5
 
6
- # -----------------------------
7
- # CONFIG
8
- # -----------------------------
9
# Header labels of the target column (lower-case, unaccented).
# NOTE(review): not referenced by the other visible definitions.
TARGET_HEADERS = [
    "designation", "designations",
    "description", "description des services",
]

# Phrases looked up (as substrings of the normalised cell text) by
# is_blacklisted() to discard price / total cells.
BLACKLIST = [
    "prix htva",
    "prix tva",
    "prix total",
    "prix generale",
    "total general",
    "tva",
]
20
-
21
# Module-wide OCR engine: French language with the angle classifier enabled.
ocr = PaddleOCR(
    lang="fr",
    use_angle_cls=True,
)
22
-
23
- # -----------------------------
24
- # UTILS
25
- # -----------------------------
26
def norm(txt):
    """Return *txt* lower-cased, trimmed and passed through ``unidecode``."""
    lowered = txt.lower()
    trimmed = lowered.strip()
    return unidecode(trimmed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
def is_blacklisted(txt):
    """Tell whether the normalised *txt* contains any ``BLACKLIST`` phrase."""
    normalized = norm(txt)
    for phrase in BLACKLIST:
        if phrase in normalized:
            return True
    return False
32
-
33
def starts_cell(txt):
    """Return True when *txt* is non-empty and starts with an upper-case letter.

    Always returns a real ``bool``: empty or ``None`` input gives ``False``
    rather than the falsy input itself, so the result is safe to store or
    compare.
    """
    return bool(txt) and txt[0].isupper()
35
-
36
- # -----------------------------
37
- # OCR
38
- # -----------------------------
39
def ocr_extract(image):
    """Run OCR on *image* and return one record per recognised text line.

    Args:
        image: Input handed unchanged to ``ocr.ocr``.

    Returns:
        A list of dicts with keys ``text`` (stripped string) and ``x`` / ``y``
        (centre of the bounding box, i.e. the mean of its corner coordinates).
        The list is empty when the engine reports no text.
    """
    result = ocr.ocr(image, cls=True)

    # A page without detections can come back as an empty result or as a
    # None entry; iterating it directly would raise TypeError.
    if not result or not result[0]:
        return []

    words = []
    for line in result[0]:
        box = line[0]
        text = line[1][0]
        x = np.mean([p[0] for p in box])
        y = np.mean([p[1] for p in box])

        words.append({
            "text": text.strip(),
            "x": x,
            "y": y,
        })
    return words
55
-
56
- # -----------------------------
57
- # GROUP ROWS
58
- # -----------------------------
59
def group_rows(words, tol=18):
    """Bucket word records into visual rows by their ``y`` coordinate.

    Words are visited from top to bottom. Each one joins the first existing
    row whose first word lies strictly less than *tol* away vertically;
    otherwise it opens a new row.
    """
    rows = []
    for word in sorted(words, key=lambda item: item["y"]):
        home = next(
            (row for row in rows if abs(row[0]["y"] - word["y"]) < tol),
            None,
        )
        if home is None:
            rows.append([word])
        else:
            home.append(word)
    return rows
73
-
74
- # -----------------------------
75
- # COLUMN 2 DETECTION
76
- # -----------------------------
77
def get_col2_x(rows, gap=45):
    """Estimate the horizontal centre of the second column.

    All word ``x`` positions are sorted and split into clusters: a new
    cluster starts whenever a value lies more than *gap* away from the first
    value of the current cluster. The mean of the second cluster is returned.

    Args:
        rows: Iterable of rows, each a list of word dicts with an ``x`` key.
        gap: Maximum distance to a cluster's first value for an ``x`` to be
            assigned to that cluster.

    Returns:
        The mean ``x`` of the second cluster.

    Raises:
        IndexError: If fewer than two clusters are found (for example an
            empty page or a single-column layout).
    """
    xs = sorted(w["x"] for r in rows for w in r)
    cols = []

    for x in xs:
        if cols and abs(cols[-1][0] - x) <= gap:
            cols[-1].append(x)
        else:
            cols.append([x])

    if len(cols) < 2:
        raise IndexError(
            f"second column not found: only {len(cols)} column(s) detected"
        )

    return np.mean(cols[1])  # second column
89
-
90
- # -----------------------------
91
- # CELL RECONSTRUCTION
92
- # -----------------------------
93
def extract_cells(rows, col2_x):
    """Rebuild the cells of the column centred on *col2_x*.

    For every row, the word closest to *col2_x* is kept when it lies strictly
    less than 65 units away. The kept words are then read from top to bottom:
    a word accepted by ``starts_cell`` opens a new cell, any other word is
    appended to the cell being built.
    """
    picked = []
    for row in rows:
        nearest = min(row, key=lambda w: abs(w["x"] - col2_x))
        if abs(nearest["x"] - col2_x) < 65:
            picked.append(nearest)
    picked.sort(key=lambda w: w["y"])

    cells = []
    pending = ""
    for word in picked:
        text = word["text"]
        if not starts_cell(text):
            # Continuation line: glue it onto the current cell.
            pending += " " + text
            continue
        if pending:
            cells.append(pending.strip())
        pending = text

    if pending:
        cells.append(pending.strip())

    return cells
120
-
121
- # -----------------------------
122
- # CLEAN
123
- # -----------------------------
124
def clean_cells(cells):
    """Return the cells that ``is_blacklisted`` does not reject, in order."""
    kept = []
    for cell in cells:
        if is_blacklisted(cell):
            continue
        kept.append(cell)
    return kept
126
-
127
- # -----------------------------
128
- # PIPELINE
129
- # -----------------------------
130
def extract_column(image):
    """Run the whole pipeline on *image* and return the numbered cells.

    Steps: OCR, row grouping, second-column localisation, cell
    reconstruction, blacklist filtering. The result is one ``"<n>. <cell>"``
    entry per line, numbered from 1.
    """
    rows = group_rows(ocr_extract(image))
    cells = clean_cells(extract_cells(rows, get_col2_x(rows)))

    numbered = []
    for position, cell in enumerate(cells, start=1):
        numbered.append(f"{position}. {cell}")
    return "\n".join(numbered)
138
-
139
- # -----------------------------
140
- # GRADIO
141
- # -----------------------------
142
# Gradio UI: image in, numbered list of column cells out.
with gr.Blocks() as demo:
    gr.Markdown("## Extraction fiable de la colonne 2 (PaddleOCR)")
    img = gr.Image(label="Image du tableau", type="filepath")
    out = gr.Textbox(lines=15, label="Contenu colonne 2")
    btn = gr.Button("Extraire")

    # The button runs the pipeline on the image and fills the text box.
    btn.click(extract_column, inputs=img, outputs=out)

demo.launch()
 
 
 
 
 
 
1
  import gradio as gr
2
  import numpy as np
3
+ import unicodedata
4
  from paddleocr import PaddleOCR
5
+ from sklearn.cluster import KMeans
6
 
7
+ # -------------------------------------------------
8
+ # OCR
9
+ # -------------------------------------------------
10
# Module-wide OCR engine: French language, text-line orientation enabled.
ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
14
+
15
+ # -------------------------------------------------
16
+ # Normalisation texte (casse + accents)
17
+ # -------------------------------------------------
18
def normalize(text: str) -> str:
    """Return *text* lower-cased, without accents, with whitespace collapsed.

    The text is NFD-decomposed so that characters of Unicode category ``Mn``
    (the combining accents) can be dropped; every run of whitespace then
    becomes a single space and the ends are trimmed.
    """
    decomposed = unicodedata.normalize("NFD", text.lower())
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    words = "".join(kept).split()
    return " ".join(words)
23
+
24
+ # -------------------------------------------------
25
+ # Titres valides de la colonne 2
26
+ # -------------------------------------------------
27
# Titles accepted for the target column, compared against normalize() output
# (lower-case, no accents).
COL_TITLES = {
    "description",
    "description des services",
    "designation",
    "designations",
}

# -------------------------------------------------
# Words / lines to ignore
# -------------------------------------------------
IGNORE_KEYWORDS = {
    "general",
    "generale",
    "ht",
    "htva",
    "prix",
    "total",
    "ttc",
    "tva",
}
41
+
42
+ # -------------------------------------------------
43
+ # Fonction principale
44
+ # -------------------------------------------------
45
def _keyword_tokens(normalized):
    """Return the set of alphanumeric words contained in *normalized*."""
    spaced = "".join(ch if ch.isalnum() else " " for ch in normalized)
    return set(spaced.split())


def extract_second_column(image):
    """Extract the text cells of the column titled by one of ``COL_TITLES``.

    OCR is run on *image*; the column is located through the first block whose
    normalised text is a known title. Blocks that sit below that title and
    whose centre is horizontally close to it are read from top to bottom,
    vertically adjacent OCR lines are merged into cells, and price / total
    lines as well as very short or mostly numeric cells are dropped.

    Args:
        image: Image to analyse (anything ``np.array`` accepts), or ``None``.

    Returns:
        The cells as ``"<n>. <cell>"`` lines numbered from 1, or a French
        status message when no result can be produced.
    """
    if image is None:
        return "Aucune image fournie."

    result = ocr.predict(np.array(image))
    if not result:
        return "OCR : aucun texte détecté."

    page = result[0]
    raw_texts = page.get("rec_texts", [])
    polygons = page.get("dt_polys", [])

    # One (text, centre_x, centre_y) tuple per OCR line of at least 2 chars.
    blocks = []
    for raw, poly in zip(raw_texts, polygons):
        text = raw.strip()
        if len(text) < 2:
            continue
        x = np.mean([p[0] for p in poly])
        y = np.mean([p[1] for p in poly])
        blocks.append((text, x, y))

    if len(blocks) < 5:
        return "Pas assez de texte exploitable."

    # -------------------------------------------------
    # 1. Locate the target column through its title
    # -------------------------------------------------
    header = next(
        ((x, y) for text, x, y in blocks if normalize(text) in COL_TITLES),
        None,
    )
    if header is None:
        return "Titre de la colonne cible non détecté."
    col_x, header_y = header

    # -------------------------------------------------
    # 2. Keep the blocks aligned with the title and below it
    # -------------------------------------------------
    # The title itself (and anything above it, such as a letterhead) is not
    # column content, so only blocks strictly below the title are kept.
    x_threshold = 45
    column_blocks = [
        (text, x, y) for text, x, y in blocks
        if abs(x - col_x) < x_threshold and y > header_y
    ]
    if not column_blocks:
        return "Colonne détectée mais vide."

    # -------------------------------------------------
    # 3. Vertical ordering (top to bottom)
    # -------------------------------------------------
    column_blocks.sort(key=lambda block: block[2])

    # -------------------------------------------------
    # 4. Merge OCR lines that belong to the same cell
    # -------------------------------------------------
    y_threshold = 22
    merged = []
    current = ""
    last_y = None

    for text, _x, y in column_blocks:
        # Whole-word comparison: a substring test would also reject cells
        # that merely contain the letters of a keyword (e.g. "ht" inside
        # "Lightning").
        if not _keyword_tokens(normalize(text)).isdisjoint(IGNORE_KEYWORDS):
            continue

        if last_y is None or abs(y - last_y) > y_threshold:
            if current:
                merged.append(current.strip())
            current = text
        else:
            current += " " + text

        last_y = y

    if current:
        merged.append(current.strip())

    # -------------------------------------------------
    # 5. Final clean-up (text cells only)
    # -------------------------------------------------
    final = [
        cell for cell in merged
        if len(normalize(cell)) >= 4
        and sum(ch.isdigit() for ch in cell) <= len(cell) / 2
    ]
    if not final:
        return "Aucune cellule texte valide trouvée."

    # -------------------------------------------------
    # 6. Numbered result
    # -------------------------------------------------
    return "\n".join(
        f"{rank}. {cell}" for rank, cell in enumerate(final, start=1)
    )
147
+
148
+ # -------------------------------------------------
149
+ # Interface Gradio
150
+ # -------------------------------------------------
151
# Single-function Gradio app: one image input, one text output.
demo = gr.Interface(
    title="Extraction fiable de la colonne 2 (Désignation / Description)",
    description=(
        "Extraction robuste de la deuxième colonne des tableaux scannés "
        "(Désignation, DESIGNATIONS, Description, Description des services)."
    ),
    fn=extract_second_column,
    inputs=gr.Image(label="Image du tableau", type="pil"),
    outputs=gr.Textbox(label="Contenu de la colonne 2"),
)

# Serve on every network interface, port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")