kebson committed on
Commit
68c9a14
·
verified ·
1 Parent(s): 8c6b76a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -124
app.py CHANGED
@@ -1,139 +1,65 @@
1
  import gradio as gr
 
 
 
2
  import numpy as np
3
- from paddleocr import PaddleOCR
4
- from sklearn.cluster import KMeans
5
 
6
- # -----------------------------
7
- # OCR
8
- # -----------------------------
9
# Module-level PaddleOCR engine shared by every request: French language
# model with text-line orientation handling switched on.
ocr = PaddleOCR(lang="fr", use_textline_orientation=True)
13
-
14
- # -----------------------------
15
- # Fonction principale
16
- # -----------------------------
17
def extract_column2_9_lines(image):
    """Return up to nine numbered lines from the most text-heavy column.

    The image is passed to the module-level ``ocr`` engine, the recognised
    fragments are clustered into columns by horizontal position, and the
    column holding the most digit-free text is kept. Vertically close
    fragments of that column are merged into lines, header-like and mostly
    numeric lines are dropped, and the first nine survivors are numbered.

    Args:
        image: Image to analyse (anything ``np.array`` accepts), or ``None``.

    Returns:
        A newline-separated, 1-based numbered list of lines, or a
        user-facing (French) message explaining why nothing was extracted.
    """
    if image is None:
        return "Aucune image fournie."

    predictions = ocr.predict(np.array(image))
    if not predictions or len(predictions) == 0:
        return "OCR exécuté mais aucun texte détecté."

    first_result = predictions[0]
    rec_texts = first_result.get("rec_texts", [])
    polygons = first_result.get("dt_polys", [])
    if not rec_texts:
        return "Aucun texte exploitable détecté."

    # Step 1: keep fragments of at least three characters, each tagged with
    # the centre of its detection polygon.
    fragments = []
    for raw_text, polygon in zip(rec_texts, polygons):
        fragment_text = raw_text.strip()
        if len(fragment_text) >= 3:
            centre_x = np.mean([point[0] for point in polygon])
            centre_y = np.mean([point[1] for point in polygon])
            fragments.append((centre_x, centre_y, fragment_text))

    if len(fragments) < 5:
        return "Pas assez de texte détecté."

    # Step 2: cluster on the x centre only; the cluster count scales with the
    # number of fragments, clamped to the range 3..8.
    x_positions = np.array([[fragment[0]] for fragment in fragments])
    cluster_count = min(8, max(3, len(fragments) // 8))
    model = KMeans(n_clusters=cluster_count, random_state=42, n_init=10)
    assignments = model.fit_predict(x_positions)

    columns = {}
    for fragment, cluster_id in zip(fragments, assignments):
        if cluster_id not in columns:
            columns[cluster_id] = []
        columns[cluster_id].append(fragment)

    # Step 3: the "Description" column is taken to be the one with the most
    # characters in digit-free fragments.
    def non_numeric_length(column):
        return sum(
            len(fragment_text)
            for _, _, fragment_text in column
            if not any(ch.isdigit() for ch in fragment_text)
        )

    description_column = max(columns.values(), key=non_numeric_length)
    top_to_bottom = sorted(description_column, key=lambda fragment: fragment[1])

    # Step 4: merge fragments whose vertical distance to the previously kept
    # fragment is within the threshold; header-like fragments are skipped.
    max_vertical_gap = 22
    header_prefixes = (
        "DESIGNATION", "UNITE", "QUANT", "PRIX", "TOTAL",
        "LOT", "BORDEREAU", "DATE", "NB", "TTC", "HT"
    )

    line_groups = []
    previous_y = None
    for _, centre_y, fragment_text in top_to_bottom:
        if fragment_text.upper().startswith(header_prefixes):
            continue
        starts_new_line = (
            previous_y is None or abs(centre_y - previous_y) > max_vertical_gap
        )
        if starts_new_line:
            line_groups.append([fragment_text])
        else:
            line_groups[-1].append(fragment_text)
        previous_y = centre_y

    merged_lines = [" ".join(group).strip() for group in line_groups]

    # Step 5: discard very short lines and lines that are mostly digits, then
    # keep the first nine.
    kept_lines = [
        merged
        for merged in merged_lines
        if len(merged) >= 5
        and sum(ch.isdigit() for ch in merged) <= len(merged) / 2
    ]
    first_nine = kept_lines[:9]

    if not first_nine:
        return "Colonne détectée mais contenu non exploitable."

    # Requested output format: "1. text", "2. text", ...
    return "\n".join(
        f"{number}. {content}" for number, content in enumerate(first_nine, start=1)
    )
127
 
128
- # -----------------------------
129
- # Interface Gradio
130
- # -----------------------------
131
# Gradio UI: one PIL image in, one text box out, served by
# extract_column2_9_lines.
demo = gr.Interface(
    title="Extraction robuste de la colonne Description",
    description="Optimisé pour tableaux photographiés (devis, factures, bordereaux)",
    fn=extract_column2_9_lines,
    inputs=gr.Image(label="Image du tableau", type="pil"),
    outputs=gr.Textbox(label="Colonne Description (9 lignes)"),
)

# Bind to every network interface on port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")
 
1
  import gradio as gr
2
+ from PIL import Image
3
+ import cv2
4
+ import pytesseract
5
  import numpy as np
 
 
6
 
7
+ pytesseract.pytesseract.tesseract_cmd = "tesseract"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
# Horizontal slack (px) tolerated to the left of the "Description" header.
_COLUMN_LEFT_MARGIN = 10
# Width (px) kept to the right of the header's right edge.
_COLUMN_RIGHT_EXTENT = 350
# Vertical gap (px) skipped below the header before rows are collected.
_HEADER_BOTTOM_MARGIN = 10
# Words whose tops lie within this many pixels of a row's first word share
# that row.
_ROW_TOLERANCE = 15
# Rows containing any of these whole words (totals / unit labels) are dropped.
_SKIPPED_KEYWORDS = frozenset({"vat", "gross", "net", "each"})


def _group_into_rows(words, tolerance=_ROW_TOLERANCE):
    """Cluster word boxes into rows, top to bottom, by their ``y`` value.

    A row is anchored on its topmost word. Anchoring on the data instead of
    on fixed ``y // tolerance`` buckets keeps words of one row together even
    when their tops straddle a bucket boundary (e.g. y=29 and y=31).
    """
    rows = []
    anchor_y = None
    for word in sorted(words, key=lambda w: (w["y"], w["x"])):
        if anchor_y is None or word["y"] - anchor_y >= tolerance:
            rows.append([])
            anchor_y = word["y"]
        rows[-1].append(word)
    return rows


def _has_skipped_keyword(line):
    """Return True when *line* contains a skipped keyword as a whole word."""
    # Split on anything that is not a letter so "VAT:" or "Net/Gross" match,
    # while "Internet" or "Private" (substring hits only) do not.
    tokens = "".join(ch if ch.isalpha() else " " for ch in line.lower()).split()
    return not _SKIPPED_KEYWORDS.isdisjoint(tokens)


def extract_descriptions(image: "Image.Image | None"):
    """Extract the rows listed under an invoice's "Description" header.

    The image is OCR'd with Tesseract (``--psm 6``). The first word reading
    "description" (case-insensitive, surrounding punctuation ignored) is
    used as the column header; words located below it and inside a fixed
    horizontal window are grouped into rows and joined left to right.

    Args:
        image: PIL image of the invoice, or ``None`` when nothing was
            uploaded.

    Returns:
        The extracted rows joined with newlines (possibly an empty string),
        or a user-facing (French) message when no image was supplied or no
        "Description" header was found.
    """
    if image is None:
        # Gradio passes None when the form is submitted without an image.
        return "Aucune image fournie."

    # pytesseract assumes RGB input, so hand it the PIL image directly
    # instead of a BGR array; convert() also normalises grayscale/palette
    # images.
    data = pytesseract.image_to_data(
        image.convert("RGB"),
        output_type=pytesseract.Output.DICT,
        config="--psm 6",
    )

    words = [
        {"text": text.strip(), "x": left, "y": top, "w": width, "h": height}
        for text, left, top, width, height in zip(
            data["text"], data["left"], data["top"], data["width"], data["height"]
        )
        if text.strip()
    ]

    header = next(
        (w for w in words if w["text"].strip(":.;,|").lower() == "description"),
        None,
    )
    if header is None:
        return "Colonne 'Description' non détectée"

    x_min = header["x"] - _COLUMN_LEFT_MARGIN
    x_max = header["x"] + header["w"] + _COLUMN_RIGHT_EXTENT
    y_min = header["y"] + header["h"] + _HEADER_BOTTOM_MARGIN

    column_words = [
        w for w in words
        if x_min <= w["x"] <= x_max and w["y"] > y_min
    ]

    results = []
    for row in _group_into_rows(column_words):
        line = " ".join(w["text"] for w in sorted(row, key=lambda w: w["x"]))
        if not _has_skipped_keyword(line):
            results.append(line)

    return "\n".join(results)
 
56
 
 
 
57
 
 
 
 
58
# Gradio UI: one PIL image in, a 20-line text box out, served by
# extract_descriptions.
demo = gr.Interface(
    title="Extraction colonne Description Factures",
    fn=extract_descriptions,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(lines=20),
)

# Bind to every network interface on port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")