kebson committed on
Commit
cef6308
·
verified ·
1 Parent(s): e76b20c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -76
app.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import gradio as gr
2
  import cv2
3
  import numpy as np
@@ -5,7 +9,11 @@ from paddleocr import PaddleOCR
5
  from PIL import Image
6
 
7
 
8
- ocr = PaddleOCR(lang="en")
 
 
 
 
9
 
10
 
11
  def extract_description_column(image: Image.Image):
@@ -14,99 +22,76 @@ def extract_description_column(image: Image.Image):
14
 
15
  img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
16
 
17
- result = ocr.ocr(img, cls=False)
18
- if not result:
19
  return "❌ Aucun texte détecté."
20
 
21
  words = []
22
 
23
- # OCR → blocs normalisés
24
- for line in result:
25
- for item in line:
26
- box, (text, score) = item
27
- if not text.strip():
28
- continue
29
-
30
- xs = [p[0] for p in box]
31
- ys = [p[1] for p in box]
32
-
33
- words.append({
34
- "text": text.strip(),
35
- "x": min(xs),
36
- "y": min(ys),
37
- "w": max(xs) - min(xs),
38
- "h": max(ys) - min(ys),
39
- })
40
-
41
- if not words:
42
- return " OCR vide."
43
-
44
- # ----------------------------
45
- # 1️⃣ Détection ligne header (celle avec No / Description / Qty)
46
- # ----------------------------
47
- header_y = min(
48
- w["y"] for w in words
49
- if any(k in w["text"].lower() for k in ["no", "qty", "description"])
50
- )
51
-
52
- header_words = [w for w in words if abs(w["y"] - header_y) < 15]
53
-
54
- header_words = sorted(header_words, key=lambda x: x["x"])
55
-
56
- if len(header_words) < 3:
57
- return "❌ Header du tableau non détecté."
58
-
59
- # ----------------------------
60
- # 2️⃣ Colonne Description = entre No. et Qty
61
- # ----------------------------
62
- # No. → colonne 1
63
- # Description → colonne 2
64
- # Qty → colonne 3
65
-
66
- x_min = header_words[1]["x"] - 10
67
- x_max = header_words[2]["x"] - 10
68
-
69
- # ----------------------------
70
- # 3️⃣ Mots sous la colonne
71
- # ----------------------------
72
- column_words = [
73
  w for w in words
74
- if x_min <= w["x"] <= x_max and w["y"] > header_y + 20
75
  ]
76
 
77
- if not column_words:
78
- return "⚠️ Aucun texte trouvé dans la colonne Description."
79
 
80
- # ----------------------------
81
- # 4️⃣ Regroupement par lignes visuelles
82
- # ----------------------------
83
  lines = {}
84
- for w in column_words:
85
- key = int(w["y"] // 18)
86
  lines.setdefault(key, []).append(w)
87
 
88
  ordered_lines = []
89
- for k in sorted(lines):
90
  line = " ".join(
91
  w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
92
  )
93
  ordered_lines.append(line)
94
 
95
- # ----------------------------
96
- # 5️⃣ Nettoyage (prix / VAT / unités)
97
- # ----------------------------
98
  cleaned = []
99
  for line in ordered_lines:
100
  low = line.lower()
101
- if any(x in low for x in ["vat", "each", "%"]):
102
- continue
103
- if line.replace(".", "").replace(",", "").isdigit():
104
  continue
105
  cleaned.append(line)
106
 
107
- # ----------------------------
108
- # 6️⃣ Fusion multi-lignes (cellules)
109
- # ----------------------------
110
  cells = []
111
  buffer = ""
112
 
@@ -121,9 +106,7 @@ def extract_description_column(image: Image.Image):
121
  if buffer:
122
  cells.append(buffer.strip())
123
 
124
- # ----------------------------
125
- # Résultat final
126
- # ----------------------------
127
  output = ""
128
  for i, cell in enumerate(cells, 1):
129
  output += f"{i}. {cell}\n\n"
@@ -135,8 +118,8 @@ demo = gr.Interface(
135
  fn=extract_description_column,
136
  inputs=gr.Image(type="pil", label="Image de facture"),
137
  outputs=gr.Textbox(lines=18, label="Colonne Description"),
138
- title="Extraction colonne Description – PaddleOCR",
139
- description="Extraction robuste de la 2ᵉ colonne (Description) des factures."
140
  )
141
 
142
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+ os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
4
+
5
  import gradio as gr
6
  import cv2
7
  import numpy as np
 
9
  from PIL import Image
10
 
11
 
12
+ ocr = PaddleOCR(
13
+ lang="en",
14
+ use_gpu=False,
15
+ show_log=False
16
+ )
17
 
18
 
19
  def extract_description_column(image: Image.Image):
 
22
 
23
  img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
24
 
25
+ result = ocr.ocr(img)
26
+ if not result or not result[0]:
27
  return "❌ Aucun texte détecté."
28
 
29
  words = []
30
 
31
+ # 1️⃣ OCR → mots avec positions
32
+ for item in result[0]:
33
+ box, (text, score) = item
34
+ try:
35
+ score = float(score)
36
+ except:
37
+ score = 1.0
38
+
39
+ if score < 0.4 or not text.strip():
40
+ continue
41
+
42
+ xs = [p[0] for p in box]
43
+ ys = [p[1] for p in box]
44
+
45
+ words.append({
46
+ "text": text.strip(),
47
+ "x": min(xs),
48
+ "y": min(ys),
49
+ "w": max(xs) - min(xs),
50
+ "h": max(ys) - min(ys),
51
+ })
52
+
53
+ # 2️⃣ Détection colonnes No / Qty / UM
54
+ no_col = [w for w in words if w["text"].lower().startswith("no")]
55
+ qty_col = [w for w in words if "qty" in w["text"].lower()]
56
+
57
+ if not no_col or not qty_col:
58
+ return "❌ Structure de tableau non reconnue."
59
+
60
+ x_left = min(w["x"] for w in no_col) + 40
61
+ x_right = min(w["x"] for w in qty_col) - 10
62
+ y_start = min(w["y"] for w in no_col) + 40
63
+
64
+ # 3️⃣ Extraction zone Description
65
+ desc_words = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  w for w in words
67
+ if x_left <= w["x"] <= x_right and w["y"] > y_start
68
  ]
69
 
70
+ if not desc_words:
71
+ return "⚠️ Aucun texte détecté dans la colonne Description."
72
 
73
+ # 4️⃣ Regroupement par lignes
 
 
74
  lines = {}
75
+ for w in desc_words:
76
+ key = int(w["y"] // 25)
77
  lines.setdefault(key, []).append(w)
78
 
79
  ordered_lines = []
80
+ for k in sorted(lines.keys()):
81
  line = " ".join(
82
  w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
83
  )
84
  ordered_lines.append(line)
85
 
86
+ # 5️⃣ Nettoyage
 
 
87
  cleaned = []
88
  for line in ordered_lines:
89
  low = line.lower()
90
+ if any(x in low for x in ["each", "vat", "net", "gross", "%"]):
 
 
91
  continue
92
  cleaned.append(line)
93
 
94
+ # 6️⃣ Fusion cellules multilignes
 
 
95
  cells = []
96
  buffer = ""
97
 
 
106
  if buffer:
107
  cells.append(buffer.strip())
108
 
109
+ # 7️⃣ Format sortie
 
 
110
  output = ""
111
  for i, cell in enumerate(cells, 1):
112
  output += f"{i}. {cell}\n\n"
 
118
  fn=extract_description_column,
119
  inputs=gr.Image(type="pil", label="Image de facture"),
120
  outputs=gr.Textbox(lines=18, label="Colonne Description"),
121
+ title="Extraction colonne Description – Factures",
122
+ description="Extraction automatique et robuste de la colonne Description"
123
  )
124
 
125
  demo.launch(server_name="0.0.0.0", server_port=7860)