kebson committed on
Commit
c794a72
·
verified ·
1 Parent(s): 3cdcef1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -67
app.py CHANGED
@@ -3,9 +3,12 @@ import cv2
3
  import numpy as np
4
  from paddleocr import PaddleOCR
5
  from PIL import Image
 
6
 
 
 
 
7
 
8
- # ✅ Configuration la plus compatible (CPU / Hugging Face)
9
  ocr = PaddleOCR(lang="en")
10
 
11
 
@@ -13,128 +16,131 @@ def extract_description_column(image: Image.Image):
13
  if image is None:
14
  return "❌ Aucune image fournie."
15
 
16
- # Conversion image
17
  img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
18
 
19
- # OCR
20
- result = ocr.ocr(img)
21
- if not result or not result[0]:
22
  return "❌ Aucun texte détecté."
23
 
24
  words = []
25
 
26
- # 1️⃣ Collecte OCR
27
- for item in result[0]:
28
- try:
29
- box = item[0]
30
- text = item[1][0]
31
- score = item[1][1]
32
- except Exception:
33
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- # Sécurisation du score
36
- try:
37
- score = float(score)
38
- except:
39
- score = 1.0
40
 
41
- if score < 0.5 or not str(text).strip():
42
- continue
43
 
44
- xs = [p[0] for p in box]
45
- ys = [p[1] for p in box]
46
-
47
- words.append({
48
- "text": str(text).strip(),
49
- "x": min(xs),
50
- "y": min(ys),
51
- "w": max(xs) - min(xs),
52
- "h": max(ys) - min(ys),
53
- })
54
-
55
- # 2️⃣ Détection header "Description"
56
- header = next(
57
- (w for w in words if "description" in w["text"].lower()),
58
- None
59
- )
60
 
61
- if header is None:
62
- return "❌ Colonne 'Description' introuvable."
 
 
 
 
63
 
64
- # 3️⃣ Zone colonne Description (adaptée facture)
65
- x_min = header["x"] - 10
66
- x_max = header["x"] + header["w"] + 450
67
- y_min = header["y"] + header["h"] + 10
68
 
 
 
 
69
  column_words = [
70
  w for w in words
71
- if x_min <= w["x"] <= x_max and w["y"] > y_min
72
  ]
73
 
74
  if not column_words:
75
- return "⚠️ Aucun contenu détecté sous la colonne Description."
76
 
 
77
  # 4️⃣ Regroupement par lignes visuelles
 
78
  lines = {}
79
  for w in column_words:
80
- key = int(w["y"] // 20)
81
  lines.setdefault(key, []).append(w)
82
 
83
  ordered_lines = []
84
- for k in sorted(lines.keys()):
85
  line = " ".join(
86
  w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
87
  )
88
  ordered_lines.append(line)
89
 
90
- # 5️⃣ Nettoyage (prix, VAT, etc.)
 
 
91
  cleaned = []
92
  for line in ordered_lines:
93
  low = line.lower()
94
-
95
- if any(x in low for x in ["vat", "net", "gross", "each", "%"]):
96
  continue
97
-
98
  if line.replace(".", "").replace(",", "").isdigit():
99
  continue
100
-
101
  cleaned.append(line)
102
 
103
- # 6️⃣ Fusion multilignes (cellules)
104
- final_cells = []
 
 
105
  buffer = ""
106
 
107
  for line in cleaned:
108
  if line[:2].replace(".", "").isdigit():
109
  if buffer:
110
- final_cells.append(buffer.strip())
111
  buffer = line.split(".", 1)[-1].strip()
112
  else:
113
  buffer += " " + line
114
 
115
  if buffer:
116
- final_cells.append(buffer.strip())
117
 
118
- # Format affichage
 
 
119
  output = ""
120
- for i, cell in enumerate(final_cells, 1):
121
  output += f"{i}. {cell}\n\n"
122
 
123
  return output.strip()
124
 
125
 
126
- # 🎛️ Interface Gradio
127
  demo = gr.Interface(
128
  fn=extract_description_column,
129
- inputs=gr.Image(type="pil", label="Image de facture / tableau"),
130
- outputs=gr.Textbox(lines=18, label="Contenu de la colonne Description"),
131
- title="Extraction de la colonne Description (PaddleOCR)",
132
- description=(
133
- "Upload une image de facture contenant un tableau.\n"
134
- "L'application extrait automatiquement tous les éléments "
135
- "de la colonne 'Description', cellule par cellule."
136
- ),
137
-
138
  )
139
 
140
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  import numpy as np
4
  from paddleocr import PaddleOCR
5
  from PIL import Image
6
+ import os
7
 
8
+ # Sécurité HF
9
+ os.environ["OMP_NUM_THREADS"] = "1"
10
+ os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
11
 
 
12
  ocr = PaddleOCR(lang="en")
13
 
14
 
 
16
  if image is None:
17
  return "❌ Aucune image fournie."
18
 
 
19
  img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
20
 
21
+ result = ocr.ocr(img, cls=False)
22
+ if not result:
 
23
  return "❌ Aucun texte détecté."
24
 
25
  words = []
26
 
27
+ # OCR blocs normalisés
28
+ for line in result:
29
+ for item in line:
30
+ box, (text, score) = item
31
+ if not text.strip():
32
+ continue
33
+
34
+ xs = [p[0] for p in box]
35
+ ys = [p[1] for p in box]
36
+
37
+ words.append({
38
+ "text": text.strip(),
39
+ "x": min(xs),
40
+ "y": min(ys),
41
+ "w": max(xs) - min(xs),
42
+ "h": max(ys) - min(ys),
43
+ })
44
+
45
+ if not words:
46
+ return "❌ OCR vide."
47
+
48
+ # ----------------------------
49
+ # 1️⃣ Détection ligne header (celle avec No / Description / Qty)
50
+ # ----------------------------
51
+ header_y = min(
52
+ w["y"] for w in words
53
+ if any(k in w["text"].lower() for k in ["no", "qty", "description"])
54
+ )
55
 
56
+ header_words = [w for w in words if abs(w["y"] - header_y) < 15]
 
 
 
 
57
 
58
+ header_words = sorted(header_words, key=lambda x: x["x"])
 
59
 
60
+ if len(header_words) < 3:
61
+ return "❌ Header du tableau non détecté."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
+ # ----------------------------
64
+ # 2️⃣ Colonne Description = entre No. et Qty
65
+ # ----------------------------
66
+ # No. → colonne 1
67
+ # Description → colonne 2
68
+ # Qty → colonne 3
69
 
70
+ x_min = header_words[1]["x"] - 10
71
+ x_max = header_words[2]["x"] - 10
 
 
72
 
73
+ # ----------------------------
74
+ # 3️⃣ Mots sous la colonne
75
+ # ----------------------------
76
  column_words = [
77
  w for w in words
78
+ if x_min <= w["x"] <= x_max and w["y"] > header_y + 20
79
  ]
80
 
81
  if not column_words:
82
+ return "⚠️ Aucun texte trouvé dans la colonne Description."
83
 
84
+ # ----------------------------
85
  # 4️⃣ Regroupement par lignes visuelles
86
+ # ----------------------------
87
  lines = {}
88
  for w in column_words:
89
+ key = int(w["y"] // 18)
90
  lines.setdefault(key, []).append(w)
91
 
92
  ordered_lines = []
93
+ for k in sorted(lines):
94
  line = " ".join(
95
  w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
96
  )
97
  ordered_lines.append(line)
98
 
99
+ # ----------------------------
100
+ # 5️⃣ Nettoyage (prix / VAT / unités)
101
+ # ----------------------------
102
  cleaned = []
103
  for line in ordered_lines:
104
  low = line.lower()
105
+ if any(x in low for x in ["vat", "each", "%"]):
 
106
  continue
 
107
  if line.replace(".", "").replace(",", "").isdigit():
108
  continue
 
109
  cleaned.append(line)
110
 
111
+ # ----------------------------
112
+ # 6️⃣ Fusion multi-lignes (cellules)
113
+ # ----------------------------
114
+ cells = []
115
  buffer = ""
116
 
117
  for line in cleaned:
118
  if line[:2].replace(".", "").isdigit():
119
  if buffer:
120
+ cells.append(buffer.strip())
121
  buffer = line.split(".", 1)[-1].strip()
122
  else:
123
  buffer += " " + line
124
 
125
  if buffer:
126
+ cells.append(buffer.strip())
127
 
128
+ # ----------------------------
129
+ # Résultat final
130
+ # ----------------------------
131
  output = ""
132
+ for i, cell in enumerate(cells, 1):
133
  output += f"{i}. {cell}\n\n"
134
 
135
  return output.strip()
136
 
137
 
 
138
  demo = gr.Interface(
139
  fn=extract_description_column,
140
+ inputs=gr.Image(type="pil", label="Image de facture"),
141
+ outputs=gr.Textbox(lines=18, label="Colonne Description"),
142
+ title="Extraction colonne Description PaddleOCR",
143
+ description="Extraction robuste de la 2ᵉ colonne (Description) des factures."
 
 
 
 
 
144
  )
145
 
146
  demo.launch(server_name="0.0.0.0", server_port=7860)