kebson committed on
Commit
f7bee90
·
verified ·
1 Parent(s): 8d4694b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -44
app.py CHANGED
@@ -1,43 +1,38 @@
1
  import gradio as gr
2
  import cv2
3
  import numpy as np
4
- from PIL import Image
5
  from paddleocr import PaddleOCR
 
6
 
7
- ocr = PaddleOCR(
8
- use_angle_cls=True,
9
- lang="en"
10
- )
11
 
 
 
12
 
13
- def extract_descriptions(image: Image.Image):
 
14
  if image is None:
15
- return "Aucune image fournie."
16
 
 
17
  img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
18
 
 
19
  result = ocr.ocr(img)
 
 
20
 
21
  words = []
22
 
23
- for line in result[0]:
24
- text, score, box = None, None, None
25
-
26
- # Parsing défensif PaddleOCR
27
- if isinstance(line, (list, tuple)):
28
- if len(line) >= 2 and isinstance(line[1], (list, tuple)):
29
- box = line[0]
30
- text = line[1][0]
31
- score = line[1][1]
32
- elif len(line) >= 3:
33
- box = line[0]
34
- text = line[1]
35
- score = line[2]
36
-
37
- if box is None or text is None:
38
  continue
39
 
40
- # 🔒 Sécurisation du score
41
  try:
42
  score = float(score)
43
  except:
@@ -57,18 +52,18 @@ def extract_descriptions(image: Image.Image):
57
  "h": max(ys) - min(ys),
58
  })
59
 
60
- # 1️⃣ Détecter la colonne Description
61
  header = next(
62
  (w for w in words if "description" in w["text"].lower()),
63
  None
64
  )
65
 
66
  if header is None:
67
- return "❌ Colonne 'Description' non détectée."
68
 
69
- # 2️⃣ Zone de la colonne
70
- x_min = header["x"] - 15
71
- x_max = header["x"] + header["w"] + 380
72
  y_min = header["y"] + header["h"] + 10
73
 
74
  column_words = [
@@ -76,49 +71,70 @@ def extract_descriptions(image: Image.Image):
76
  if x_min <= w["x"] <= x_max and w["y"] > y_min
77
  ]
78
 
79
- # 3️⃣ Regroupement par lignes
 
 
 
80
  lines = {}
81
  for w in column_words:
82
- key = int(w["y"] // 18)
83
  lines.setdefault(key, []).append(w)
84
 
85
- raw_lines = []
86
- for k in sorted(lines):
87
  line = " ".join(
88
  w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
89
  )
 
90
 
 
 
 
91
  low = line.lower()
92
- if any(x in low for x in ["vat", "gross", "net", "total", "each"]):
 
93
  continue
 
94
  if line.replace(".", "").replace(",", "").isdigit():
95
  continue
96
 
97
- raw_lines.append(line)
98
 
99
- # 4️⃣ Fusion multilignes
100
- final = []
101
  buffer = ""
102
 
103
- for line in raw_lines:
104
  if line[:2].replace(".", "").isdigit():
105
  if buffer:
106
- final.append(buffer.strip())
107
  buffer = line.split(".", 1)[-1].strip()
108
  else:
109
  buffer += " " + line
110
 
111
  if buffer:
112
- final.append(buffer.strip())
 
 
 
 
 
113
 
114
- return "\n".join(final) if final else "⚠️ Aucun texte extrait."
115
 
116
 
 
117
  demo = gr.Interface(
118
- fn=extract_descriptions,
119
- inputs=gr.Image(type="pil"),
120
- outputs=gr.Textbox(lines=20),
121
- title="Extraction colonne Description PaddleOCR (Production Safe)"
 
 
 
 
 
 
122
  )
123
 
124
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
  import cv2
3
  import numpy as np
 
4
  from paddleocr import PaddleOCR
5
+ from PIL import Image
6
 
 
 
 
 
7
 
8
+ # ✅ Configuration la plus compatible (CPU / Hugging Face)
9
+ ocr = PaddleOCR(lang="en")
10
 
11
+
12
+ def extract_description_column(image: Image.Image):
13
  if image is None:
14
+ return "Aucune image fournie."
15
 
16
+ # Conversion image
17
  img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
18
 
19
+ # OCR
20
  result = ocr.ocr(img)
21
+ if not result or not result[0]:
22
+ return "❌ Aucun texte détecté."
23
 
24
  words = []
25
 
26
+ # 1️⃣ Collecte OCR
27
+ for item in result[0]:
28
+ try:
29
+ box = item[0]
30
+ text = item[1][0]
31
+ score = item[1][1]
32
+ except Exception:
 
 
 
 
 
 
 
 
33
  continue
34
 
35
+ # Sécurisation du score
36
  try:
37
  score = float(score)
38
  except:
 
52
  "h": max(ys) - min(ys),
53
  })
54
 
55
+ # 2️⃣ Détection header "Description"
56
  header = next(
57
  (w for w in words if "description" in w["text"].lower()),
58
  None
59
  )
60
 
61
  if header is None:
62
+ return "❌ Colonne 'Description' introuvable."
63
 
64
+ # 3️⃣ Zone colonne Description (adaptée facture)
65
+ x_min = header["x"] - 10
66
+ x_max = header["x"] + header["w"] + 450
67
  y_min = header["y"] + header["h"] + 10
68
 
69
  column_words = [
 
71
  if x_min <= w["x"] <= x_max and w["y"] > y_min
72
  ]
73
 
74
+ if not column_words:
75
+ return "⚠️ Aucun contenu détecté sous la colonne Description."
76
+
77
+ # 4️⃣ Regroupement par lignes visuelles
78
  lines = {}
79
  for w in column_words:
80
+ key = int(w["y"] // 20)
81
  lines.setdefault(key, []).append(w)
82
 
83
+ ordered_lines = []
84
+ for k in sorted(lines.keys()):
85
  line = " ".join(
86
  w["text"] for w in sorted(lines[k], key=lambda x: x["x"])
87
  )
88
+ ordered_lines.append(line)
89
 
90
+ # 5️⃣ Nettoyage (prix, VAT, etc.)
91
+ cleaned = []
92
+ for line in ordered_lines:
93
  low = line.lower()
94
+
95
+ if any(x in low for x in ["vat", "net", "gross", "each", "%"]):
96
  continue
97
+
98
  if line.replace(".", "").replace(",", "").isdigit():
99
  continue
100
 
101
+ cleaned.append(line)
102
 
103
+ # 6️⃣ Fusion multilignes (cellules)
104
+ final_cells = []
105
  buffer = ""
106
 
107
+ for line in cleaned:
108
  if line[:2].replace(".", "").isdigit():
109
  if buffer:
110
+ final_cells.append(buffer.strip())
111
  buffer = line.split(".", 1)[-1].strip()
112
  else:
113
  buffer += " " + line
114
 
115
  if buffer:
116
+ final_cells.append(buffer.strip())
117
+
118
+ # Format affichage
119
+ output = ""
120
+ for i, cell in enumerate(final_cells, 1):
121
+ output += f"{i}. {cell}\n\n"
122
 
123
+ return output.strip()
124
 
125
 
126
+ # 🎛️ Interface Gradio
127
  demo = gr.Interface(
128
+ fn=extract_description_column,
129
+ inputs=gr.Image(type="pil", label="Image de facture / tableau"),
130
+ outputs=gr.Textbox(lines=18, label="Contenu de la colonne Description"),
131
+ title="Extraction de la colonne Description (PaddleOCR)",
132
+ description=(
133
+ "Upload une image de facture contenant un tableau.\n"
134
+ "L'application extrait automatiquement tous les éléments "
135
+ "de la colonne 'Description', cellule par cellule."
136
+ ),
137
+ allow_flagging="never"
138
  )
139
 
140
  demo.launch(server_name="0.0.0.0", server_port=7860)