kebson committed on
Commit
9bbca4f
·
verified ·
1 Parent(s): ec72508

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -29
app.py CHANGED
@@ -9,9 +9,7 @@ from paddleocr import PaddleOCR
9
  from PIL import Image
10
 
11
 
12
- ocr = PaddleOCR(
13
- lang="en"
14
- )
15
 
16
 
17
  def extract_description_column(image: Image.Image):
@@ -19,14 +17,14 @@ def extract_description_column(image: Image.Image):
19
  return "❌ Aucune image fournie."
20
 
21
  img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
22
-
23
  result = ocr.ocr(img)
 
24
  if not result or not result[0]:
25
  return "❌ Aucun texte détecté."
26
 
27
  words = []
28
 
29
- # 1️⃣ OCR → mots avec positions
30
  for item in result[0]:
31
  box, (text, score) = item
32
  try:
@@ -48,29 +46,44 @@ def extract_description_column(image: Image.Image):
48
  "h": max(ys) - min(ys),
49
  })
50
 
51
- # 2️⃣ Détection colonnes No / Qty / UM
52
- no_col = [w for w in words if w["text"].lower().startswith("no")]
53
- qty_col = [w for w in words if "qty" in w["text"].lower()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- if not no_col or not qty_col:
56
- return " Structure de tableau non reconnue."
 
57
 
58
- x_left = min(w["x"] for w in no_col) + 40
59
- x_right = min(w["x"] for w in qty_col) - 10
60
- y_start = min(w["y"] for w in no_col) + 40
61
 
62
- # 3️⃣ Extraction zone Description
63
- desc_words = [
64
- w for w in words
65
- if x_left <= w["x"] <= x_right and w["y"] > y_start
66
- ]
67
 
68
- if not desc_words:
69
- return "⚠️ Aucun texte détecté dans la colonne Description."
70
 
71
- # 4️⃣ Regroupement par lignes
72
  lines = {}
73
- for w in desc_words:
74
  key = int(w["y"] // 25)
75
  lines.setdefault(key, []).append(w)
76
 
@@ -81,15 +94,15 @@ def extract_description_column(image: Image.Image):
81
  )
82
  ordered_lines.append(line)
83
 
84
- # 5️⃣ Nettoyage
85
  cleaned = []
86
  for line in ordered_lines:
87
  low = line.lower()
88
- if any(x in low for x in ["each", "vat", "net", "gross", "%"]):
89
  continue
90
  cleaned.append(line)
91
 
92
- # 6️⃣ Fusion cellules multilignes
93
  cells = []
94
  buffer = ""
95
 
@@ -104,7 +117,7 @@ def extract_description_column(image: Image.Image):
104
  if buffer:
105
  cells.append(buffer.strip())
106
 
107
- # 7️⃣ Format sortie
108
  output = ""
109
  for i, cell in enumerate(cells, 1):
110
  output += f"{i}. {cell}\n\n"
@@ -115,9 +128,9 @@ def extract_description_column(image: Image.Image):
115
  demo = gr.Interface(
116
  fn=extract_description_column,
117
  inputs=gr.Image(type="pil", label="Image de facture"),
118
- outputs=gr.Textbox(lines=18, label="Colonne Description"),
119
- title="Extraction colonne Description – Factures",
120
- description="Extraction automatique et robuste de la colonne Description"
121
  )
122
 
123
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
9
  from PIL import Image
10
 
11
 
12
+ ocr = PaddleOCR(lang="en", use_gpu=False, show_log=False)
 
 
13
 
14
 
15
  def extract_description_column(image: Image.Image):
 
17
  return "❌ Aucune image fournie."
18
 
19
  img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
 
20
  result = ocr.ocr(img)
21
+
22
  if not result or not result[0]:
23
  return "❌ Aucun texte détecté."
24
 
25
  words = []
26
 
27
+ # 1️⃣ OCR words
28
  for item in result[0]:
29
  box, (text, score) = item
30
  try:
 
46
  "h": max(ys) - min(ys),
47
  })
48
 
49
+ # 2️⃣ Trouver le début du tableau ("ITEMS")
50
+ table_start_y = None
51
+ for w in words:
52
+ if "item" in w["text"].lower():
53
+ table_start_y = w["y"]
54
+ break
55
+
56
+ if table_start_y is None:
57
+ table_start_y = 0 # fallback
58
+
59
+ table_words = [w for w in words if w["y"] > table_start_y + 30]
60
+
61
+ # 3️⃣ Regrouper par colonnes X
62
+ columns = {}
63
+ for w in table_words:
64
+ col_key = int(w["x"] // 50)
65
+ columns.setdefault(col_key, []).append(w)
66
+
67
+ # 4️⃣ Identifier la colonne Description
68
+ best_col = None
69
+ best_score = 0
70
 
71
+ for col in columns.values():
72
+ text_len = sum(len(w["text"]) for w in col)
73
+ numeric_ratio = sum(any(c.isdigit() for c in w["text"]) for w in col) / max(len(col), 1)
74
 
75
+ score = text_len * (1 - numeric_ratio)
 
 
76
 
77
+ if score > best_score:
78
+ best_score = score
79
+ best_col = col
 
 
80
 
81
+ if best_col is None:
82
+ return " Impossible d’identifier la colonne Description."
83
 
84
+ # 5️⃣ Regrouper par lignes
85
  lines = {}
86
+ for w in best_col:
87
  key = int(w["y"] // 25)
88
  lines.setdefault(key, []).append(w)
89
 
 
94
  )
95
  ordered_lines.append(line)
96
 
97
+ # 6️⃣ Nettoyage
98
  cleaned = []
99
  for line in ordered_lines:
100
  low = line.lower()
101
+ if any(x in low for x in ["vat", "net", "gross", "each", "%"]):
102
  continue
103
  cleaned.append(line)
104
 
105
+ # 7️⃣ Fusion multilignes
106
  cells = []
107
  buffer = ""
108
 
 
117
  if buffer:
118
  cells.append(buffer.strip())
119
 
120
+ # 8️⃣ Sortie
121
  output = ""
122
  for i, cell in enumerate(cells, 1):
123
  output += f"{i}. {cell}\n\n"
 
128
  demo = gr.Interface(
129
  fn=extract_description_column,
130
  inputs=gr.Image(type="pil", label="Image de facture"),
131
+ outputs=gr.Textbox(lines=20, label="Colonne Description"),
132
+ title="Extraction robuste de la colonne Description",
133
+ description="Fonctionne sans dépendre des headers OCR"
134
  )
135
 
136
  demo.launch(server_name="0.0.0.0", server_port=7860)