Vladt-Tempest committed on
Commit
2741781
·
1 Parent(s): d8be19f

commercial_invoice working for one invoice image

Browse files
__pycache__/Get_Bounding_Boxes.cpython-310.pyc ADDED
Binary file (1.16 kB). View file
 
commercial_invoice.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from PIL import Image
3
+ import pytesseract
4
+
5
def load_field_areas(coordinates_json):
    """Load the labelled field boxes from a JSON coordinates file.

    Every entry of ``data['boxes']`` provides ``label``, ``x``, ``y``,
    ``width`` and ``height``. The numeric values may be numbers or numeric
    strings; they are converted with ``float``. ``x``/``y`` are used as the
    centre of the box, and the corners are obtained by moving half the
    width/height in each direction and truncating with ``int``.

    Args:
        coordinates_json: Path of the JSON file to read.

    Returns:
        A tuple ``(field_areas, width, height)``. ``field_areas`` maps each
        box label to a dict with the integer keys ``x1``, ``y1``, ``x2`` and
        ``y2``; ``width`` and ``height`` are the top-level values of the
        JSON document. If two boxes share a label, the later one wins.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a required key is missing.
        ValueError: If a coordinate cannot be converted to ``float``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default encoding, so non-ASCII labels load the same way everywhere.
    with open(coordinates_json, 'r', encoding='utf-8') as f:
        data = json.load(f)

    field_areas = {}
    for box in data['boxes']:
        x = float(box['x'])
        y = float(box['y'])
        half_width = float(box['width']) / 2
        half_height = float(box['height']) / 2

        # Convert centre + size into corner coordinates.
        field_areas[box['label']] = {
            "x1": int(x - half_width),
            "y1": int(y - half_height),
            "x2": int(x + half_width),
            "y2": int(y + half_height),
        }
    return field_areas, data['width'], data['height']
24
+
25
def extract_text_from_area(image, area, margin=10):
    """Run OCR on one rectangular region of the image.

    The rectangle given by ``area`` is enlarged by ``margin`` pixels on every
    side and clamped to the image bounds before cropping.

    Args:
        image: Image object exposing ``width``, ``height`` and ``crop``
            (a PIL image in this module).
        area: Dict with the integer keys ``x1``, ``y1``, ``x2`` and ``y2``.
        margin: Tolerance in pixels added around the rectangle.

    Returns:
        The recognised text with surrounding whitespace stripped, or an
        empty string when the clamped rectangle has no area (for example a
        box lying entirely outside the image).
    """
    # Grow the rectangle by the margin while staying inside the image.
    x1 = max(0, area["x1"] - margin)
    y1 = max(0, area["y1"] - margin)
    x2 = min(image.width, area["x2"] + margin)
    y2 = min(image.height, area["y2"] + margin)

    # An empty or inverted rectangle cannot be cropped/OCR'd; report it as
    # "no text" so the caller's existing empty-text handling applies.
    if x2 <= x1 or y2 <= y1:
        return ""

    crop = image.crop((x1, y1, x2, y2))

    # Tesseract options: --oem 3 selects the default OCR engine mode,
    # --psm 6 treats the crop as a single uniform block of text.
    custom_config = r'--oem 3 --psm 6'
    text = pytesseract.image_to_string(crop, lang='eng', config=custom_config)
    return text.strip()
40
+
41
def process_invoice(image_path, coordinates_json, margin=10):
    """Extract the text of every labelled field of an invoice image.

    The image is resized to the dimensions recorded in the coordinates file
    when they differ, so that the box coordinates line up with the pixels.
    Each field is read with ``margin`` first; if that yields no text, a
    single retry with twice the margin is made.

    Args:
        image_path: Path of the invoice image.
        coordinates_json: Path of the JSON file describing the field boxes.
        margin: Tolerance in pixels added around each box.

    Returns:
        Dict mapping field label to the extracted text. Fields for which
        neither attempt produced text are omitted.
    """
    extracted_fields = {}

    # Use a context manager so the underlying image file is closed even if
    # loading the coordinates or the OCR step raises.
    with Image.open(image_path) as source:
        field_areas, img_width, img_height = load_field_areas(coordinates_json)

        # Box coordinates refer to an image of (img_width, img_height).
        page = source
        if page.size != (img_width, img_height):
            page = page.resize((img_width, img_height))

        for label, area in field_areas.items():
            # First try the requested margin, then fall back to a wider one.
            for attempt_margin in (margin, margin * 2):
                text = extract_text_from_area(page, area, attempt_margin)
                if text:
                    extracted_fields[label] = text
                    break

    return extracted_fields
66
+
67
if __name__ == "__main__":
    # Sample input: one invoice page and the box layout describing it.
    coordinates_json = "./coordinates_CI.json"
    image_path = "./invoices/pagina_9.jpg"

    # Run the extraction with the default 10-pixel tolerance.
    results = process_invoice(image_path, coordinates_json, margin=10)

    # Report every field that produced text, one per line.
    print("\nCampos encontrados:")
    for label in results:
        print(f"{label}: {results[label]}")
coordinates_CI.json ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boxes": [
3
+ {
4
+ "id": "1",
5
+ "label": "Farm",
6
+ "x": "950.00",
7
+ "y": "158.33",
8
+ "width": "1040.00",
9
+ "height": "50.00",
10
+ "confidence": null
11
+ },
12
+ {
13
+ "id": "2",
14
+ "label": "Farm_ruc",
15
+ "x": "623.33",
16
+ "y": "220.00",
17
+ "width": "380.00",
18
+ "height": "66.67",
19
+ "confidence": null
20
+ },
21
+ {
22
+ "id": "3",
23
+ "label": "Farm_address",
24
+ "x": "946.67",
25
+ "y": "286.67",
26
+ "width": "1040.00",
27
+ "height": "53.33",
28
+ "confidence": null
29
+ },
30
+ {
31
+ "id": "4",
32
+ "label": "Farm_city_country",
33
+ "x": "870.00",
34
+ "y": "385.00",
35
+ "width": "873.33",
36
+ "height": "50.00",
37
+ "confidence": null
38
+ },
39
+ {
40
+ "id": "5",
41
+ "label": "Farm_phone",
42
+ "x": "583.33",
43
+ "y": "451.67",
44
+ "width": "306.67",
45
+ "height": "63.33",
46
+ "confidence": null
47
+ },
48
+ {
49
+ "id": "6",
50
+ "label": "Client",
51
+ "x": "951.67",
52
+ "y": "631.67",
53
+ "width": "1056.67",
54
+ "height": "50.00",
55
+ "confidence": null
56
+ },
57
+ {
58
+ "id": "7",
59
+ "label": "Client_address",
60
+ "x": "953.33",
61
+ "y": "696.67",
62
+ "width": "1066.67",
63
+ "height": "60.00",
64
+ "confidence": null
65
+ },
66
+ {
67
+ "id": "8",
68
+ "label": "Client_city_country",
69
+ "x": "621.67",
70
+ "y": "761.67",
71
+ "width": "390.00",
72
+ "height": "56.67",
73
+ "confidence": null
74
+ },
75
+ {
76
+ "id": "9",
77
+ "label": "Client_phone",
78
+ "x": "618.33",
79
+ "y": "828.33",
80
+ "width": "390.00",
81
+ "height": "56.67",
82
+ "confidence": null
83
+ },
84
+ {
85
+ "id": "A",
86
+ "label": "invoice_date",
87
+ "x": "1846.67",
88
+ "y": "151.67",
89
+ "width": "220.00",
90
+ "height": "50.00",
91
+ "confidence": null
92
+ },
93
+ {
94
+ "id": "B",
95
+ "label": "invoice_number",
96
+ "x": "1868.33",
97
+ "y": "255.00",
98
+ "width": "223.33",
99
+ "height": "50.00",
100
+ "confidence": null
101
+ },
102
+ {
103
+ "id": "C",
104
+ "label": "invoice_country",
105
+ "x": "1840.00",
106
+ "y": "308.33",
107
+ "width": "186.67",
108
+ "height": "36.67",
109
+ "confidence": null
110
+ },
111
+ {
112
+ "id": "D",
113
+ "label": "Fue",
114
+ "x": "1970.00",
115
+ "y": "401.67",
116
+ "width": "460.00",
117
+ "height": "50.00",
118
+ "confidence": null
119
+ },
120
+ {
121
+ "id": "E",
122
+ "label": "Air_way_bill",
123
+ "x": "1991.67",
124
+ "y": "453.33",
125
+ "width": "330.00",
126
+ "height": "40.00",
127
+ "confidence": null
128
+ },
129
+ {
130
+ "id": "F",
131
+ "label": "House_Air_way_bill",
132
+ "x": "2061.67",
133
+ "y": "505.00",
134
+ "width": "250.00",
135
+ "height": "43.33",
136
+ "confidence": null
137
+ },
138
+ {
139
+ "id": "G",
140
+ "label": "Airline",
141
+ "x": "1963.33",
142
+ "y": "553.33",
143
+ "width": "466.67",
144
+ "height": "46.67",
145
+ "confidence": null
146
+ },
147
+ {
148
+ "id": "H",
149
+ "label": "Boxes_01",
150
+ "x": "210.00",
151
+ "y": "1048.33",
152
+ "width": "180.00",
153
+ "height": "56.67",
154
+ "confidence": null
155
+ },
156
+ {
157
+ "id": "I",
158
+ "label": "Pieces_01",
159
+ "x": "448.33",
160
+ "y": "1050.00",
161
+ "width": "256.67",
162
+ "height": "53.33",
163
+ "confidence": null
164
+ },
165
+ {
166
+ "id": "J",
167
+ "label": "Product_01",
168
+ "x": "881.67",
169
+ "y": "1048.33",
170
+ "width": "536.67",
171
+ "height": "56.67",
172
+ "confidence": null
173
+ },
174
+ {
175
+ "id": "K",
176
+ "label": "Tariff_number_01",
177
+ "x": "1318.33",
178
+ "y": "1053.33",
179
+ "width": "283.33",
180
+ "height": "60.00",
181
+ "confidence": null
182
+ },
183
+ {
184
+ "id": "L",
185
+ "label": "Stems_01",
186
+ "x": "1775.00",
187
+ "y": "1048.33",
188
+ "width": "130.00",
189
+ "height": "56.67",
190
+ "confidence": null
191
+ },
192
+ {
193
+ "id": "M",
194
+ "label": "Unit_price_01",
195
+ "x": "1980.00",
196
+ "y": "1048.33",
197
+ "width": "186.67",
198
+ "height": "50.00",
199
+ "confidence": null
200
+ },
201
+ {
202
+ "id": "N",
203
+ "label": "Extended_price_01",
204
+ "x": "2210.00",
205
+ "y": "1051.67",
206
+ "width": "206.67",
207
+ "height": "50.00",
208
+ "confidence": null
209
+ },
210
+ {
211
+ "id": "O",
212
+ "label": "Boxes_02",
213
+ "x": "210.00",
214
+ "y": "1108.33",
215
+ "width": "186.67",
216
+ "height": "43.33",
217
+ "confidence": null
218
+ },
219
+ {
220
+ "id": "P",
221
+ "label": "Pieces_02",
222
+ "x": "446.67",
223
+ "y": "1108.33",
224
+ "width": "260.00",
225
+ "height": "50.00",
226
+ "confidence": null
227
+ },
228
+ {
229
+ "id": "Q",
230
+ "label": "Product_02",
231
+ "x": "886.67",
232
+ "y": "1103.33",
233
+ "width": "540.00",
234
+ "height": "46.67",
235
+ "confidence": null
236
+ },
237
+ {
238
+ "id": "R",
239
+ "label": "Tariff_number_02",
240
+ "x": "1316.67",
241
+ "y": "1110.00",
242
+ "width": "273.33",
243
+ "height": "46.67",
244
+ "confidence": null
245
+ },
246
+ {
247
+ "id": "S",
248
+ "label": "Stems_02",
249
+ "x": "1781.67",
250
+ "y": "1106.67",
251
+ "width": "143.33",
252
+ "height": "46.67",
253
+ "confidence": null
254
+ },
255
+ {
256
+ "id": "T",
257
+ "label": "Unit_price_02",
258
+ "x": "1978.33",
259
+ "y": "1105.00",
260
+ "width": "196.67",
261
+ "height": "50.00",
262
+ "confidence": null
263
+ },
264
+ {
265
+ "id": "U",
266
+ "label": "Extended_price_02",
267
+ "x": "2211.67",
268
+ "y": "1106.67",
269
+ "width": "216.67",
270
+ "height": "40.00",
271
+ "confidence": null
272
+ },
273
+ {
274
+ "id": "V",
275
+ "label": "Boxes_03",
276
+ "x": "208.33",
277
+ "y": "1161.67",
278
+ "width": "183.33",
279
+ "height": "50.00",
280
+ "confidence": null
281
+ },
282
+ {
283
+ "id": "W",
284
+ "label": "Pieces_03",
285
+ "x": "446.67",
286
+ "y": "1165.00",
287
+ "width": "260.00",
288
+ "height": "56.67",
289
+ "confidence": null
290
+ },
291
+ {
292
+ "id": "X",
293
+ "label": "Product_03",
294
+ "x": "888.33",
295
+ "y": "1161.67",
296
+ "width": "543.33",
297
+ "height": "50.00",
298
+ "confidence": null
299
+ },
300
+ {
301
+ "id": "Y",
302
+ "label": "Tariff_number_03",
303
+ "x": "1318.33",
304
+ "y": "1166.67",
305
+ "width": "270.00",
306
+ "height": "53.33",
307
+ "confidence": null
308
+ },
309
+ {
310
+ "id": "Z",
311
+ "label": "Stems_03",
312
+ "x": "1781.67",
313
+ "y": "1158.33",
314
+ "width": "143.33",
315
+ "height": "50.00",
316
+ "confidence": null
317
+ },
318
+ {
319
+ "id": "a",
320
+ "label": "Unit_price_03",
321
+ "x": "1985.00",
322
+ "y": "1158.33",
323
+ "width": "203.33",
324
+ "height": "50.00",
325
+ "confidence": null
326
+ },
327
+ {
328
+ "id": "b",
329
+ "label": "Extended_price_03",
330
+ "x": "2216.67",
331
+ "y": "1158.33",
332
+ "width": "226.67",
333
+ "height": "43.33",
334
+ "confidence": null
335
+ },
336
+ {
337
+ "id": "c",
338
+ "label": "Forwarder",
339
+ "x": "1786.67",
340
+ "y": "1486.67",
341
+ "width": "1086.67",
342
+ "height": "426.67",
343
+ "confidence": null
344
+ }
345
+ ],
346
+ "height": 3509,
347
+ "key": "pagina_1.jpg",
348
+ "width": 2480
349
+ }
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
  gradio
2
  pytesseract
3
- pdf2image
 
1
  gradio
2
  pytesseract
3
+ pdf2image