ChintanSatva committed on
Commit
1404125
·
verified ·
1 Parent(s): f12bf00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -47
app.py CHANGED
@@ -1,4 +1,8 @@
1
  from fastapi import FastAPI, File, UploadFile, HTTPException
 
 
 
 
2
  import pytesseract
3
  import cv2
4
  import os
@@ -8,7 +12,7 @@ import unicodedata
8
  from pdf2image import convert_from_bytes
9
  from pypdf import PdfReader
10
  import numpy as np
11
- from typing import List
12
  import io
13
  import logging
14
  import time
@@ -18,7 +22,22 @@ import cachetools
18
  import hashlib
19
  import google.generativeai as genai
20
  from dotenv import load_dotenv
21
- from decimal import Decimal
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  app = FastAPI()
24
 
@@ -35,9 +54,10 @@ if not api_key:
35
  logger.error("GOOGLE_API_KEY not set")
36
  raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
37
  genai.configure(api_key=api_key)
38
- model = genai.GenerativeModel("gemini-2.5-flash")
39
 
40
- # Set Tesseract path
 
41
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
42
 
43
  # In-memory caches (1-hour TTL)
@@ -67,7 +87,7 @@ async def process_image(img_bytes, filename, idx):
67
  img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
68
  gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
69
  img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
70
- custom_config = r'--oem 1 --psm 6 -l eng+ara' # Reduced for performance
71
  page_text = pytesseract.image_to_string(img_pil, config=custom_config)
72
  logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
73
  return page_text + "\n"
@@ -83,7 +103,7 @@ async def process_pdf_page(img, page_idx):
83
  img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
84
  gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
85
  img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
86
- custom_config = r'--oem 1 --psm 6 -l eng+ara' # Reduced for performance
87
  page_text = pytesseract.image_to_string(img_pil, config=custom_config)
88
  logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
89
  return page_text + "\n"
@@ -96,16 +116,14 @@ async def process_with_gemini(filename: str, raw_text: str):
96
  start_time = time.time()
97
  logger.info(f"Starting Gemini processing for {filename}, {log_memory_usage()}")
98
 
99
- # Check structured data cache
100
  text_hash = get_text_hash(raw_text)
101
  if text_hash in structured_data_cache:
102
  logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
103
  return structured_data_cache[text_hash]
104
 
105
- # Truncate text for Gemini
106
- if len(raw_text) > 10000:
107
- raw_text = raw_text[:10000]
108
- logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")
109
 
110
  try:
111
  prompt = f"""You are an intelligent invoice data extractor. Given raw text from an invoice (in English or other languages),
@@ -117,6 +135,11 @@ async def process_with_gemini(filename: str, raw_text: str):
117
  - The 'items' list may have multiple entries, each with detailed attributes.
118
  - If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
119
  - Convert any date found in format : YYYY-MM-DD
 
 
 
 
 
120
 
121
  Raw text:
122
  {raw_text}
@@ -189,10 +212,18 @@ Output JSON:
189
  json_start = llm_output.find("{")
190
  json_end = llm_output.rfind("}") + 1
191
  json_str = llm_output[json_start:json_end]
 
 
 
 
 
192
  structured_data = json.loads(json_str, parse_float=Decimal)
 
 
193
  structured_data_cache[text_hash] = structured_data
194
  logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
195
- print("Structured Data", structured_data)
 
196
  return structured_data
197
  except Exception as e:
198
  logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
@@ -200,7 +231,7 @@ Output JSON:
200
 
201
  @app.post("/ocr")
202
  async def extract_and_structure(files: List[UploadFile] = File(...)):
203
- output_json = {
204
  "success": True,
205
  "message": "",
206
  "data": []
@@ -214,12 +245,11 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
214
  total_start_time = time.time()
215
  logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")
216
 
217
- # Validate file format
218
  valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}
219
  file_ext = os.path.splitext(file.filename.lower())[1]
220
  if file_ext not in valid_extensions:
221
  fail_count += 1
222
- output_json["data"].append({
223
  "filename": file.filename,
224
  "structured_data": {"error": f"Unsupported file format: {file_ext}"},
225
  "error": f"Unsupported file format: {file_ext}"
@@ -227,7 +257,6 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
227
  logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
228
  continue
229
 
230
- # Read file into memory
231
  try:
232
  file_start_time = time.time()
233
  file_bytes = await file.read()
@@ -236,7 +265,7 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
236
  logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
237
  except Exception as e:
238
  fail_count += 1
239
- output_json["data"].append({
240
  "filename": file.filename,
241
  "structured_data": {"error": f"Failed to read file: {str(e)}"},
242
  "error": f"Failed to read file: {str(e)}"
@@ -244,14 +273,12 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
244
  logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
245
  continue
246
 
247
- # Check raw text cache
248
  raw_text = ""
249
  if file_hash in raw_text_cache:
250
  raw_text = raw_text_cache[file_hash]
251
  logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
252
  else:
253
  if file_ext == '.pdf':
254
- # Try extracting embedded text
255
  try:
256
  extract_start_time = time.time()
257
  reader = PdfReader(file_stream)
@@ -263,68 +290,67 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
263
  except Exception as e:
264
  logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")
265
 
266
- # If no embedded text, perform OCR
267
  if not raw_text.strip():
268
  try:
269
  convert_start_time = time.time()
270
- images = convert_from_bytes(file_bytes, poppler_path="/usr/local/bin", dpi=100)
271
  logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
272
 
273
- ocr_start_time = time.time()
274
- page_texts = []
275
- for i, img in enumerate(images):
276
- page_text = await process_pdf_page(img, i)
277
- page_texts.append(page_text)
278
  raw_text = "".join(page_texts)
279
- logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
280
  except Exception as e:
281
  fail_count += 1
282
- output_json["data"].append({
283
  "filename": file.filename,
284
  "structured_data": {"error": f"OCR failed: {str(e)}"},
285
  "error": f"OCR failed: {str(e)}"
286
  })
287
  logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
288
  continue
289
- else: # JPG/JPEG/PNG
290
  try:
291
- ocr_start_time = time.time()
292
  raw_text = await process_image(file_bytes, file.filename, 0)
293
- logger.info(f"Image OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
294
  except Exception as e:
295
  fail_count += 1
296
- output_json["data"].append({
297
  "filename": file.filename,
298
  "structured_data": {"error": f"Image OCR failed: {str(e)}"},
299
  "error": f"Image OCR failed: {str(e)}"
300
  })
301
  logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
302
  continue
303
-
304
- # Normalize text
305
- try:
306
- normalize_start_time = time.time()
307
  raw_text = unicodedata.normalize('NFKC', raw_text)
308
- raw_text = raw_text.encode().decode('utf-8')
309
  raw_text_cache[file_hash] = raw_text
310
- logger.info(f"Text normalization for {file.filename}, took {time.time() - normalize_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
311
- except Exception as e:
312
- logger.warning(f"Text normalization failed for {file.filename}: {str(e)}, {log_memory_usage()}")
313
 
314
- # Process with Gemini
315
  structured_data = await process_with_gemini(file.filename, raw_text)
316
- success_count += 1
317
- output_json["data"].append({
 
 
 
 
318
  "filename": file.filename,
319
  "structured_data": structured_data,
320
- "error": ""
321
  })
322
 
323
  logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")
324
 
325
- output_json["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
326
  if fail_count > 0 and success_count == 0:
327
- output_json["success"] = False
328
 
329
  logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
330
- return output_json
 
 
 
 
 
 
 
 
1
  from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ # Import Decimal and the custom JSONResponse
3
+ from decimal import Decimal
4
+ from fastapi.encoders import jsonable_encoder
5
+ from starlette.responses import JSONResponse
6
  import pytesseract
7
  import cv2
8
  import os
 
12
  from pdf2image import convert_from_bytes
13
  from pypdf import PdfReader
14
  import numpy as np
15
+ from typing import List, Any
16
  import io
17
  import logging
18
  import time
 
22
  import hashlib
23
  import google.generativeai as genai
24
  from dotenv import load_dotenv
25
+
26
+ # --- START OF MODIFICATIONS ---
27
+
28
+ # 1. Define a custom JSON encoder function
29
+ # This function checks if an object is of type Decimal. If it is, it converts it
30
+ # to a string to preserve its exact formatting. Otherwise, it lets the default
31
+ # encoder handle it. This prevents FastAPI from converting Decimals back to floats.
32
def custom_encoder(obj: Any) -> Any:
    """Convert *obj* to JSON-compatible data, rendering every Decimal as a string.

    Checking only ``isinstance(obj, Decimal)`` on the top-level argument is
    not enough: the endpoint passes a dict, so a plain ``jsonable_encoder(obj)``
    call would apply its default Decimal handling (int/float) to the nested
    values and reintroduce exponent notation such as ``9e-07``. Supplying the
    ``custom_encoder`` mapping makes ``jsonable_encoder`` apply ``str`` to
    Decimal values at every nesting level.

    Args:
        obj: Any value accepted by ``jsonable_encoder`` (a Decimal, or a
            container such as a dict/list that may hold Decimals).

    Returns:
        A JSON-compatible structure in which each Decimal has been replaced
        by its exact string form, e.g. ``Decimal("0.0000009")`` -> ``"0.0000009"``.
    """
    # A bare Decimal argument is still returned as str(obj), as before.
    return jsonable_encoder(obj, custom_encoder={Decimal: str})
38
+
39
+ # --- END OF MODIFICATIONS ---
40
+
41
 
42
  app = FastAPI()
43
 
 
54
  logger.error("GOOGLE_API_KEY not set")
55
  raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
56
  genai.configure(api_key=api_key)
57
+ model = genai.GenerativeModel("gemini-1.5-pro-latest") # Using a recommended model
58
 
59
+ # Set Tesseract path (adjust if necessary)
60
+ # For Docker/Linux, this is often the correct path. For Windows/macOS, it will differ.
61
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
62
 
63
  # In-memory caches (1-hour TTL)
 
87
  img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
88
  gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
89
  img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
90
+ custom_config = r'--oem 1 --psm 6 -l eng+ara'
91
  page_text = pytesseract.image_to_string(img_pil, config=custom_config)
92
  logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
93
  return page_text + "\n"
 
103
  img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
104
  gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
105
  img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
106
+ custom_config = r'--oem 1 --psm 6 -l eng+ara'
107
  page_text = pytesseract.image_to_string(img_pil, config=custom_config)
108
  logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
109
  return page_text + "\n"
 
116
  start_time = time.time()
117
  logger.info(f"Starting Gemini processing for {filename}, {log_memory_usage()}")
118
 
 
119
  text_hash = get_text_hash(raw_text)
120
  if text_hash in structured_data_cache:
121
  logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
122
  return structured_data_cache[text_hash]
123
 
124
+ if len(raw_text) > 20000: # Increased limit slightly
125
+ raw_text = raw_text[:20000]
126
+ logger.info(f"Truncated raw text for {filename} to 20000 characters, {log_memory_usage()}")
 
127
 
128
  try:
129
  prompt = f"""You are an intelligent invoice data extractor. Given raw text from an invoice (in English or other languages),
 
135
  - The 'items' list may have multiple entries, each with detailed attributes.
136
  - If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
137
  - Convert any date found in format : YYYY-MM-DD
138
+ - For Unit Price Format all numbers in standard decimal notation without exponents:
139
+ - Correct: 0.0000009, 1500000, 0.00123
140
+ - Incorrect: 9e-7, 1.5e+6, 1.23e-3
141
+
142
+ This applies to all calculations, measurements, and numeric results in your response.
143
 
144
  Raw text:
145
  {raw_text}
 
212
  json_start = llm_output.find("{")
213
  json_end = llm_output.rfind("}") + 1
214
  json_str = llm_output[json_start:json_end]
215
+
216
+ # --- START OF MODIFICATIONS ---
217
+ # 2. Use `parse_float=Decimal` when loading the JSON string.
218
+ # This converts numbers like 0.0000009 into a high-precision Decimal
219
+ # object inside Python, instead of a standard float.
220
  structured_data = json.loads(json_str, parse_float=Decimal)
221
+ # --- END OF MODIFICATIONS ---
222
+
223
  structured_data_cache[text_hash] = structured_data
224
  logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
225
+ # This will now print Decimal('0.0000009') in your console, which is correct
226
+ logger.info(f"Structured Data: {structured_data}")
227
  return structured_data
228
  except Exception as e:
229
  logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
 
231
 
232
  @app.post("/ocr")
233
  async def extract_and_structure(files: List[UploadFile] = File(...)):
234
+ output_data = {
235
  "success": True,
236
  "message": "",
237
  "data": []
 
245
  total_start_time = time.time()
246
  logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")
247
 
 
248
  valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}
249
  file_ext = os.path.splitext(file.filename.lower())[1]
250
  if file_ext not in valid_extensions:
251
  fail_count += 1
252
+ output_data["data"].append({
253
  "filename": file.filename,
254
  "structured_data": {"error": f"Unsupported file format: {file_ext}"},
255
  "error": f"Unsupported file format: {file_ext}"
 
257
  logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
258
  continue
259
 
 
260
  try:
261
  file_start_time = time.time()
262
  file_bytes = await file.read()
 
265
  logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
266
  except Exception as e:
267
  fail_count += 1
268
+ output_data["data"].append({
269
  "filename": file.filename,
270
  "structured_data": {"error": f"Failed to read file: {str(e)}"},
271
  "error": f"Failed to read file: {str(e)}"
 
273
  logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
274
  continue
275
 
 
276
  raw_text = ""
277
  if file_hash in raw_text_cache:
278
  raw_text = raw_text_cache[file_hash]
279
  logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
280
  else:
281
  if file_ext == '.pdf':
 
282
  try:
283
  extract_start_time = time.time()
284
  reader = PdfReader(file_stream)
 
290
  except Exception as e:
291
  logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")
292
 
 
293
  if not raw_text.strip():
294
  try:
295
  convert_start_time = time.time()
296
+ images = convert_from_bytes(file_bytes, dpi=150) # Use 150 dpi as a balance
297
  logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
298
 
299
+ ocr_tasks = [process_pdf_page(img, i) for i, img in enumerate(images)]
300
+ page_texts = await asyncio.gather(*ocr_tasks)
 
 
 
301
  raw_text = "".join(page_texts)
302
+ logger.info(f"Total OCR for {file.filename}, text length: {len(raw_text)}, {log_memory_usage()}")
303
  except Exception as e:
304
  fail_count += 1
305
+ output_data["data"].append({
306
  "filename": file.filename,
307
  "structured_data": {"error": f"OCR failed: {str(e)}"},
308
  "error": f"OCR failed: {str(e)}"
309
  })
310
  logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
311
  continue
312
+ else:
313
  try:
 
314
  raw_text = await process_image(file_bytes, file.filename, 0)
315
+ logger.info(f"Image OCR for {file.filename}, text length: {len(raw_text)}, {log_memory_usage()}")
316
  except Exception as e:
317
  fail_count += 1
318
+ output_data["data"].append({
319
  "filename": file.filename,
320
  "structured_data": {"error": f"Image OCR failed: {str(e)}"},
321
  "error": f"Image OCR failed: {str(e)}"
322
  })
323
  logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
324
  continue
325
+
326
+ if raw_text:
 
 
327
  raw_text = unicodedata.normalize('NFKC', raw_text)
 
328
  raw_text_cache[file_hash] = raw_text
 
 
 
329
 
 
330
  structured_data = await process_with_gemini(file.filename, raw_text)
331
+ if "error" not in structured_data:
332
+ success_count += 1
333
+ else:
334
+ fail_count += 1
335
+
336
+ output_data["data"].append({
337
  "filename": file.filename,
338
  "structured_data": structured_data,
339
+ "error": structured_data.get("error", "")
340
  })
341
 
342
  logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")
343
 
344
+ output_data["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
345
  if fail_count > 0 and success_count == 0:
346
+ output_data["success"] = False
347
 
348
  logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
349
+
350
+ # --- START OF MODIFICATIONS ---
351
+ # 3. Use the custom encoder when returning the final JSON response.
352
+ # This ensures the Decimal objects are converted to strings correctly.
353
+ # The default JSONResponse will not handle Decimal types properly.
354
+ encoded_data = custom_encoder(output_data)
355
+ return JSONResponse(content=encoded_data)
356
+ # --- END OF MODIFICATIONS ---