shubhjo commited on
Commit
9151b9c
·
verified ·
1 Parent(s): dcb88eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -64
app.py CHANGED
@@ -34,7 +34,7 @@ if not api_key:
34
  logger.error("GOOGLE_API_KEY not set")
35
  raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
36
  genai.configure(api_key=api_key)
37
- model = genai.GenerativeModel("gemini-2.0-flash")
38
 
39
  # Set Tesseract path
40
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
@@ -49,28 +49,45 @@ def log_memory_usage():
49
  mem_info = process.memory_info()
50
  return f"Memory usage: {mem_info.rss / 1024 / 1024:.2f} MB"
51
 
52
- def get_file_hash(pdf_bytes):
53
- """Generate MD5 hash of PDF content."""
54
- return hashlib.md5(pdf_bytes).hexdigest()
55
 
56
  def get_text_hash(raw_text):
57
  """Generate MD5 hash of raw text."""
58
  return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
59
 
60
- async def process_page(img, page_idx):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  """Process a single PDF page with OCR."""
62
  start_time = time.time()
63
- logger.info(f"Starting OCR for page {page_idx}, {log_memory_usage()}")
64
  try:
65
  img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
66
  gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
67
  img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
68
- custom_config = r'--oem 1 --psm 6 -l eng+ara+hin+spa+ita+rus' # Reduced for performance
69
  page_text = pytesseract.image_to_string(img_pil, config=custom_config)
70
- logger.info(f"Completed OCR for page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
71
  return page_text + "\n"
72
  except Exception as e:
73
- logger.error(f"OCR failed for page {page_idx}: {str(e)}, {log_memory_usage()}")
74
  return ""
75
 
76
  async def process_with_gemini(filename: str, raw_text: str):
@@ -91,17 +108,14 @@ async def process_with_gemini(filename: str, raw_text: str):
91
 
92
  try:
93
  prompt = f"""
94
- You are an intelligent invoice data extractor.
95
- Given raw text from an invoice in any language,
96
- extract key business fields in the specified JSON format.
97
- Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS'/'VAT'). Extract currency from symbol or acronym as well.
98
- The 'Products' field is dynamic and may contain multiple items, each with 'qty', 'description', 'unit_price', and 'amount'.
99
- If a field is missing, include it with an empty string ("") or appropriate default (e.g., 0 for numbers).
100
 
101
-
 
102
 
103
  Output JSON:
104
  {{
 
105
  "Discount_Percentage": "",
106
  "Due_Date": "",
107
  "Email_Client": "",
@@ -117,7 +131,6 @@ async def process_with_gemini(filename: str, raw_text: str):
117
  "invoice date": "",
118
  "invoice number": "",
119
  "shipping address": "",
120
- "currency": "",
121
  "total": ""
122
  }}
123
  """
@@ -150,33 +163,34 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
150
  total_start_time = time.time()
151
  logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")
152
 
153
- if not file.filename.lower().endswith('.pdf'):
 
 
 
154
  fail_count += 1
155
  output_json["data"].append({
156
  "filename": file.filename,
157
- "raw_text": "",
158
- "structured_data": {"error": "File is not a PDF"},
159
- "error": "File is not a PDF"
160
  })
161
- logger.error(f"File {file.filename} is not a PDF")
162
  continue
163
 
164
- # Read PDF into memory
165
  try:
166
- pdf_start_time = time.time()
167
- pdf_bytes = await file.read()
168
- pdf_stream = io.BytesIO(pdf_bytes)
169
- file_hash = get_file_hash(pdf_bytes)
170
- logger.info(f"Read PDF {file.filename}, took {time.time() - pdf_start_time:.2f} seconds, size: {len(pdf_bytes)/1024:.2f} KB, {log_memory_usage()}")
171
  except Exception as e:
172
  fail_count += 1
173
  output_json["data"].append({
174
  "filename": file.filename,
175
- "raw_text": "",
176
- "structured_data": {"error": f"Failed to read PDF: {str(e)}"},
177
- "error": f"Failed to read PDF: {str(e)}"
178
  })
179
- logger.error(f"Failed to read PDF {file.filename}: {str(e)}, {log_memory_usage()}")
180
  continue
181
 
182
  # Check raw text cache
@@ -185,44 +199,55 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
185
  raw_text = raw_text_cache[file_hash]
186
  logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
187
  else:
188
- # Try extracting embedded text
189
- try:
190
- extract_start_time = time.time()
191
- reader = PdfReader(pdf_stream)
192
- for page in reader.pages:
193
- text = page.extract_text()
194
- if text:
195
- raw_text += text + "\n"
196
- logger.info(f"Embedded text extraction for {file.filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
197
- except Exception as e:
198
- logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")
199
-
200
- # If no embedded text, perform OCR
201
- if not raw_text.strip():
202
  try:
203
- convert_start_time = time.time()
204
- images = convert_from_bytes(pdf_bytes, poppler_path="/usr/local/bin", dpi=100)
205
- logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
 
 
 
 
 
 
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  ocr_start_time = time.time()
208
- # Sequential processing to reduce memory usage; uncomment for parallel if needed
209
- page_texts = []
210
- for i, img in enumerate(images):
211
- page_text = await process_page(img, i)
212
- page_texts.append(page_text)
213
- # tasks = [process_page(img, i) for i, img in enumerate(images)]
214
- # page_texts = await asyncio.gather(*tasks)
215
- raw_text = "".join(page_texts)
216
- logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
217
  except Exception as e:
218
  fail_count += 1
219
  output_json["data"].append({
220
  "filename": file.filename,
221
- "raw_text": "",
222
- "structured_data": {"error": f"OCR failed: {str(e)}"},
223
- "error": f"OCR failed: {str(e)}"
224
  })
225
- logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
226
  continue
227
 
228
  # Normalize text
@@ -240,14 +265,13 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
240
  success_count += 1
241
  output_json["data"].append({
242
  "filename": file.filename,
243
- "raw_text": raw_text,
244
  "structured_data": structured_data,
245
  "error": ""
246
  })
247
 
248
  logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")
249
 
250
- output_json["message"] = f"Processed {len(files)} PDFs. {success_count} succeeded, {fail_count} failed."
251
  if fail_count > 0 and success_count == 0:
252
  output_json["success"] = False
253
 
 
34
  logger.error("GOOGLE_API_KEY not set")
35
  raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
36
  genai.configure(api_key=api_key)
37
+ model = genai.GenerativeModel("gemini-1.5-pro")
38
 
39
  # Set Tesseract path
40
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 
49
  mem_info = process.memory_info()
50
  return f"Memory usage: {mem_info.rss / 1024 / 1024:.2f} MB"
51
 
52
+ def get_file_hash(file_bytes):
53
+ """Generate MD5 hash of file content."""
54
+ return hashlib.md5(file_bytes).hexdigest()
55
 
56
  def get_text_hash(raw_text):
57
  """Generate MD5 hash of raw text."""
58
  return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
59
 
60
+ async def process_image(img_bytes, filename, idx):
61
+ """Process a single image (JPG/JPEG/PNG) with OCR."""
62
+ start_time = time.time()
63
+ logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
64
+ try:
65
+ img = Image.open(io.BytesIO(img_bytes))
66
+ img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
67
+ gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
68
+ img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
69
+ custom_config = r'--oem 1 --psm 6 -l eng+ara' # Reduced for performance
70
+ page_text = pytesseract.image_to_string(img_pil, config=custom_config)
71
+ logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
72
+ return page_text + "\n"
73
+ except Exception as e:
74
+ logger.error(f"OCR failed for {filename} image {idx}: {str(e)}, {log_memory_usage()}")
75
+ return ""
76
+
77
+ async def process_pdf_page(img, page_idx):
78
  """Process a single PDF page with OCR."""
79
  start_time = time.time()
80
+ logger.info(f"Starting OCR for PDF page {page_idx}, {log_memory_usage()}")
81
  try:
82
  img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
83
  gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
84
  img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
85
+ custom_config = r'--oem 1 --psm 6 -l eng+ara' # Reduced for performance
86
  page_text = pytesseract.image_to_string(img_pil, config=custom_config)
87
+ logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
88
  return page_text + "\n"
89
  except Exception as e:
90
+ logger.error(f"OCR failed for PDF page {page_idx}: {str(e)}, {log_memory_usage()}")
91
  return ""
92
 
93
  async def process_with_gemini(filename: str, raw_text: str):
 
108
 
109
  try:
110
  prompt = f"""
111
+ You are an intelligent invoice data extractor. Given raw text from an invoice in any language and extract key business fields in the specified JSON format. Support English. Handle synonyms (e.g., 'total' = 'net', 'tax' = 'GST'/'TDS'). The 'Products' field is dynamic and may contain multiple items, each with 'qty', 'description', 'unit_price', and 'amount'. Detect the currency (e.g., USD, INR, EUR) from symbols ($, ₹, €) or text; default to USD if unclear. If a field is missing, include it with an empty string ("") or appropriate default (e.g., 0 for numbers).
 
 
 
 
 
112
 
113
+ Raw text:
114
+ {raw_text}
115
 
116
  Output JSON:
117
  {{
118
+ "currency": "",
119
  "Discount_Percentage": "",
120
  "Due_Date": "",
121
  "Email_Client": "",
 
131
  "invoice date": "",
132
  "invoice number": "",
133
  "shipping address": "",
 
134
  "total": ""
135
  }}
136
  """
 
163
  total_start_time = time.time()
164
  logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")
165
 
166
+ # Validate file format
167
+ valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}
168
+ file_ext = os.path.splitext(file.filename.lower())[1]
169
+ if file_ext not in valid_extensions:
170
  fail_count += 1
171
  output_json["data"].append({
172
  "filename": file.filename,
173
+ "structured_data": {"error": f"Unsupported file format: {file_ext}"},
174
+ "error": f"Unsupported file format: {file_ext}"
 
175
  })
176
+ logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
177
  continue
178
 
179
+ # Read file into memory
180
  try:
181
+ file_start_time = time.time()
182
+ file_bytes = await file.read()
183
+ file_stream = io.BytesIO(file_bytes)
184
+ file_hash = get_file_hash(file_bytes)
185
+ logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
186
  except Exception as e:
187
  fail_count += 1
188
  output_json["data"].append({
189
  "filename": file.filename,
190
+ "structured_data": {"error": f"Failed to read file: {str(e)}"},
191
+ "error": f"Failed to read file: {str(e)}"
 
192
  })
193
+ logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
194
  continue
195
 
196
  # Check raw text cache
 
199
  raw_text = raw_text_cache[file_hash]
200
  logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
201
  else:
202
+ if file_ext == '.pdf':
203
+ # Try extracting embedded text
 
 
 
 
 
 
 
 
 
 
 
 
204
  try:
205
+ extract_start_time = time.time()
206
+ reader = PdfReader(file_stream)
207
+ for page in reader.pages:
208
+ text = page.extract_text()
209
+ if text:
210
+ raw_text += text + "\n"
211
+ logger.info(f"Embedded text extraction for {file.filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
212
+ except Exception as e:
213
+ logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")
214
 
215
+ # If no embedded text, perform OCR
216
+ if not raw_text.strip():
217
+ try:
218
+ convert_start_time = time.time()
219
+ images = convert_from_bytes(file_bytes, poppler_path="/usr/local/bin", dpi=100)
220
+ logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
221
+
222
+ ocr_start_time = time.time()
223
+ page_texts = []
224
+ for i, img in enumerate(images):
225
+ page_text = await process_pdf_page(img, i)
226
+ page_texts.append(page_text)
227
+ raw_text = "".join(page_texts)
228
+ logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
229
+ except Exception as e:
230
+ fail_count += 1
231
+ output_json["data"].append({
232
+ "filename": file.filename,
233
+ "structured_data": {"error": f"OCR failed: {str(e)}"},
234
+ "error": f"OCR failed: {str(e)}"
235
+ })
236
+ logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
237
+ continue
238
+ else: # JPG/JPEG/PNG
239
+ try:
240
  ocr_start_time = time.time()
241
+ raw_text = await process_image(file_bytes, file.filename, 0)
242
+ logger.info(f"Image OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
 
 
 
 
 
 
 
243
  except Exception as e:
244
  fail_count += 1
245
  output_json["data"].append({
246
  "filename": file.filename,
247
+ "structured_data": {"error": f"Image OCR failed: {str(e)}"},
248
+ "error": f"Image OCR failed: {str(e)}"
 
249
  })
250
+ logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
251
  continue
252
 
253
  # Normalize text
 
265
  success_count += 1
266
  output_json["data"].append({
267
  "filename": file.filename,
 
268
  "structured_data": structured_data,
269
  "error": ""
270
  })
271
 
272
  logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")
273
 
274
+ output_json["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
275
  if fail_count > 0 and success_count == 0:
276
  output_json["success"] = False
277