ChintanSatva committed on
Commit
fa77ec8
·
verified ·
1 Parent(s): a58bf1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -130
app.py CHANGED
@@ -1,5 +1,4 @@
1
  from fastapi import FastAPI, File, UploadFile, HTTPException
2
- # Import Decimal and the custom JSONResponse
3
  from decimal import Decimal, InvalidOperation
4
  from fastapi.encoders import jsonable_encoder
5
  from starlette.responses import JSONResponse
@@ -27,39 +26,34 @@ from dotenv import load_dotenv
27
  # --- START OF MODIFICATIONS ---
28
 
29
  # 1. Define a custom JSON encoder function
30
- # This function checks if an object is of type Decimal. If it is, it converts it
31
- # to a string to preserve its exact formatting. Otherwise, it lets the default
32
- # encoder handle it. This prevents FastAPI from converting Decimals back to floats.
33
  def custom_encoder(obj: Any) -> Any:
34
  if isinstance(obj, Decimal):
35
- result = str(obj)
36
- # Final safety check: if somehow scientific notation still exists, fix it
37
- if 'E' in result or 'e' in result:
38
- try:
39
- float_val = float(obj)
40
- if abs(float_val) == 0:
41
- result = "0"
42
- elif 0 < abs(float_val) < 1e-6:
43
- result = f"{float_val:.15f}".rstrip('0').rstrip('.')
44
- else:
45
- result = f"{float_val:.10f}".rstrip('0').rstrip('.')
46
- except:
47
- pass
48
- return result
49
- # For any other type, fall back to the default encoder
 
50
  return jsonable_encoder(obj)
51
 
52
  def custom_decimal_parser(s):
53
  """
54
  Custom parser that ensures numbers are converted to decimal format.
55
- This completely bypasses the scientific notation issue.
56
  """
57
  try:
58
- # Convert to float first to get the actual value
59
  float_val = float(s)
60
-
61
  if float_val == 0:
62
- return Decimal('0')
63
  elif 0 < abs(float_val) < 1e-10:
64
  formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
65
  elif 0 < abs(float_val) < 1e-6:
@@ -68,144 +62,125 @@ def custom_decimal_parser(s):
68
  formatted = f"{float_val:.10f}".rstrip('0').rstrip('.')
69
  else:
70
  formatted = f"{float_val:.8f}".rstrip('0').rstrip('.')
71
-
72
  return Decimal(formatted)
73
- except:
74
- return Decimal(s)
75
 
76
  def fix_scientific_notation_in_json(json_str):
77
  """
78
  Fix scientific notation in JSON string before parsing.
79
- This is the most aggressive approach - fix it at the string level.
80
  """
81
  def replace_scientific(match):
82
  try:
83
- # Extract the full scientific notation number
84
  scientific_num = match.group(0)
85
- # Convert to float then to decimal format
86
  float_val = float(scientific_num)
87
-
88
  if float_val == 0:
89
  return "0.0"
90
  elif 0 < abs(float_val) < 1e-10:
91
- return f"{float_val:.20f}".rstrip('0').rstrip('.') or "0"
92
  elif 0 < abs(float_val) < 1e-6:
93
- return f"{float_val:.15f}".rstrip('0').rstrip('.') or "0"
94
  elif abs(float_val) < 1:
95
- return f"{float_val:.10f}".rstrip('0').rstrip('.') or "0"
96
  else:
97
- return f"{float_val:.8f}".rstrip('0').rstrip('.') or "0"
98
  except Exception as e:
99
- print(f"Error converting {match.group(0)}: {e}")
100
- return match.group(0) # Return original if conversion fails
101
-
102
- # More comprehensive patterns to catch different scientific notation formats
103
  patterns = [
104
- r'-?\d+\.?\d*[eE][+-]?\d+', # Standard: 1.5e-7, 9E-7
105
- r'-?\d+[eE][+-]?\d+', # Without decimal: 9e-7
106
- r'-?\d+\.\d+[eE][+-]?\d+', # With decimal: 1.5e-7
107
  ]
108
-
109
  original_json = json_str
110
  for pattern in patterns:
111
  json_str = re.sub(pattern, replace_scientific, json_str)
112
-
113
- # Also handle cases where scientific notation might be in quoted strings
114
- # This catches cases like "value": "1.5e-7" (quoted scientific notation)
115
  def replace_quoted_scientific(match):
116
- full_match = match.group(0) # "1.5e-7"
117
- number_part = match.group(1) # 1.5e-7
118
  try:
119
  float_val = float(number_part)
120
  if 0 < abs(float_val) < 1e-6:
121
- converted = f"{float_val:.15f}".rstrip('0').rstrip('.') or "0"
122
  else:
123
- converted = f"{float_val:.10f}".rstrip('0').rstrip('.') or "0"
124
  return f'"{converted}"'
125
  except:
126
  return full_match
127
-
128
- # Pattern for quoted scientific notation
129
  quoted_pattern = r'"(-?\d+\.?\d*[eE][+-]?\d+)"'
130
  json_str = re.sub(quoted_pattern, replace_quoted_scientific, json_str)
131
-
132
  if original_json != json_str:
133
- print(f"JSON transformation occurred")
134
- print(f"Original: {original_json[:200]}...")
135
- print(f"Fixed: {json_str[:200]}...")
136
-
137
  return json_str
138
 
139
  def convert_scientific_decimals(obj):
140
  """
141
- Recursively convert any Decimal objects in scientific notation to decimal notation.
142
- This handles cases where json.loads(parse_float=Decimal) creates Decimal('9E-7')
143
- instead of the desired Decimal('0.0000009').
144
  """
145
  if isinstance(obj, dict):
146
  return {k: convert_scientific_decimals(v) for k, v in obj.items()}
147
  elif isinstance(obj, list):
148
  return [convert_scientific_decimals(item) for item in obj]
149
  elif isinstance(obj, Decimal):
150
- # Always convert to proper decimal format, regardless of current format
151
  try:
152
- # Convert to float to get actual numeric value
153
  float_val = float(obj)
154
-
155
- # Handle special cases
156
  if float_val == 0:
157
- return Decimal('0')
158
-
159
- # For very small positive numbers, use high precision
160
- if 0 < abs(float_val) < 1e-10:
161
  formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
162
  elif 0 < abs(float_val) < 1e-6:
163
  formatted = f"{float_val:.15f}".rstrip('0').rstrip('.')
164
- elif 0 < abs(float_val) < 1:
165
  formatted = f"{float_val:.10f}".rstrip('0').rstrip('.')
166
  elif abs(float_val) < 1000000:
167
  formatted = f"{float_val:.8f}".rstrip('0').rstrip('.')
168
  else:
169
- # For large numbers, don't use decimal places if they're all zeros
170
- if float_val == int(float_val):
171
- formatted = str(int(float_val))
172
- else:
173
- formatted = f"{float_val:.2f}".rstrip('0').rstrip('.')
174
-
175
- # Ensure we have at least some decimal representation for very small numbers
176
  if formatted == '0' and float_val != 0:
177
  formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
178
-
179
  return Decimal(formatted)
180
  except (ValueError, OverflowError, InvalidOperation):
181
- # If conversion fails, return the original
182
  return obj
183
  else:
184
  return obj
185
 
186
  def force_decimal_format(data):
187
  """
188
- Additional layer to ensure all numeric values are in proper decimal format.
189
- This is applied right before JSON encoding.
190
  """
191
  if isinstance(data, dict):
192
  result = {}
193
  for key, value in data.items():
194
  if key in ['unit_price', 'total_price', 'tax_amount', 'discount', 'net_amount',
195
- 'sub_total', 'tax_total', 'discount_total', 'total_amount', 'tax_rate'] and isinstance(value, dict) and 'value' in value:
196
- # Special handling for numeric fields
197
- if isinstance(value['value'], (Decimal, float, int)) and value['value'] != 0:
198
- try:
199
- float_val = float(value['value'])
200
- if 0 < abs(float_val) < 1e-6:
201
- # Very small numbers - use high precision
202
- decimal_str = f"{float_val:.15f}".rstrip('0').rstrip('.')
203
- else:
204
- decimal_str = f"{float_val:.10f}".rstrip('0').rstrip('.')
205
- value['value'] = Decimal(decimal_str)
206
- except:
207
- pass
208
- result[key] = force_decimal_format(value)
 
 
 
 
 
 
 
 
209
  return result
210
  elif isinstance(data, list):
211
  return [force_decimal_format(item) for item in data]
@@ -214,7 +189,6 @@ def force_decimal_format(data):
214
 
215
  # --- END OF MODIFICATIONS ---
216
 
217
-
218
  app = FastAPI()
219
 
220
  # Configure logging
@@ -230,13 +204,12 @@ if not api_key:
230
  logger.error("GOOGLE_API_KEY not set")
231
  raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
232
  genai.configure(api_key=api_key)
233
- model = genai.GenerativeModel("gemini-2.0-flash") # Using a recommended model
234
 
235
- # Set Tesseract path (adjust if necessary)
236
- # For Docker/Linux, this is often the correct path. For Windows/macOS, it will differ.
237
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
238
 
239
- # In-memory caches (1-hour TTL)
240
  raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
241
  structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
242
 
@@ -255,7 +228,7 @@ def get_text_hash(raw_text):
255
  return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
256
 
257
  async def process_image(img_bytes, filename, idx):
258
- """Process a single image (JPG/JPEG/PNG) with OCR."""
259
  start_time = time.time()
260
  logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
261
  try:
@@ -297,7 +270,7 @@ async def process_with_gemini(filename: str, raw_text: str):
297
  logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
298
  return structured_data_cache[text_hash]
299
 
300
- if len(raw_text) > 20000: # Increased limit slightly
301
  raw_text = raw_text[:20000]
302
  logger.info(f"Truncated raw text for {filename} to 20000 characters, {log_memory_usage()}")
303
 
@@ -311,7 +284,6 @@ async def process_with_gemini(filename: str, raw_text: str):
311
  - The 'items' list may have multiple entries, each with detailed attributes.
312
  - If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
313
  - Convert any date found in format: YYYY-MM-DD
314
-
315
  CRITICAL: ALL numeric values must be in full decimal notation. NEVER EVER use scientific notation or exponential format:
316
  - CORRECT: 0.0000009, 0.00000015, 0.0000002, 1500000, 0.00123
317
  - ABSOLUTELY FORBIDDEN: 9e-7, 9E-7, 1.5e-7, 1.5E-7, 2e-7, 2E-7, 1.5e+6, 1.23e-3, any number with 'e' or 'E'
@@ -319,7 +291,6 @@ CRITICAL: ALL numeric values must be in full decimal notation. NEVER EVER use sc
319
  - For large numbers like 1500000, you MUST write out all the digits: 1500000
320
  - This is MANDATORY for: unit_price, total_price, tax_amount, discount, net_amount, sub_total, tax_total, discount_total, total_amount
321
  - Example: if unit price is 9 * 10^-7, write it as 0.0000009, NOT 9e-7 or 9E-7
322
-
323
  Raw text:
324
  {raw_text}
325
 
@@ -388,33 +359,28 @@ Output JSON:
388
  """
389
  response = model.generate_content(prompt)
390
  llm_output = response.text
391
- logger.info(f"Raw Gemini response: {llm_output}") # Debug: see what Gemini actually returns
392
-
393
  json_start = llm_output.find("{")
394
  json_end = llm_output.rfind("}") + 1
395
  json_str = llm_output[json_start:json_end]
396
-
397
- logger.info(f"Extracted JSON before fix: {json_str}") # Debug: see extracted JSON
398
-
399
- # CRITICAL: Fix scientific notation in the JSON string BEFORE parsing
400
  json_str = fix_scientific_notation_in_json(json_str)
401
- logger.info(f"Fixed JSON string: {json_str}") # Debug: see if regex worked
402
-
403
- # --- START OF MODIFICATIONS ---
404
- # 2. Use custom decimal parser instead of parse_float=Decimal
405
- # This ensures no scientific notation Decimals are created
406
  structured_data = json.loads(json_str, parse_float=custom_decimal_parser)
407
-
408
- # CRITICAL: Multiple layers of conversion to ensure proper decimal format
409
  structured_data = convert_scientific_decimals(structured_data)
410
  structured_data = force_decimal_format(structured_data)
411
- # --- END OF MODIFICATIONS ---
412
-
413
  structured_data_cache[text_hash] = structured_data
414
  logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
415
- # Log the final converted data with proper string representation
416
- log_friendly_data = custom_encoder(structured_data)
 
417
  logger.info(f"Final Structured Data (JSON format): {log_friendly_data}")
 
418
  return structured_data
419
  except Exception as e:
420
  logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
@@ -484,7 +450,7 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
484
  if not raw_text.strip():
485
  try:
486
  convert_start_time = time.time()
487
- images = convert_from_bytes(file_bytes, dpi=150) # Use 150 dpi as a balance
488
  logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
489
 
490
  ocr_tasks = [process_pdf_page(img, i) for i, img in enumerate(images)]
@@ -537,14 +503,7 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
537
  output_data["success"] = False
538
 
539
  logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
540
-
541
- # --- START OF MODIFICATIONS ---
542
- # 3. Use the custom encoder when returning the final JSON response.
543
- # This ensures the Decimal objects are converted to strings correctly.
544
- # The default JSONResponse will not handle Decimal types properly.
545
-
546
- # Apply final decimal format enforcement before encoding
547
  output_data = force_decimal_format(output_data)
548
- encoded_data = custom_encoder(output_data)
549
- return JSONResponse(content=encoded_data)
550
- # --- END OF MODIFICATIONS ---
 
1
  from fastapi import FastAPI, File, UploadFile, HTTPException
 
2
  from decimal import Decimal, InvalidOperation
3
  from fastapi.encoders import jsonable_encoder
4
  from starlette.responses import JSONResponse
 
26
  # --- START OF MODIFICATIONS ---
27
 
28
  # 1. Define a custom JSON encoder function
 
 
 
29
def custom_encoder(obj: Any) -> Any:
    """Encode *obj* for JSON, rendering Decimals as plain fixed-point strings.

    Decimal values are formatted exactly with ``format(value, "f")`` so no
    exponent notation appears and no digits are lost by a detour through
    ``float``. Trailing fractional zeros are stripped and zero is rendered
    as ``"0.0"``. Non-finite Decimals (NaN / Infinity) fall back to
    ``str(obj)``.

    Any other object is delegated to ``jsonable_encoder``.

    Args:
        obj: The value to encode.

    Returns:
        A ``str`` for Decimal input, otherwise whatever
        ``jsonable_encoder(obj)`` returns.
    """
    if not isinstance(obj, Decimal):
        return jsonable_encoder(obj)

    # NaN and +/-Infinity have no fixed-point form; keep their textual name.
    if not obj.is_finite():
        return str(obj)

    if obj.is_zero():
        return "0.0"

    # The "f" presentation type never emits an exponent and is exact,
    # unlike formatting float(obj) with a fixed number of places.
    text = format(obj, "f")
    if "." in text:
        text = text.rstrip("0").rstrip(".")
    return text
48
 
49
  def custom_decimal_parser(s):
50
  """
51
  Custom parser that ensures numbers are converted to decimal format.
 
52
  """
53
  try:
 
54
  float_val = float(s)
 
55
  if float_val == 0:
56
+ return Decimal('0.0')
57
  elif 0 < abs(float_val) < 1e-10:
58
  formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
59
  elif 0 < abs(float_val) < 1e-6:
 
62
  formatted = f"{float_val:.10f}".rstrip('0').rstrip('.')
63
  else:
64
  formatted = f"{float_val:.8f}".rstrip('0').rstrip('.')
 
65
  return Decimal(formatted)
66
+ except (ValueError, InvalidOperation):
67
+ return Decimal(str(s))
68
 
69
def fix_scientific_notation_in_json(json_str):
    """Rewrite exponent-form numbers in a JSON string as plain decimals.

    Two kinds of token are rewritten:

    * a bare number in exponent form, e.g. ``9e-7`` -> ``0.0000009``;
    * a string literal whose *entire* content is such a number,
      e.g. ``"1.5e-7"`` -> ``"0.00000015"`` (the quotes are kept).

    Every other string literal is left untouched, so text that merely
    contains a digit/E/digit run (``"INV-12E45"``) is not corrupted.
    Conversion goes through ``Decimal`` and is exact; zero is rendered as
    ``0.0``. Tokens that cannot be converted, or whose exponent magnitude
    exceeds 400, are returned unchanged.

    When the text changes, the first 200 characters of the input and the
    output are logged through the module's ``logger``.

    Args:
        json_str: JSON text (possibly imperfect) to clean up.

    Returns:
        The text with exponent-form numbers expanded.
    """
    sci_number = r'-?\d+\.?\d*[eE][+-]?\d+'
    sci_number_re = re.compile(sci_number)
    # A complete string literal is tried first so that digits inside it are
    # consumed as part of the string and never seen as a bare number.
    token_re = re.compile(r'"(?:[^"\\]|\\.)*"|' + sci_number)
    # Exponents beyond this would expand to absurdly long digit strings.
    max_exponent = 400

    def to_plain(number_text):
        """Return number_text in fixed-point form, or unchanged on failure."""
        try:
            value = Decimal(number_text)
        except (InvalidOperation, ValueError) as e:
            logger.error(f"Error converting {number_text}: {e}")
            return number_text
        if not value.is_finite():
            return number_text
        if value.is_zero():
            return "0.0"
        if abs(value.adjusted()) > max_exponent:
            return number_text
        plain = format(value, "f")
        if "." in plain:
            plain = plain.rstrip("0").rstrip(".")
        return plain

    def replace_token(match):
        """Convert one matched token: a string literal or a bare number."""
        token = match.group(0)
        if not token.startswith('"'):
            return to_plain(token)
        inner = token[1:-1]
        if sci_number_re.fullmatch(inner):
            return f'"{to_plain(inner)}"'
        return token

    original_json = json_str
    json_str = token_re.sub(replace_token, json_str)

    if original_json != json_str:
        logger.info(f"JSON transformation occurred")
        logger.info(f"Original: {original_json[:200]}...")
        logger.info(f"Fixed: {json_str[:200]}...")

    return json_str
122
 
123
def convert_scientific_decimals(obj):
    """Recursively rebuild Decimals from their exact fixed-point text.

    Dicts and lists are walked and returned as new containers. Each finite
    Decimal is re-created from ``format(value, "f")`` with trailing
    fractional zeros stripped, so a value such as ``Decimal('1.5E+6')``
    becomes ``Decimal('1500000')``. The numeric value is never changed:
    there is no detour through ``float`` and no rounding. Zero becomes
    ``Decimal('0.0')``. Non-finite Decimals and all other objects are
    returned unchanged.

    NOTE: ``str()`` of a Decimal still uses exponent notation for very
    small magnitudes (e.g. ``Decimal('0.0000009')`` prints as ``9E-7``);
    use ``format(value, "f")`` when producing text.

    Args:
        obj: A dict, list, Decimal or any other value.

    Returns:
        A structure of the same shape with Decimals normalised.
    """
    if isinstance(obj, dict):
        return {k: convert_scientific_decimals(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_scientific_decimals(item) for item in obj]
    if not isinstance(obj, Decimal) or not obj.is_finite():
        return obj
    if obj.is_zero():
        return Decimal('0.0')

    # Exact fixed-point rendering; re-parsing it drops any positive exponent.
    plain = format(obj, 'f')
    if '.' in plain:
        plain = plain.rstrip('0').rstrip('.')
    return Decimal(plain)
153
 
154
  def force_decimal_format(data):
155
  """
156
+ Ensure all numeric values are in proper decimal format before JSON encoding.
 
157
  """
158
  if isinstance(data, dict):
159
  result = {}
160
  for key, value in data.items():
161
  if key in ['unit_price', 'total_price', 'tax_amount', 'discount', 'net_amount',
162
+ 'sub_total', 'tax_total', 'discount_total', 'total_amount', 'tax_rate']:
163
+ if isinstance(value, dict) and 'value' in value:
164
+ if isinstance(value['value'], (Decimal, float, int)):
165
+ try:
166
+ float_val = float(value['value'])
167
+ if float_val == 0:
168
+ decimal_str = "0.0"
169
+ elif 0 < abs(float_val) < 1e-10:
170
+ decimal_str = f"{float_val:.20f}".rstrip('0').rstrip('.')
171
+ elif 0 < abs(float_val) < 1e-6:
172
+ decimal_str = f"{float_val:.15f}".rstrip('0').rstrip('.')
173
+ else:
174
+ decimal_str = f"{float_val:.10f}".rstrip('0').rstrip('.')
175
+ result[key] = {'value': Decimal(decimal_str), 'accuracy': value['accuracy']}
176
+ except (ValueError, InvalidOperation):
177
+ result[key] = value
178
+ else:
179
+ result[key] = value
180
+ else:
181
+ result[key] = force_decimal_format(value)
182
+ else:
183
+ result[key] = force_decimal_format(value)
184
  return result
185
  elif isinstance(data, list):
186
  return [force_decimal_format(item) for item in data]
 
189
 
190
  # --- END OF MODIFICATIONS ---
191
 
 
192
  app = FastAPI()
193
 
194
  # Configure logging
 
204
  logger.error("GOOGLE_API_KEY not set")
205
  raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
206
  genai.configure(api_key=api_key)
207
+ model = genai.GenerativeModel("gemini-2.0-flash")
208
 
209
+ # Set Tesseract path
 
210
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
211
 
212
+ # In-memory caches
213
  raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
214
  structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
215
 
 
228
  return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
229
 
230
  async def process_image(img_bytes, filename, idx):
231
+ """Process a single image with OCR."""
232
  start_time = time.time()
233
  logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
234
  try:
 
270
  logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
271
  return structured_data_cache[text_hash]
272
 
273
+ if len(raw_text) > 20000:
274
  raw_text = raw_text[:20000]
275
  logger.info(f"Truncated raw text for {filename} to 20000 characters, {log_memory_usage()}")
276
 
 
284
  - The 'items' list may have multiple entries, each with detailed attributes.
285
  - If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
286
  - Convert any date found in format: YYYY-MM-DD
 
287
  CRITICAL: ALL numeric values must be in full decimal notation. NEVER EVER use scientific notation or exponential format:
288
  - CORRECT: 0.0000009, 0.00000015, 0.0000002, 1500000, 0.00123
289
  - ABSOLUTELY FORBIDDEN: 9e-7, 9E-7, 1.5e-7, 1.5E-7, 2e-7, 2E-7, 1.5e+6, 1.23e-3, any number with 'e' or 'E'
 
291
  - For large numbers like 1500000, you MUST write out all the digits: 1500000
292
  - This is MANDATORY for: unit_price, total_price, tax_amount, discount, net_amount, sub_total, tax_total, discount_total, total_amount
293
  - Example: if unit price is 9 * 10^-7, write it as 0.0000009, NOT 9e-7 or 9E-7
 
294
  Raw text:
295
  {raw_text}
296
 
 
359
  """
360
  response = model.generate_content(prompt)
361
  llm_output = response.text
362
+ logger.info(f"Raw Gemini response: {llm_output}")
363
+
364
  json_start = llm_output.find("{")
365
  json_end = llm_output.rfind("}") + 1
366
  json_str = llm_output[json_start:json_end]
367
+
368
+ logger.info(f"Extracted JSON before fix: {json_str}")
369
+
 
370
  json_str = fix_scientific_notation_in_json(json_str)
371
+ logger.info(f"Fixed JSON string: {json_str}")
372
+
 
 
 
373
  structured_data = json.loads(json_str, parse_float=custom_decimal_parser)
 
 
374
  structured_data = convert_scientific_decimals(structured_data)
375
  structured_data = force_decimal_format(structured_data)
376
+
 
377
  structured_data_cache[text_hash] = structured_data
378
  logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
379
+
380
+ # Log structured data with custom encoder to avoid scientific notation in logs
381
+ log_friendly_data = json.dumps(structured_data, default=custom_encoder)
382
  logger.info(f"Final Structured Data (JSON format): {log_friendly_data}")
383
+
384
  return structured_data
385
  except Exception as e:
386
  logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
 
450
  if not raw_text.strip():
451
  try:
452
  convert_start_time = time.time()
453
+ images = convert_from_bytes(file_bytes, dpi=150)
454
  logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
455
 
456
  ocr_tasks = [process_pdf_page(img, i) for i, img in enumerate(images)]
 
503
  output_data["success"] = False
504
 
505
  logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
506
+
 
 
 
 
 
 
507
  output_data = force_decimal_format(output_data)
508
+ encoded_data = json.dumps(output_data, default=custom_encoder)
509
+ return JSONResponse(content=json.loads(encoded_data))