Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 2 |
-
# Import Decimal and the custom JSONResponse
|
| 3 |
from decimal import Decimal, InvalidOperation
|
| 4 |
from fastapi.encoders import jsonable_encoder
|
| 5 |
from starlette.responses import JSONResponse
|
|
@@ -27,39 +26,34 @@ from dotenv import load_dotenv
|
|
| 27 |
# --- START OF MODIFICATIONS ---
|
| 28 |
|
| 29 |
# 1. Define a custom JSON encoder function
|
| 30 |
-
# This function checks if an object is of type Decimal. If it is, it converts it
|
| 31 |
-
# to a string to preserve its exact formatting. Otherwise, it lets the default
|
| 32 |
-
# encoder handle it. This prevents FastAPI from converting Decimals back to floats.
|
| 33 |
def custom_encoder(obj: Any) -> Any:
|
| 34 |
if isinstance(obj, Decimal):
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
| 50 |
return jsonable_encoder(obj)
|
| 51 |
|
| 52 |
def custom_decimal_parser(s):
|
| 53 |
"""
|
| 54 |
Custom parser that ensures numbers are converted to decimal format.
|
| 55 |
-
This completely bypasses the scientific notation issue.
|
| 56 |
"""
|
| 57 |
try:
|
| 58 |
-
# Convert to float first to get the actual value
|
| 59 |
float_val = float(s)
|
| 60 |
-
|
| 61 |
if float_val == 0:
|
| 62 |
-
return Decimal('0')
|
| 63 |
elif 0 < abs(float_val) < 1e-10:
|
| 64 |
formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
|
| 65 |
elif 0 < abs(float_val) < 1e-6:
|
|
@@ -68,144 +62,125 @@ def custom_decimal_parser(s):
|
|
| 68 |
formatted = f"{float_val:.10f}".rstrip('0').rstrip('.')
|
| 69 |
else:
|
| 70 |
formatted = f"{float_val:.8f}".rstrip('0').rstrip('.')
|
| 71 |
-
|
| 72 |
return Decimal(formatted)
|
| 73 |
-
except:
|
| 74 |
-
return Decimal(s)
|
| 75 |
|
| 76 |
def fix_scientific_notation_in_json(json_str):
|
| 77 |
"""
|
| 78 |
Fix scientific notation in JSON string before parsing.
|
| 79 |
-
This is the most aggressive approach - fix it at the string level.
|
| 80 |
"""
|
| 81 |
def replace_scientific(match):
|
| 82 |
try:
|
| 83 |
-
# Extract the full scientific notation number
|
| 84 |
scientific_num = match.group(0)
|
| 85 |
-
# Convert to float then to decimal format
|
| 86 |
float_val = float(scientific_num)
|
| 87 |
-
|
| 88 |
if float_val == 0:
|
| 89 |
return "0.0"
|
| 90 |
elif 0 < abs(float_val) < 1e-10:
|
| 91 |
-
return f"{float_val:.20f}".rstrip('0').rstrip('.') or "0"
|
| 92 |
elif 0 < abs(float_val) < 1e-6:
|
| 93 |
-
return f"{float_val:.15f}".rstrip('0').rstrip('.') or "0"
|
| 94 |
elif abs(float_val) < 1:
|
| 95 |
-
return f"{float_val:.10f}".rstrip('0').rstrip('.') or "0"
|
| 96 |
else:
|
| 97 |
-
return f"{float_val:.8f}".rstrip('0').rstrip('.') or "0"
|
| 98 |
except Exception as e:
|
| 99 |
-
|
| 100 |
-
return match.group(0)
|
| 101 |
-
|
| 102 |
-
# More comprehensive patterns to catch different scientific notation formats
|
| 103 |
patterns = [
|
| 104 |
-
r'-?\d+\.?\d*[eE][+-]?\d+',
|
| 105 |
-
r'-?\d+[eE][+-]?\d+',
|
| 106 |
-
r'-?\d+\.\d+[eE][+-]?\d+',
|
| 107 |
]
|
| 108 |
-
|
| 109 |
original_json = json_str
|
| 110 |
for pattern in patterns:
|
| 111 |
json_str = re.sub(pattern, replace_scientific, json_str)
|
| 112 |
-
|
| 113 |
-
# Also handle cases where scientific notation might be in quoted strings
|
| 114 |
-
# This catches cases like "value": "1.5e-7" (quoted scientific notation)
|
| 115 |
def replace_quoted_scientific(match):
|
| 116 |
-
full_match = match.group(0)
|
| 117 |
-
number_part = match.group(1)
|
| 118 |
try:
|
| 119 |
float_val = float(number_part)
|
| 120 |
if 0 < abs(float_val) < 1e-6:
|
| 121 |
-
converted = f"{float_val:.15f}".rstrip('0').rstrip('.') or "0"
|
| 122 |
else:
|
| 123 |
-
converted = f"{float_val:.10f}".rstrip('0').rstrip('.') or "0"
|
| 124 |
return f'"{converted}"'
|
| 125 |
except:
|
| 126 |
return full_match
|
| 127 |
-
|
| 128 |
-
# Pattern for quoted scientific notation
|
| 129 |
quoted_pattern = r'"(-?\d+\.?\d*[eE][+-]?\d+)"'
|
| 130 |
json_str = re.sub(quoted_pattern, replace_quoted_scientific, json_str)
|
| 131 |
-
|
| 132 |
if original_json != json_str:
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
return json_str
|
| 138 |
|
| 139 |
def convert_scientific_decimals(obj):
|
| 140 |
"""
|
| 141 |
-
Recursively convert
|
| 142 |
-
This handles cases where json.loads(parse_float=Decimal) creates Decimal('9E-7')
|
| 143 |
-
instead of the desired Decimal('0.0000009').
|
| 144 |
"""
|
| 145 |
if isinstance(obj, dict):
|
| 146 |
return {k: convert_scientific_decimals(v) for k, v in obj.items()}
|
| 147 |
elif isinstance(obj, list):
|
| 148 |
return [convert_scientific_decimals(item) for item in obj]
|
| 149 |
elif isinstance(obj, Decimal):
|
| 150 |
-
# Always convert to proper decimal format, regardless of current format
|
| 151 |
try:
|
| 152 |
-
# Convert to float to get actual numeric value
|
| 153 |
float_val = float(obj)
|
| 154 |
-
|
| 155 |
-
# Handle special cases
|
| 156 |
if float_val == 0:
|
| 157 |
-
return Decimal('0')
|
| 158 |
-
|
| 159 |
-
# For very small positive numbers, use high precision
|
| 160 |
-
if 0 < abs(float_val) < 1e-10:
|
| 161 |
formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
|
| 162 |
elif 0 < abs(float_val) < 1e-6:
|
| 163 |
formatted = f"{float_val:.15f}".rstrip('0').rstrip('.')
|
| 164 |
-
elif
|
| 165 |
formatted = f"{float_val:.10f}".rstrip('0').rstrip('.')
|
| 166 |
elif abs(float_val) < 1000000:
|
| 167 |
formatted = f"{float_val:.8f}".rstrip('0').rstrip('.')
|
| 168 |
else:
|
| 169 |
-
|
| 170 |
-
if float_val == int(float_val):
|
| 171 |
-
formatted = str(int(float_val))
|
| 172 |
-
else:
|
| 173 |
-
formatted = f"{float_val:.2f}".rstrip('0').rstrip('.')
|
| 174 |
-
|
| 175 |
-
# Ensure we have at least some decimal representation for very small numbers
|
| 176 |
if formatted == '0' and float_val != 0:
|
| 177 |
formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
|
| 178 |
-
|
| 179 |
return Decimal(formatted)
|
| 180 |
except (ValueError, OverflowError, InvalidOperation):
|
| 181 |
-
# If conversion fails, return the original
|
| 182 |
return obj
|
| 183 |
else:
|
| 184 |
return obj
|
| 185 |
|
| 186 |
def force_decimal_format(data):
|
| 187 |
"""
|
| 188 |
-
|
| 189 |
-
This is applied right before JSON encoding.
|
| 190 |
"""
|
| 191 |
if isinstance(data, dict):
|
| 192 |
result = {}
|
| 193 |
for key, value in data.items():
|
| 194 |
if key in ['unit_price', 'total_price', 'tax_amount', 'discount', 'net_amount',
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
return result
|
| 210 |
elif isinstance(data, list):
|
| 211 |
return [force_decimal_format(item) for item in data]
|
|
@@ -214,7 +189,6 @@ def force_decimal_format(data):
|
|
| 214 |
|
| 215 |
# --- END OF MODIFICATIONS ---
|
| 216 |
|
| 217 |
-
|
| 218 |
app = FastAPI()
|
| 219 |
|
| 220 |
# Configure logging
|
|
@@ -230,13 +204,12 @@ if not api_key:
|
|
| 230 |
logger.error("GOOGLE_API_KEY not set")
|
| 231 |
raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
|
| 232 |
genai.configure(api_key=api_key)
|
| 233 |
-
model = genai.GenerativeModel("gemini-2.0-flash")
|
| 234 |
|
| 235 |
-
# Set Tesseract path
|
| 236 |
-
# For Docker/Linux, this is often the correct path. For Windows/macOS, it will differ.
|
| 237 |
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
|
| 238 |
|
| 239 |
-
# In-memory caches
|
| 240 |
raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
|
| 241 |
structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
|
| 242 |
|
|
@@ -255,7 +228,7 @@ def get_text_hash(raw_text):
|
|
| 255 |
return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
|
| 256 |
|
| 257 |
async def process_image(img_bytes, filename, idx):
|
| 258 |
-
"""Process a single image
|
| 259 |
start_time = time.time()
|
| 260 |
logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
|
| 261 |
try:
|
|
@@ -297,7 +270,7 @@ async def process_with_gemini(filename: str, raw_text: str):
|
|
| 297 |
logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
|
| 298 |
return structured_data_cache[text_hash]
|
| 299 |
|
| 300 |
-
if len(raw_text) > 20000:
|
| 301 |
raw_text = raw_text[:20000]
|
| 302 |
logger.info(f"Truncated raw text for {filename} to 20000 characters, {log_memory_usage()}")
|
| 303 |
|
|
@@ -311,7 +284,6 @@ async def process_with_gemini(filename: str, raw_text: str):
|
|
| 311 |
- The 'items' list may have multiple entries, each with detailed attributes.
|
| 312 |
- If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
|
| 313 |
- Convert any date found in format: YYYY-MM-DD
|
| 314 |
-
|
| 315 |
CRITICAL: ALL numeric values must be in full decimal notation. NEVER EVER use scientific notation or exponential format:
|
| 316 |
- CORRECT: 0.0000009, 0.00000015, 0.0000002, 1500000, 0.00123
|
| 317 |
- ABSOLUTELY FORBIDDEN: 9e-7, 9E-7, 1.5e-7, 1.5E-7, 2e-7, 2E-7, 1.5e+6, 1.23e-3, any number with 'e' or 'E'
|
|
@@ -319,7 +291,6 @@ CRITICAL: ALL numeric values must be in full decimal notation. NEVER EVER use sc
|
|
| 319 |
- For large numbers like 1500000, you MUST write out all the digits: 1500000
|
| 320 |
- This is MANDATORY for: unit_price, total_price, tax_amount, discount, net_amount, sub_total, tax_total, discount_total, total_amount
|
| 321 |
- Example: if unit price is 9 * 10^-7, write it as 0.0000009, NOT 9e-7 or 9E-7
|
| 322 |
-
|
| 323 |
Raw text:
|
| 324 |
{raw_text}
|
| 325 |
|
|
@@ -388,33 +359,28 @@ Output JSON:
|
|
| 388 |
"""
|
| 389 |
response = model.generate_content(prompt)
|
| 390 |
llm_output = response.text
|
| 391 |
-
logger.info(f"Raw Gemini response: {llm_output}")
|
| 392 |
-
|
| 393 |
json_start = llm_output.find("{")
|
| 394 |
json_end = llm_output.rfind("}") + 1
|
| 395 |
json_str = llm_output[json_start:json_end]
|
| 396 |
-
|
| 397 |
-
logger.info(f"Extracted JSON before fix: {json_str}")
|
| 398 |
-
|
| 399 |
-
# CRITICAL: Fix scientific notation in the JSON string BEFORE parsing
|
| 400 |
json_str = fix_scientific_notation_in_json(json_str)
|
| 401 |
-
logger.info(f"Fixed JSON string: {json_str}")
|
| 402 |
-
|
| 403 |
-
# --- START OF MODIFICATIONS ---
|
| 404 |
-
# 2. Use custom decimal parser instead of parse_float=Decimal
|
| 405 |
-
# This ensures no scientific notation Decimals are created
|
| 406 |
structured_data = json.loads(json_str, parse_float=custom_decimal_parser)
|
| 407 |
-
|
| 408 |
-
# CRITICAL: Multiple layers of conversion to ensure proper decimal format
|
| 409 |
structured_data = convert_scientific_decimals(structured_data)
|
| 410 |
structured_data = force_decimal_format(structured_data)
|
| 411 |
-
|
| 412 |
-
|
| 413 |
structured_data_cache[text_hash] = structured_data
|
| 414 |
logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
|
| 415 |
-
|
| 416 |
-
|
|
|
|
| 417 |
logger.info(f"Final Structured Data (JSON format): {log_friendly_data}")
|
|
|
|
| 418 |
return structured_data
|
| 419 |
except Exception as e:
|
| 420 |
logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
|
|
@@ -484,7 +450,7 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
|
|
| 484 |
if not raw_text.strip():
|
| 485 |
try:
|
| 486 |
convert_start_time = time.time()
|
| 487 |
-
images = convert_from_bytes(file_bytes, dpi=150)
|
| 488 |
logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
|
| 489 |
|
| 490 |
ocr_tasks = [process_pdf_page(img, i) for i, img in enumerate(images)]
|
|
@@ -537,14 +503,7 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
|
|
| 537 |
output_data["success"] = False
|
| 538 |
|
| 539 |
logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
|
| 540 |
-
|
| 541 |
-
# --- START OF MODIFICATIONS ---
|
| 542 |
-
# 3. Use the custom encoder when returning the final JSON response.
|
| 543 |
-
# This ensures the Decimal objects are converted to strings correctly.
|
| 544 |
-
# The default JSONResponse will not handle Decimal types properly.
|
| 545 |
-
|
| 546 |
-
# Apply final decimal format enforcement before encoding
|
| 547 |
output_data = force_decimal_format(output_data)
|
| 548 |
-
encoded_data =
|
| 549 |
-
return JSONResponse(content=encoded_data)
|
| 550 |
-
# --- END OF MODIFICATIONS ---
|
|
|
|
| 1 |
from fastapi import FastAPI, File, UploadFile, HTTPException
|
|
|
|
| 2 |
from decimal import Decimal, InvalidOperation
|
| 3 |
from fastapi.encoders import jsonable_encoder
|
| 4 |
from starlette.responses import JSONResponse
|
|
|
|
| 26 |
# --- START OF MODIFICATIONS ---
|
| 27 |
|
| 28 |
# 1. Define a custom JSON encoder function
|
|
|
|
|
|
|
|
|
|
| 29 |
def custom_encoder(obj: Any) -> Any:
    """Serialize ``Decimal`` values as plain fixed-point strings.

    Written to be passed as the ``default=`` hook of ``json.dumps``.

    The value is formatted directly from the ``Decimal`` rather than through
    ``float``, so no digits are lost and very small non-zero values are not
    collapsed to ``"0"``.

    Args:
        obj: Any object the JSON encoder could not serialize by itself.

    Returns:
        For a ``Decimal``: a string without exponent notation and without
        trailing fractional zeros (``"0.0"`` for any zero). NaN, infinities
        and values whose magnitude lies outside roughly 1e-308..1e308 fall
        back to ``str(obj)``. Anything else is delegated to
        ``jsonable_encoder``.
    """
    if not isinstance(obj, Decimal):
        return jsonable_encoder(obj)

    # NaN / Infinity have no fixed-point form; checked first because
    # comparing a signaling NaN with 0 would raise.
    if not obj.is_finite():
        return str(obj)

    if obj == 0:
        return "0.0"

    # Guard against absurd exponents that would expand into huge strings.
    if abs(obj.adjusted()) > 308:
        return str(obj)

    text = format(obj, "f")
    if "." in text:
        # Drop insignificant trailing zeros ("1.50" -> "1.5", "2.0" -> "2").
        text = text.rstrip("0").rstrip(".")
    return text
|
| 48 |
|
| 49 |
def custom_decimal_parser(s):
|
| 50 |
"""
|
| 51 |
Custom parser that ensures numbers are converted to decimal format.
|
|
|
|
| 52 |
"""
|
| 53 |
try:
|
|
|
|
| 54 |
float_val = float(s)
|
|
|
|
| 55 |
if float_val == 0:
|
| 56 |
+
return Decimal('0.0')
|
| 57 |
elif 0 < abs(float_val) < 1e-10:
|
| 58 |
formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
|
| 59 |
elif 0 < abs(float_val) < 1e-6:
|
|
|
|
| 62 |
formatted = f"{float_val:.10f}".rstrip('0').rstrip('.')
|
| 63 |
else:
|
| 64 |
formatted = f"{float_val:.8f}".rstrip('0').rstrip('.')
|
|
|
|
| 65 |
return Decimal(formatted)
|
| 66 |
+
except (ValueError, InvalidOperation):
|
| 67 |
+
return Decimal(str(s))
|
| 68 |
|
| 69 |
def fix_scientific_notation_in_json(json_str):
    """Rewrite exponent-form numbers in a JSON text as plain decimals.

    The text is scanned once. JSON string literals are recognised as whole
    tokens so that exponent-looking fragments inside ordinary strings (for
    example a product code such as ``"A1E10-X"``) are left untouched. A
    string whose entire content is an exponent-form number is still
    converted and stays quoted.

    Numbers are expanded with ``Decimal`` instead of ``float``, so no digits
    are lost, tiny values are not turned into ``0``, and large exponents
    never produce ``inf`` (which is not valid JSON).

    Args:
        json_str: JSON text, typically extracted from the model response.

    Returns:
        The text with every exponent-form number written in fixed-point
        notation. Zero is written as ``0.0``. Numbers whose magnitude lies
        outside roughly 1e-308..1e308 are left as they were, to avoid
        expanding them into enormous digit strings.
    """
    number_pattern = r'-?\d+\.?\d*[eE][+-]?\d+'
    # Either one complete JSON string literal or one bare exponent number.
    token_re = re.compile(r'"(?:[^"\\]|\\.)*"|' + number_pattern)
    max_magnitude = 308

    def expand(token):
        """Return ``token`` in fixed-point form, or unchanged on failure."""
        try:
            value = Decimal(token)
            if value == 0:
                return "0.0"
            if abs(value.adjusted()) > max_magnitude:
                return token
            text = format(value, "f")
            if "." in text:
                text = text.rstrip("0").rstrip(".")
            return text
        except (InvalidOperation, ValueError) as e:
            logger.error(f"Error converting {token}: {e}")
            return token

    def replace_token(match):
        token = match.group(0)
        if not token.startswith('"'):
            return expand(token)
        inner = token[1:-1]
        # Only strings that are *entirely* an exponent number are rewritten.
        if re.fullmatch(number_pattern, inner):
            return f'"{expand(inner)}"'
        return token

    fixed_json = token_re.sub(replace_token, json_str)

    if fixed_json != json_str:
        logger.info("JSON transformation occurred")
        logger.info(f"Original: {json_str[:200]}...")
        logger.info(f"Fixed: {fixed_json[:200]}...")

    return fixed_json
|
| 122 |
|
| 123 |
def convert_scientific_decimals(obj):
    """Recursively normalise ``Decimal`` values inside dicts and lists.

    Each finite ``Decimal`` is rebuilt from its own fixed-point rendering,
    with trailing fractional zeros removed. Positive exponents are expanded
    (``Decimal('1.5E+6')`` becomes ``Decimal('1500000')``). The conversion
    never goes through ``float``, so no digits are rounded away and small
    non-zero values are not replaced by zero.

    Note: ``Decimal('9E-7')`` and ``Decimal('0.0000009')`` are the same
    value with the same internal representation, and ``str()`` renders both
    with an exponent. Use ``format(value, 'f')`` when a plain decimal string
    is needed for output.

    Args:
        obj: A dict, list, ``Decimal`` or any other value.

    Returns:
        A new dict/list with converted contents, a normalised ``Decimal``
        (``Decimal('0.0')`` for any zero), or ``obj`` itself when it is not
        one of those types. NaN, infinities and values whose magnitude lies
        outside roughly 1e-308..1e308 are returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: convert_scientific_decimals(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_scientific_decimals(item) for item in obj]
    if not isinstance(obj, Decimal):
        return obj

    # Checked before any comparison: a signaling NaN raises on ``== 0``.
    if not obj.is_finite():
        return obj

    if obj == 0:
        return Decimal('0.0')

    # Avoid expanding absurd exponents into huge digit strings.
    if abs(obj.adjusted()) > 308:
        return obj

    formatted = format(obj, 'f')
    if '.' in formatted:
        formatted = formatted.rstrip('0').rstrip('.')
    return Decimal(formatted)
|
| 153 |
|
| 154 |
def force_decimal_format(data):
|
| 155 |
"""
|
| 156 |
+
Ensure all numeric values are in proper decimal format before JSON encoding.
|
|
|
|
| 157 |
"""
|
| 158 |
if isinstance(data, dict):
|
| 159 |
result = {}
|
| 160 |
for key, value in data.items():
|
| 161 |
if key in ['unit_price', 'total_price', 'tax_amount', 'discount', 'net_amount',
|
| 162 |
+
'sub_total', 'tax_total', 'discount_total', 'total_amount', 'tax_rate']:
|
| 163 |
+
if isinstance(value, dict) and 'value' in value:
|
| 164 |
+
if isinstance(value['value'], (Decimal, float, int)):
|
| 165 |
+
try:
|
| 166 |
+
float_val = float(value['value'])
|
| 167 |
+
if float_val == 0:
|
| 168 |
+
decimal_str = "0.0"
|
| 169 |
+
elif 0 < abs(float_val) < 1e-10:
|
| 170 |
+
decimal_str = f"{float_val:.20f}".rstrip('0').rstrip('.')
|
| 171 |
+
elif 0 < abs(float_val) < 1e-6:
|
| 172 |
+
decimal_str = f"{float_val:.15f}".rstrip('0').rstrip('.')
|
| 173 |
+
else:
|
| 174 |
+
decimal_str = f"{float_val:.10f}".rstrip('0').rstrip('.')
|
| 175 |
+
result[key] = {'value': Decimal(decimal_str), 'accuracy': value['accuracy']}
|
| 176 |
+
except (ValueError, InvalidOperation):
|
| 177 |
+
result[key] = value
|
| 178 |
+
else:
|
| 179 |
+
result[key] = value
|
| 180 |
+
else:
|
| 181 |
+
result[key] = force_decimal_format(value)
|
| 182 |
+
else:
|
| 183 |
+
result[key] = force_decimal_format(value)
|
| 184 |
return result
|
| 185 |
elif isinstance(data, list):
|
| 186 |
return [force_decimal_format(item) for item in data]
|
|
|
|
| 189 |
|
| 190 |
# --- END OF MODIFICATIONS ---
|
| 191 |
|
|
|
|
| 192 |
app = FastAPI()
|
| 193 |
|
| 194 |
# Configure logging
|
|
|
|
| 204 |
logger.error("GOOGLE_API_KEY not set")
|
| 205 |
raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
|
| 206 |
genai.configure(api_key=api_key)
|
| 207 |
+
model = genai.GenerativeModel("gemini-2.0-flash")
|
| 208 |
|
| 209 |
+
# Set Tesseract path
|
|
|
|
| 210 |
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
|
| 211 |
|
| 212 |
+
# In-memory caches
|
| 213 |
raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
|
| 214 |
structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
|
| 215 |
|
|
|
|
| 228 |
return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
|
| 229 |
|
| 230 |
async def process_image(img_bytes, filename, idx):
|
| 231 |
+
"""Process a single image with OCR."""
|
| 232 |
start_time = time.time()
|
| 233 |
logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
|
| 234 |
try:
|
|
|
|
| 270 |
logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
|
| 271 |
return structured_data_cache[text_hash]
|
| 272 |
|
| 273 |
+
if len(raw_text) > 20000:
|
| 274 |
raw_text = raw_text[:20000]
|
| 275 |
logger.info(f"Truncated raw text for {filename} to 20000 characters, {log_memory_usage()}")
|
| 276 |
|
|
|
|
| 284 |
- The 'items' list may have multiple entries, each with detailed attributes.
|
| 285 |
- If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
|
| 286 |
- Convert any date found in format: YYYY-MM-DD
|
|
|
|
| 287 |
CRITICAL: ALL numeric values must be in full decimal notation. NEVER EVER use scientific notation or exponential format:
|
| 288 |
- CORRECT: 0.0000009, 0.00000015, 0.0000002, 1500000, 0.00123
|
| 289 |
- ABSOLUTELY FORBIDDEN: 9e-7, 9E-7, 1.5e-7, 1.5E-7, 2e-7, 2E-7, 1.5e+6, 1.23e-3, any number with 'e' or 'E'
|
|
|
|
| 291 |
- For large numbers like 1500000, you MUST write out all the digits: 1500000
|
| 292 |
- This is MANDATORY for: unit_price, total_price, tax_amount, discount, net_amount, sub_total, tax_total, discount_total, total_amount
|
| 293 |
- Example: if unit price is 9 * 10^-7, write it as 0.0000009, NOT 9e-7 or 9E-7
|
|
|
|
| 294 |
Raw text:
|
| 295 |
{raw_text}
|
| 296 |
|
|
|
|
| 359 |
"""
|
| 360 |
response = model.generate_content(prompt)
|
| 361 |
llm_output = response.text
|
| 362 |
+
logger.info(f"Raw Gemini response: {llm_output}")
|
| 363 |
+
|
| 364 |
json_start = llm_output.find("{")
|
| 365 |
json_end = llm_output.rfind("}") + 1
|
| 366 |
json_str = llm_output[json_start:json_end]
|
| 367 |
+
|
| 368 |
+
logger.info(f"Extracted JSON before fix: {json_str}")
|
| 369 |
+
|
|
|
|
| 370 |
json_str = fix_scientific_notation_in_json(json_str)
|
| 371 |
+
logger.info(f"Fixed JSON string: {json_str}")
|
| 372 |
+
|
|
|
|
|
|
|
|
|
|
| 373 |
structured_data = json.loads(json_str, parse_float=custom_decimal_parser)
|
|
|
|
|
|
|
| 374 |
structured_data = convert_scientific_decimals(structured_data)
|
| 375 |
structured_data = force_decimal_format(structured_data)
|
| 376 |
+
|
|
|
|
| 377 |
structured_data_cache[text_hash] = structured_data
|
| 378 |
logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
|
| 379 |
+
|
| 380 |
+
# Log structured data with custom encoder to avoid scientific notation in logs
|
| 381 |
+
log_friendly_data = json.dumps(structured_data, default=custom_encoder)
|
| 382 |
logger.info(f"Final Structured Data (JSON format): {log_friendly_data}")
|
| 383 |
+
|
| 384 |
return structured_data
|
| 385 |
except Exception as e:
|
| 386 |
logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
|
|
|
|
| 450 |
if not raw_text.strip():
|
| 451 |
try:
|
| 452 |
convert_start_time = time.time()
|
| 453 |
+
images = convert_from_bytes(file_bytes, dpi=150)
|
| 454 |
logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
|
| 455 |
|
| 456 |
ocr_tasks = [process_pdf_page(img, i) for i, img in enumerate(images)]
|
|
|
|
| 503 |
output_data["success"] = False
|
| 504 |
|
| 505 |
logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
|
| 506 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
output_data = force_decimal_format(output_data)
|
| 508 |
+
encoded_data = json.dumps(output_data, default=custom_encoder)
|
| 509 |
+
return JSONResponse(content=json.loads(encoded_data))
|
|
|