Spaces:

satvaSolutions
/

pdf-ocr

Sleeping

App Files Files Community

ChintanSatva committed on Jul 29, 2025

Commit

fa77ec8

verified ·

1 Parent(s): a58bf1e

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -130

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 from fastapi import FastAPI, File, UploadFile, HTTPException
-# Import Decimal and the custom JSONResponse
 from decimal import Decimal, InvalidOperation
 from fastapi.encoders import jsonable_encoder
 from starlette.responses import JSONResponse
@@ -27,39 +26,34 @@ from dotenv import load_dotenv
 # --- START OF MODIFICATIONS ---
 # 1. Define a custom JSON encoder function
-# This function checks if an object is of type Decimal. If it is, it converts it
-# to a string to preserve its exact formatting. Otherwise, it lets the default
-# encoder handle it. This prevents FastAPI from converting Decimals back to floats.
 def custom_encoder(obj: Any) -> Any:
     if isinstance(obj, Decimal):
-        result = str(obj)
-        # Final safety check: if somehow scientific notation still exists, fix it
-        if 'E' in result or 'e' in result:
-            try:
-                float_val = float(obj)
-                if abs(float_val) == 0:
-                    result = "0"
-                elif 0 < abs(float_val) < 1e-6:
-                    result = f"{float_val:.15f}".rstrip('0').rstrip('.')
-                else:
-                    result = f"{float_val:.10f}".rstrip('0').rstrip('.')
-            except:
-                pass
-        return result
-    # For any other type, fall back to the default encoder
     return jsonable_encoder(obj)
 def custom_decimal_parser(s):
     """
     Custom parser that ensures numbers are converted to decimal format.
-    This completely bypasses the scientific notation issue.
     """
     try:
-        # Convert to float first to get the actual value
         float_val = float(s)
         if float_val == 0:
-            return Decimal('0')
         elif 0 < abs(float_val) < 1e-10:
             formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
         elif 0 < abs(float_val) < 1e-6:
@@ -68,144 +62,125 @@ def custom_decimal_parser(s):
             formatted = f"{float_val:.10f}".rstrip('0').rstrip('.')
         else:
             formatted = f"{float_val:.8f}".rstrip('0').rstrip('.')
         return Decimal(formatted)
-    except:
-        return Decimal(s)
 def fix_scientific_notation_in_json(json_str):
     """
     Fix scientific notation in JSON string before parsing.
-    This is the most aggressive approach - fix it at the string level.
     """
     def replace_scientific(match):
         try:
-            # Extract the full scientific notation number
             scientific_num = match.group(0)
-            # Convert to float then to decimal format
             float_val = float(scientific_num)
             if float_val == 0:
                 return "0.0"
             elif 0 < abs(float_val) < 1e-10:
-                return f"{float_val:.20f}".rstrip('0').rstrip('.') or "0"
             elif 0 < abs(float_val) < 1e-6:
-                return f"{float_val:.15f}".rstrip('0').rstrip('.') or "0"
             elif abs(float_val) < 1:
-                return f"{float_val:.10f}".rstrip('0').rstrip('.') or "0"
             else:
-                return f"{float_val:.8f}".rstrip('0').rstrip('.') or "0"
         except Exception as e:
-            print(f"Error converting {match.group(0)}: {e}")
-            return match.group(0)  # Return original if conversion fails
-    # More comprehensive patterns to catch different scientific notation formats
     patterns = [
-        r'-?\d+\.?\d*[eE][+-]?\d+',  # Standard: 1.5e-7, 9E-7
-        r'-?\d+[eE][+-]?\d+',        # Without decimal: 9e-7
-        r'-?\d+\.\d+[eE][+-]?\d+',   # With decimal: 1.5e-7
     ]
     original_json = json_str
     for pattern in patterns:
         json_str = re.sub(pattern, replace_scientific, json_str)
-    # Also handle cases where scientific notation might be in quoted strings
-    # This catches cases like "value": "1.5e-7" (quoted scientific notation)
     def replace_quoted_scientific(match):
-        full_match = match.group(0)  # "1.5e-7"
-        number_part = match.group(1)  # 1.5e-7
         try:
             float_val = float(number_part)
             if 0 < abs(float_val) < 1e-6:
-                converted = f"{float_val:.15f}".rstrip('0').rstrip('.') or "0"
             else:
-                converted = f"{float_val:.10f}".rstrip('0').rstrip('.') or "0"
             return f'"{converted}"'
         except:
             return full_match
-    # Pattern for quoted scientific notation
     quoted_pattern = r'"(-?\d+\.?\d*[eE][+-]?\d+)"'
     json_str = re.sub(quoted_pattern, replace_quoted_scientific, json_str)
     if original_json != json_str:
-        print(f"JSON transformation occurred")
-        print(f"Original: {original_json[:200]}...")
-        print(f"Fixed: {json_str[:200]}...")
     return json_str
 def convert_scientific_decimals(obj):
     """
-    Recursively convert any Decimal objects in scientific notation to decimal notation.
-    This handles cases where json.loads(parse_float=Decimal) creates Decimal('9E-7')
-    instead of the desired Decimal('0.0000009').
     """
     if isinstance(obj, dict):
         return {k: convert_scientific_decimals(v) for k, v in obj.items()}
     elif isinstance(obj, list):
         return [convert_scientific_decimals(item) for item in obj]
     elif isinstance(obj, Decimal):
-        # Always convert to proper decimal format, regardless of current format
         try:
-            # Convert to float to get actual numeric value
             float_val = float(obj)
-            # Handle special cases
             if float_val == 0:
-                return Decimal('0')
-            # For very small positive numbers, use high precision
-            if 0 < abs(float_val) < 1e-10:
                 formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
             elif 0 < abs(float_val) < 1e-6:
                 formatted = f"{float_val:.15f}".rstrip('0').rstrip('.')
-            elif 0 < abs(float_val) < 1:
                 formatted = f"{float_val:.10f}".rstrip('0').rstrip('.')
             elif abs(float_val) < 1000000:
                 formatted = f"{float_val:.8f}".rstrip('0').rstrip('.')
             else:
-                # For large numbers, don't use decimal places if they're all zeros
-                if float_val == int(float_val):
-                    formatted = str(int(float_val))
-                else:
-                    formatted = f"{float_val:.2f}".rstrip('0').rstrip('.')
-            # Ensure we have at least some decimal representation for very small numbers
             if formatted == '0' and float_val != 0:
                 formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
             return Decimal(formatted)
         except (ValueError, OverflowError, InvalidOperation):
-            # If conversion fails, return the original
             return obj
     else:
         return obj
 def force_decimal_format(data):
     """
-    Additional layer to ensure all numeric values are in proper decimal format.
-    This is applied right before JSON encoding.
     """
     if isinstance(data, dict):
         result = {}
         for key, value in data.items():
             if key in ['unit_price', 'total_price', 'tax_amount', 'discount', 'net_amount',
-                      'sub_total', 'tax_total', 'discount_total', 'total_amount', 'tax_rate'] and isinstance(value, dict) and 'value' in value:
-                # Special handling for numeric fields
-                if isinstance(value['value'], (Decimal, float, int)) and value['value'] != 0:
-                    try:
-                        float_val = float(value['value'])
-                        if 0 < abs(float_val) < 1e-6:
-                            # Very small numbers - use high precision
-                            decimal_str = f"{float_val:.15f}".rstrip('0').rstrip('.')
-                        else:
-                            decimal_str = f"{float_val:.10f}".rstrip('0').rstrip('.')
-                        value['value'] = Decimal(decimal_str)
-                    except:
-                        pass
-            result[key] = force_decimal_format(value)
         return result
     elif isinstance(data, list):
         return [force_decimal_format(item) for item in data]
@@ -214,7 +189,6 @@ def force_decimal_format(data):
 # --- END OF MODIFICATIONS ---
 app = FastAPI()
 # Configure logging
@@ -230,13 +204,12 @@ if not api_key:
     logger.error("GOOGLE_API_KEY not set")
     raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
 genai.configure(api_key=api_key)
-model = genai.GenerativeModel("gemini-2.0-flash") # Using a recommended model
-# Set Tesseract path (adjust if necessary)
-# For Docker/Linux, this is often the correct path. For Windows/macOS, it will differ.
 pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
-# In-memory caches (1-hour TTL)
 raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
 structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
@@ -255,7 +228,7 @@ def get_text_hash(raw_text):
     return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
 async def process_image(img_bytes, filename, idx):
-    """Process a single image (JPG/JPEG/PNG) with OCR."""
     start_time = time.time()
     logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
     try:
@@ -297,7 +270,7 @@ async def process_with_gemini(filename: str, raw_text: str):
         logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
         return structured_data_cache[text_hash]
-    if len(raw_text) > 20000: # Increased limit slightly
         raw_text = raw_text[:20000]
         logger.info(f"Truncated raw text for {filename} to 20000 characters, {log_memory_usage()}")
@@ -311,7 +284,6 @@ async def process_with_gemini(filename: str, raw_text: str):
 - The 'items' list may have multiple entries, each with detailed attributes.
 - If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
 - Convert any date found in format: YYYY-MM-DD
 CRITICAL: ALL numeric values must be in full decimal notation. NEVER EVER use scientific notation or exponential format:
 - CORRECT: 0.0000009, 0.00000015, 0.0000002, 1500000, 0.00123
 - ABSOLUTELY FORBIDDEN: 9e-7, 9E-7, 1.5e-7, 1.5E-7, 2e-7, 2E-7, 1.5e+6, 1.23e-3, any number with 'e' or 'E'
@@ -319,7 +291,6 @@ CRITICAL: ALL numeric values must be in full decimal notation. NEVER EVER use sc
 - For large numbers like 1500000, you MUST write out all the digits: 1500000
 - This is MANDATORY for: unit_price, total_price, tax_amount, discount, net_amount, sub_total, tax_total, discount_total, total_amount
 - Example: if unit price is 9 * 10^-7, write it as 0.0000009, NOT 9e-7 or 9E-7
 Raw text:
 {raw_text}
@@ -388,33 +359,28 @@ Output JSON:
         """
         response = model.generate_content(prompt)
         llm_output = response.text
-        logger.info(f"Raw Gemini response: {llm_output}")  # Debug: see what Gemini actually returns
         json_start = llm_output.find("{")
         json_end = llm_output.rfind("}") + 1
         json_str = llm_output[json_start:json_end]
-        logger.info(f"Extracted JSON before fix: {json_str}")  # Debug: see extracted JSON
-        # CRITICAL: Fix scientific notation in the JSON string BEFORE parsing
         json_str = fix_scientific_notation_in_json(json_str)
-        logger.info(f"Fixed JSON string: {json_str}")  # Debug: see if regex worked
-        # --- START OF MODIFICATIONS ---
-        # 2. Use custom decimal parser instead of parse_float=Decimal
-        # This ensures no scientific notation Decimals are created
         structured_data = json.loads(json_str, parse_float=custom_decimal_parser)
-        # CRITICAL: Multiple layers of conversion to ensure proper decimal format
         structured_data = convert_scientific_decimals(structured_data)
         structured_data = force_decimal_format(structured_data)
-        # --- END OF MODIFICATIONS ---
         structured_data_cache[text_hash] = structured_data
         logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
-        # Log the final converted data with proper string representation
-        log_friendly_data = custom_encoder(structured_data)
         logger.info(f"Final Structured Data (JSON format): {log_friendly_data}")
         return structured_data
     except Exception as e:
         logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
@@ -484,7 +450,7 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
                 if not raw_text.strip():
                     try:
                         convert_start_time = time.time()
-                        images = convert_from_bytes(file_bytes, dpi=150) # Use 150 dpi as a balance
                         logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
                         ocr_tasks = [process_pdf_page(img, i) for i, img in enumerate(images)]
@@ -537,14 +503,7 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
         output_data["success"] = False
     logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
-    # --- START OF MODIFICATIONS ---
-    # 3. Use the custom encoder when returning the final JSON response.
-    # This ensures the Decimal objects are converted to strings correctly.
-    # The default JSONResponse will not handle Decimal types properly.
-    # Apply final decimal format enforcement before encoding
     output_data = force_decimal_format(output_data)
-    encoded_data = custom_encoder(output_data)
-    return JSONResponse(content=encoded_data)
-    # --- END OF MODIFICATIONS ---

 from fastapi import FastAPI, File, UploadFile, HTTPException
 from decimal import Decimal, InvalidOperation
 from fastapi.encoders import jsonable_encoder
 from starlette.responses import JSONResponse
 # --- START OF MODIFICATIONS ---
 # 1. Define a custom JSON encoder function
 def custom_encoder(obj: Any) -> Any:
     """Encode a value for JSON, rendering Decimals as plain-notation strings.

     A Decimal is returned as a string in fixed-point notation (never in
     exponent form) with trailing fractional zeros removed; zero is rendered
     as "0.0". Any other object is handed to ``jsonable_encoder``.

     Args:
         obj: The value to encode.

     Returns:
         A ``str`` for Decimal input, otherwise the result of
         ``jsonable_encoder(obj)``.
     """
     if isinstance(obj, Decimal):
         # NaN and Infinity have no fixed-point form; use their string form.
         if not obj.is_finite():
             return str(obj)
         if obj.is_zero():
             return "0.0"
         # Format the Decimal directly instead of going through float: the
         # 'f' presentation type expands the exponent exactly, so no digits
         # are lost and very small values do not collapse to "0".
         text = format(obj, 'f')
         if '.' in text:
             text = text.rstrip('0').rstrip('.')
         return text
     return jsonable_encoder(obj)
 def custom_decimal_parser(s):
     """
     Custom parser that ensures numbers are converted to decimal format.
     """
     try:
         float_val = float(s)
         if float_val == 0:
+            return Decimal('0.0')
         elif 0 < abs(float_val) < 1e-10:
             formatted = f"{float_val:.20f}".rstrip('0').rstrip('.')
         elif 0 < abs(float_val) < 1e-6:
             formatted = f"{float_val:.10f}".rstrip('0').rstrip('.')
         else:
             formatted = f"{float_val:.8f}".rstrip('0').rstrip('.')
         return Decimal(formatted)
+    except (ValueError, InvalidOperation):
+        return Decimal(str(s))
 def fix_scientific_notation_in_json(json_str):
     """Rewrite exponent-form numbers in a JSON text as plain decimals.

     The text is scanned once. Bare number tokens written in scientific
     notation (e.g. ``9e-7``, ``1.5E+6``) are expanded to fixed-point form,
     and so are string literals whose entire content is such a number (e.g.
     ``"1.5e-7"``). All other string literals are left untouched, so an
     identifier such as ``"INV2E5"`` is not mistaken for a number.

     Args:
         json_str: The JSON text to clean up (not yet parsed).

     Returns:
         The text with scientific-notation numbers expanded. When nothing
         needed rewriting the input is returned unchanged.
     """
     # Either a complete JSON string literal, or a bare exponent-form number.
     # Matching string literals as whole tokens is what keeps digits inside
     # ordinary strings from being rewritten.
     token_re = re.compile(r'"(?:[^"\\]|\\.)*"|-?\d+\.?\d*[eE][+-]?\d+')
     sci_re = re.compile(r'-?\d+\.?\d*[eE][+-]?\d+')
     # Exponents beyond this magnitude are left as written rather than
     # expanded into a string hundreds of digits (or more) long.
     max_exponent = 400
     def expand(token):
         try:
             value = Decimal(token)
         except InvalidOperation as e:
             logger.error(f"Error converting {token}: {e}")
             return token
         if value.is_zero():
             return "0.0"
         if abs(value.adjusted()) > max_exponent:
             return token
         # Exact expansion; no float round-trip, so no digits are lost.
         text = format(value, 'f')
         if '.' in text:
             text = text.rstrip('0').rstrip('.')
         return text
     def replace_token(match):
         token = match.group(0)
         if not token.startswith('"'):
             return expand(token)
         inner = token[1:-1]
         # Only a string that is nothing but an exponent-form number is
         # converted; it stays quoted.
         if sci_re.fullmatch(inner):
             return f'"{expand(inner)}"'
         return token
     original_json = json_str
     json_str = token_re.sub(replace_token, json_str)
     if original_json != json_str:
         logger.info(f"JSON transformation occurred")
         logger.info(f"Original: {original_json[:200]}...")
         logger.info(f"Fixed: {json_str[:200]}...")
     return json_str
 def convert_scientific_decimals(obj):
     """Recursively rewrite Decimals so they carry no positive exponent.

     Dicts and lists are rebuilt with every contained value converted. Each
     finite Decimal is replaced by an equal Decimal built from its exact
     fixed-point text with trailing fractional zeros removed (for example
     ``Decimal('1.5E+6')`` becomes ``Decimal('1500000')``). Zero becomes
     ``Decimal('0.0')``. Values of any other type are returned as-is.

     Args:
         obj: A dict, list, Decimal, or any other value.

     Returns:
         A new structure of the same shape with Decimals converted; NaN,
         Infinity and Decimals with an extreme exponent are returned
         unchanged.
     """
     if isinstance(obj, dict):
         return {k: convert_scientific_decimals(v) for k, v in obj.items()}
     if isinstance(obj, list):
         return [convert_scientific_decimals(item) for item in obj]
     if not isinstance(obj, Decimal):
         return obj
     # Non-finite values have no fixed-point form.
     if not obj.is_finite():
         return obj
     if obj.is_zero():
         return Decimal('0.0')
     # Avoid expanding absurd exponents into enormous digit strings.
     if abs(obj.adjusted()) > 400:
         return obj
     # Work on the exact decimal text rather than a float, so the numeric
     # value is preserved digit for digit.
     text = format(obj, 'f')
     if '.' in text:
         text = text.rstrip('0').rstrip('.')
     try:
         return Decimal(text)
     except InvalidOperation:
         return obj
 def force_decimal_format(data):
     """
+    Ensure all numeric values are in proper decimal format before JSON encoding.
     """
     if isinstance(data, dict):
         result = {}
         for key, value in data.items():
             if key in ['unit_price', 'total_price', 'tax_amount', 'discount', 'net_amount',
+                       'sub_total', 'tax_total', 'discount_total', 'total_amount', 'tax_rate']:
+                if isinstance(value, dict) and 'value' in value:
+                    if isinstance(value['value'], (Decimal, float, int)):
+                        try:
+                            float_val = float(value['value'])
+                            if float_val == 0:
+                                decimal_str = "0.0"
+                            elif 0 < abs(float_val) < 1e-10:
+                                decimal_str = f"{float_val:.20f}".rstrip('0').rstrip('.')
+                            elif 0 < abs(float_val) < 1e-6:
+                                decimal_str = f"{float_val:.15f}".rstrip('0').rstrip('.')
+                            else:
+                                decimal_str = f"{float_val:.10f}".rstrip('0').rstrip('.')
+                            result[key] = {'value': Decimal(decimal_str), 'accuracy': value['accuracy']}
+                        except (ValueError, InvalidOperation):
+                            result[key] = value
+                    else:
+                        result[key] = value
+                else:
+                    result[key] = force_decimal_format(value)
+            else:
+                result[key] = force_decimal_format(value)
         return result
     elif isinstance(data, list):
         return [force_decimal_format(item) for item in data]
 # --- END OF MODIFICATIONS ---
 app = FastAPI()
 # Configure logging
     logger.error("GOOGLE_API_KEY not set")
     raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
 genai.configure(api_key=api_key)
+model = genai.GenerativeModel("gemini-2.0-flash")
+# Set Tesseract path
 pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
+# In-memory caches
 raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
 structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
     return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
 async def process_image(img_bytes, filename, idx):
+    """Process a single image with OCR."""
     start_time = time.time()
     logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
     try:
         logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
         return structured_data_cache[text_hash]
+    if len(raw_text) > 20000:
         raw_text = raw_text[:20000]
         logger.info(f"Truncated raw text for {filename} to 20000 characters, {log_memory_usage()}")
 - The 'items' list may have multiple entries, each with detailed attributes.
 - If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
 - Convert any date found in format: YYYY-MM-DD
 CRITICAL: ALL numeric values must be in full decimal notation. NEVER EVER use scientific notation or exponential format:
 - CORRECT: 0.0000009, 0.00000015, 0.0000002, 1500000, 0.00123
 - ABSOLUTELY FORBIDDEN: 9e-7, 9E-7, 1.5e-7, 1.5E-7, 2e-7, 2E-7, 1.5e+6, 1.23e-3, any number with 'e' or 'E'
 - For large numbers like 1500000, you MUST write out all the digits: 1500000
 - This is MANDATORY for: unit_price, total_price, tax_amount, discount, net_amount, sub_total, tax_total, discount_total, total_amount
 - Example: if unit price is 9 * 10^-7, write it as 0.0000009, NOT 9e-7 or 9E-7
 Raw text:
 {raw_text}
         """
         response = model.generate_content(prompt)
         llm_output = response.text
+        logger.info(f"Raw Gemini response: {llm_output}")
         json_start = llm_output.find("{")
         json_end = llm_output.rfind("}") + 1
         json_str = llm_output[json_start:json_end]
+        logger.info(f"Extracted JSON before fix: {json_str}")
         json_str = fix_scientific_notation_in_json(json_str)
+        logger.info(f"Fixed JSON string: {json_str}")
         structured_data = json.loads(json_str, parse_float=custom_decimal_parser)
         structured_data = convert_scientific_decimals(structured_data)
         structured_data = force_decimal_format(structured_data)
         structured_data_cache[text_hash] = structured_data
         logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
+        # Log structured data with custom encoder to avoid scientific notation in logs
+        log_friendly_data = json.dumps(structured_data, default=custom_encoder)
         logger.info(f"Final Structured Data (JSON format): {log_friendly_data}")
         return structured_data
     except Exception as e:
         logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
                 if not raw_text.strip():
                     try:
                         convert_start_time = time.time()
+                        images = convert_from_bytes(file_bytes, dpi=150)
                         logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
                         ocr_tasks = [process_pdf_page(img, i) for i, img in enumerate(images)]
         output_data["success"] = False
     logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
     output_data = force_decimal_format(output_data)
+    encoded_data = json.dumps(output_data, default=custom_encoder)
+    return JSONResponse(content=json.loads(encoded_data))