Spaces:

satvaSolutions
/

pdf-ocr

Sleeping

App Files Files Community

ChintanSatva committed on Jul 29, 2025

Commit

1404125

verified ·

1 Parent(s): f12bf00

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -47

app.py CHANGED Viewed

@@ -1,4 +1,8 @@
 from fastapi import FastAPI, File, UploadFile, HTTPException
 import pytesseract
 import cv2
 import os
@@ -8,7 +12,7 @@ import unicodedata
 from pdf2image import convert_from_bytes
 from pypdf import PdfReader
 import numpy as np
-from typing import List
 import io
 import logging
 import time
@@ -18,7 +22,22 @@ import cachetools
 import hashlib
 import google.generativeai as genai
 from dotenv import load_dotenv
-from decimal import Decimal
 app = FastAPI()
@@ -35,9 +54,10 @@ if not api_key:
     logger.error("GOOGLE_API_KEY not set")
     raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
 genai.configure(api_key=api_key)
-model = genai.GenerativeModel("gemini-2.5-flash")
-# Set Tesseract path
 pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 # In-memory caches (1-hour TTL)
@@ -67,7 +87,7 @@ async def process_image(img_bytes, filename, idx):
         img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
         gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
         img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
-        custom_config = r'--oem 1 --psm 6 -l eng+ara'  # Reduced for performance
         page_text = pytesseract.image_to_string(img_pil, config=custom_config)
         logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
         return page_text + "\n"
@@ -83,7 +103,7 @@ async def process_pdf_page(img, page_idx):
         img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
         gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
         img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
-        custom_config = r'--oem 1 --psm 6 -l eng+ara'  # Reduced for performance
         page_text = pytesseract.image_to_string(img_pil, config=custom_config)
         logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
         return page_text + "\n"
@@ -96,16 +116,14 @@ async def process_with_gemini(filename: str, raw_text: str):
     start_time = time.time()
     logger.info(f"Starting Gemini processing for {filename}, {log_memory_usage()}")
-    # Check structured data cache
     text_hash = get_text_hash(raw_text)
     if text_hash in structured_data_cache:
         logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
         return structured_data_cache[text_hash]
-    # Truncate text for Gemini
-    if len(raw_text) > 10000:
-        raw_text = raw_text[:10000]
-        logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")
     try:
         prompt = f"""You are an intelligent invoice data extractor. Given raw text from an invoice (in English or other languages),
@@ -117,6 +135,11 @@ async def process_with_gemini(filename: str, raw_text: str):
 - The 'items' list may have multiple entries, each with detailed attributes.
 - If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
 - Convert any date found in format :  YYYY-MM-DD
 Raw text:
 {raw_text}
@@ -189,10 +212,18 @@ Output JSON:
         json_start = llm_output.find("{")
         json_end = llm_output.rfind("}") + 1
         json_str = llm_output[json_start:json_end]
         structured_data = json.loads(json_str, parse_float=Decimal)
         structured_data_cache[text_hash] = structured_data
         logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
-        print("Structured Data", structured_data)
         return structured_data
     except Exception as e:
         logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
@@ -200,7 +231,7 @@ Output JSON:
 @app.post("/ocr")
 async def extract_and_structure(files: List[UploadFile] = File(...)):
-    output_json = {
         "success": True,
         "message": "",
         "data": []
@@ -214,12 +245,11 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
         total_start_time = time.time()
         logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")
-        # Validate file format
         valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}
         file_ext = os.path.splitext(file.filename.lower())[1]
         if file_ext not in valid_extensions:
             fail_count += 1
-            output_json["data"].append({
                 "filename": file.filename,
                 "structured_data": {"error": f"Unsupported file format: {file_ext}"},
                 "error": f"Unsupported file format: {file_ext}"
@@ -227,7 +257,6 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
             logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
             continue
-        # Read file into memory
         try:
             file_start_time = time.time()
             file_bytes = await file.read()
@@ -236,7 +265,7 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
             logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
         except Exception as e:
             fail_count += 1
-            output_json["data"].append({
                 "filename": file.filename,
                 "structured_data": {"error": f"Failed to read file: {str(e)}"},
                 "error": f"Failed to read file: {str(e)}"
@@ -244,14 +273,12 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
             logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
             continue
-        # Check raw text cache
         raw_text = ""
         if file_hash in raw_text_cache:
             raw_text = raw_text_cache[file_hash]
             logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
         else:
             if file_ext == '.pdf':
-                # Try extracting embedded text
                 try:
                     extract_start_time = time.time()
                     reader = PdfReader(file_stream)
@@ -263,68 +290,67 @@ async def extract_and_structure(files: List[UploadFile] = File(...)):
                 except Exception as e:
                     logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")
-                # If no embedded text, perform OCR
                 if not raw_text.strip():
                     try:
                         convert_start_time = time.time()
-                        images = convert_from_bytes(file_bytes, poppler_path="/usr/local/bin", dpi=100)
                         logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
-                        ocr_start_time = time.time()
-                        page_texts = []
-                        for i, img in enumerate(images):
-                            page_text = await process_pdf_page(img, i)
-                            page_texts.append(page_text)
                         raw_text = "".join(page_texts)
-                        logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                     except Exception as e:
                         fail_count += 1
-                        output_json["data"].append({
                             "filename": file.filename,
                             "structured_data": {"error": f"OCR failed: {str(e)}"},
                             "error": f"OCR failed: {str(e)}"
                         })
                         logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                         continue
-            else:  # JPG/JPEG/PNG
                 try:
-                    ocr_start_time = time.time()
                     raw_text = await process_image(file_bytes, file.filename, 0)
-                    logger.info(f"Image OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                 except Exception as e:
                     fail_count += 1
-                    output_json["data"].append({
                         "filename": file.filename,
                         "structured_data": {"error": f"Image OCR failed: {str(e)}"},
                         "error": f"Image OCR failed: {str(e)}"
                     })
                     logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                     continue
-            # Normalize text
-            try:
-                normalize_start_time = time.time()
                 raw_text = unicodedata.normalize('NFKC', raw_text)
-                raw_text = raw_text.encode().decode('utf-8')
                 raw_text_cache[file_hash] = raw_text
-                logger.info(f"Text normalization for {file.filename}, took {time.time() - normalize_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
-            except Exception as e:
-                logger.warning(f"Text normalization failed for {file.filename}: {str(e)}, {log_memory_usage()}")
-        # Process with Gemini
         structured_data = await process_with_gemini(file.filename, raw_text)
-        success_count += 1
-        output_json["data"].append({
             "filename": file.filename,
             "structured_data": structured_data,
-            "error": ""
         })
         logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")
-    output_json["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
     if fail_count > 0 and success_count == 0:
-        output_json["success"] = False
     logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
-    return output_json

 from fastapi import FastAPI, File, UploadFile, HTTPException
+# Import Decimal and the custom JSONResponse
+from decimal import Decimal
+from fastapi.encoders import jsonable_encoder
+from starlette.responses import JSONResponse
 import pytesseract
 import cv2
 import os
 from pdf2image import convert_from_bytes
 from pypdf import PdfReader
 import numpy as np
+from typing import List, Any
 import io
 import logging
 import time
 import hashlib
 import google.generativeai as genai
 from dotenv import load_dotenv
+# --- START OF MODIFICATIONS ---
+# 1. Define a custom JSON encoder function
+# This function checks if an object is of type Decimal. If it is, it converts it
+# to a string to preserve its exact formatting. Otherwise, it lets the default
+# encoder handle it. This prevents FastAPI from converting Decimals back to floats.
+def custom_encoder(obj: Any) -> Any:
+    """Return a JSON-compatible copy of ``obj`` with every Decimal as a plain string.
+
+    Args:
+        obj: Any value accepted by ``jsonable_encoder`` (dicts, lists, scalars, ...).
+            Decimals may appear at the top level or nested at any depth.
+
+    Returns:
+        The encoded structure. Each ``Decimal`` is replaced by its fixed-point
+        string form (e.g. ``"0.0000009"``), never by a float and never in
+        exponent notation.
+    """
+    def _plain_decimal(value: Decimal) -> str:
+        # str(Decimal("0.0000009")) yields "9E-7"; the "f" format spec forces
+        # standard positional notation, which is what the API promises.
+        return format(value, "f")
+
+    # The per-type encoder must be handed to jsonable_encoder itself: it recurses
+    # through dicts/lists, and without the override its default Decimal handling
+    # converts nested Decimals back to int/float. A top-level isinstance check
+    # alone never sees the values inside the response dict.
+    return jsonable_encoder(obj, custom_encoder={Decimal: _plain_decimal})
+# --- END OF MODIFICATIONS ---
 app = FastAPI()
     logger.error("GOOGLE_API_KEY not set")
     raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
 genai.configure(api_key=api_key)
+model = genai.GenerativeModel("gemini-1.5-pro-latest") # Using a recommended model
+# Set Tesseract path (adjust if necessary)
+# For Docker/Linux, this is often the correct path. For Windows/macOS, it will differ.
 pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 # In-memory caches (1-hour TTL)
         img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
         gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
         img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
+        custom_config = r'--oem 1 --psm 6 -l eng+ara'
         page_text = pytesseract.image_to_string(img_pil, config=custom_config)
         logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
         return page_text + "\n"
         img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
         gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
         img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
+        custom_config = r'--oem 1 --psm 6 -l eng+ara'
         page_text = pytesseract.image_to_string(img_pil, config=custom_config)
         logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
         return page_text + "\n"
     start_time = time.time()
     logger.info(f"Starting Gemini processing for {filename}, {log_memory_usage()}")
     text_hash = get_text_hash(raw_text)
     if text_hash in structured_data_cache:
         logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
         return structured_data_cache[text_hash]
+    if len(raw_text) > 20000: # Increased limit slightly
+        raw_text = raw_text[:20000]
+        logger.info(f"Truncated raw text for {filename} to 20000 characters, {log_memory_usage()}")
     try:
         prompt = f"""You are an intelligent invoice data extractor. Given raw text from an invoice (in English or other languages),
 - The 'items' list may have multiple entries, each with detailed attributes.
 - If a field is missing or not found, return an empty value (`""` or `0`) and set `accuracy` to `0.0`.
 - Convert any date found in format :  YYYY-MM-DD
+- For Unit Price Format all numbers in standard decimal notation without exponents:
+- Correct: 0.0000009, 1500000, 0.00123
+- Incorrect: 9e-7, 1.5e+6, 1.23e-3
+This applies to all calculations, measurements, and numeric results in your response.
 Raw text:
 {raw_text}
         json_start = llm_output.find("{")
         json_end = llm_output.rfind("}") + 1
         json_str = llm_output[json_start:json_end]
+        # --- START OF MODIFICATIONS ---
+        # 2. Use `parse_float=Decimal` when loading the JSON string.
+        # This converts numbers like 0.0000009 into a high-precision Decimal
+        # object inside Python, instead of a standard float.
         structured_data = json.loads(json_str, parse_float=Decimal)
+        # --- END OF MODIFICATIONS ---
         structured_data_cache[text_hash] = structured_data
         logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
+        # This will now print Decimal('0.0000009') in your console, which is correct
+        logger.info(f"Structured Data: {structured_data}")
         return structured_data
     except Exception as e:
         logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
 @app.post("/ocr")
 async def extract_and_structure(files: List[UploadFile] = File(...)):
+    output_data = {
         "success": True,
         "message": "",
         "data": []
         total_start_time = time.time()
         logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")
         valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}
         file_ext = os.path.splitext(file.filename.lower())[1]
         if file_ext not in valid_extensions:
             fail_count += 1
+            output_data["data"].append({
                 "filename": file.filename,
                 "structured_data": {"error": f"Unsupported file format: {file_ext}"},
                 "error": f"Unsupported file format: {file_ext}"
             logger.error(f"Unsupported file format for {file.filename}: {file_ext}")
             continue
         try:
             file_start_time = time.time()
             file_bytes = await file.read()
             logger.info(f"Read file {file.filename}, took {time.time() - file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB, {log_memory_usage()}")
         except Exception as e:
             fail_count += 1
+            output_data["data"].append({
                 "filename": file.filename,
                 "structured_data": {"error": f"Failed to read file: {str(e)}"},
                 "error": f"Failed to read file: {str(e)}"
             logger.error(f"Failed to read file {file.filename}: {str(e)}, {log_memory_usage()}")
             continue
         raw_text = ""
         if file_hash in raw_text_cache:
             raw_text = raw_text_cache[file_hash]
             logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
         else:
             if file_ext == '.pdf':
                 try:
                     extract_start_time = time.time()
                     reader = PdfReader(file_stream)
                 except Exception as e:
                     logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                 if not raw_text.strip():
                     try:
                         convert_start_time = time.time()
+                        images = convert_from_bytes(file_bytes, dpi=150) # Use 150 dpi as a balance
                         logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")
+                        ocr_tasks = [process_pdf_page(img, i) for i, img in enumerate(images)]
+                        page_texts = await asyncio.gather(*ocr_tasks)
                         raw_text = "".join(page_texts)
+                        logger.info(f"Total OCR for {file.filename}, text length: {len(raw_text)}, {log_memory_usage()}")
                     except Exception as e:
                         fail_count += 1
+                        output_data["data"].append({
                             "filename": file.filename,
                             "structured_data": {"error": f"OCR failed: {str(e)}"},
                             "error": f"OCR failed: {str(e)}"
                         })
                         logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                         continue
+            else:
                 try:
                     raw_text = await process_image(file_bytes, file.filename, 0)
+                    logger.info(f"Image OCR for {file.filename}, text length: {len(raw_text)}, {log_memory_usage()}")
                 except Exception as e:
                     fail_count += 1
+                    output_data["data"].append({
                         "filename": file.filename,
                         "structured_data": {"error": f"Image OCR failed: {str(e)}"},
                         "error": f"Image OCR failed: {str(e)}"
                     })
                     logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                     continue
+            if raw_text:
                 raw_text = unicodedata.normalize('NFKC', raw_text)
                 raw_text_cache[file_hash] = raw_text
         structured_data = await process_with_gemini(file.filename, raw_text)
+        if "error" not in structured_data:
+            success_count += 1
+        else:
+            fail_count += 1
+        output_data["data"].append({
             "filename": file.filename,
             "structured_data": structured_data,
+            "error": structured_data.get("error", "")
         })
         logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")
+    output_data["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
     if fail_count > 0 and success_count == 0:
+        output_data["success"] = False
     logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
+    # --- START OF MODIFICATIONS ---
+    # 3. Use the custom encoder when returning the final JSON response.
+    # This ensures the Decimal objects are converted to strings correctly.
+    # The default JSONResponse will not handle Decimal types properly.
+    encoded_data = custom_encoder(output_data)
+    return JSONResponse(content=encoded_data)
+    # --- END OF MODIFICATIONS ---