Spaces:
Sleeping
Sleeping
| """ | |
| Advanced OCR with Gemini AI for 100% Accurate Receipt Parsing | |
| Combines Google Document AI OCR + Gemini AI interpretation for maximum accuracy | |
| """ | |
| import json | |
| import os | |
| from pathlib import Path | |
| from typing import Dict, Any, Optional | |
| import google.generativeai as genai | |
| from google.api_core.client_options import ClientOptions | |
| from google.cloud import documentai_v1 as documentai | |
| from google.oauth2 import service_account | |
| from dotenv import load_dotenv | |
# Merge settings from a local .env file (if one exists) into the environment
# before any of the lookups below run.
load_dotenv()

# Document AI processor coordinates and the Gemini API key.
PROJECT_ID, LOCATION, PROCESSOR_ID, GEMINI_API_KEY = (
    os.getenv(name)
    for name in ("PROJECT_ID", "LOCATION", "PROCESSOR_ID", "GEMINI_API_KEY")
)

# Google Cloud service-account credentials (for Hugging Face deployment).
# Optional: it is not part of the required-variable check below.
GOOGLE_CREDS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

# Fail at import time when any required setting is unset or empty.
if not (PROJECT_ID and LOCATION and PROCESSOR_ID and GEMINI_API_KEY):
    raise ValueError(
        "Missing required environment variables. "
        "Please ensure PROJECT_ID, LOCATION, PROCESSOR_ID, and GEMINI_API_KEY are set in .env file"
    )
class EnhancedReceiptOCR:
    """
    Receipt parser that chains Google Document AI OCR with Gemini AI.

    Document AI extracts the raw text of a receipt image/PDF. Gemini is then
    prompted with that text (plus the image itself when it can be opened) and
    asked to return the receipt as a structured JSON object.
    """

    # File extension -> MIME type sent to Document AI.
    _MIME_TYPES = {
        '.pdf': 'application/pdf',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.tiff': 'image/tiff',
        '.tif': 'image/tiff',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.webp': 'image/webp'
    }

    def __init__(self, project_id: str, location: str, processor_id: str, gemini_api_key: str):
        """
        Create the Document AI client and configure the Gemini model.

        Credentials come from the module-level GOOGLE_CREDS value, which may be
        either inline service-account JSON or a path to a key file. When it is
        unset or unusable, the client is built without explicit credentials.

        Args:
            project_id: Google Cloud project ID
            location: Document AI processor location (also selects the endpoint)
            processor_id: Document AI processor ID
            gemini_api_key: API key passed to google.generativeai
        """
        self.project_id = project_id
        self.location = location
        self.processor_id = processor_id
        self.processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
        # Whether the most recent Gemini request actually carried the image.
        self._last_request_included_image = False

        # Initialize Document AI client with credentials handling
        opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

        # Handle credentials for Hugging Face deployment
        credentials = None
        if GOOGLE_CREDS:
            # Check if it's a file path or JSON content
            if GOOGLE_CREDS.strip().startswith('{'):
                # It's JSON content (Hugging Face secret)
                try:
                    creds_dict = json.loads(GOOGLE_CREDS)
                    credentials = service_account.Credentials.from_service_account_info(creds_dict)
                    print("✓ Using Google Cloud credentials from JSON content")
                except json.JSONDecodeError:
                    print("⚠ Warning: Failed to parse GOOGLE_APPLICATION_CREDENTIALS as JSON")
            elif Path(GOOGLE_CREDS).exists():
                # It's a file path (local development)
                credentials = service_account.Credentials.from_service_account_file(GOOGLE_CREDS)
                print(f"✓ Using Google Cloud credentials from file: {GOOGLE_CREDS}")
            else:
                print(f"⚠ Warning: Credentials file not found: {GOOGLE_CREDS}")

        # Initialize Document AI client
        if credentials:
            self.docai_client = documentai.DocumentProcessorServiceClient(
                client_options=opts,
                credentials=credentials
            )
        else:
            # Fall back to default credentials (uses GOOGLE_APPLICATION_CREDENTIALS env var)
            self.docai_client = documentai.DocumentProcessorServiceClient(client_options=opts)

        # Initialize Gemini AI
        genai.configure(api_key=gemini_api_key)
        self.gemini_model = genai.GenerativeModel('gemini-2.5-flash-lite')

    def process_with_document_ai(self, file_path: str, mime_type: Optional[str] = None) -> str:
        """
        Extract raw text using Google Document AI

        Args:
            file_path: Path to the receipt image/PDF
            mime_type: MIME type (detected from the file extension if None)

        Returns:
            Raw extracted text
        """
        if mime_type is None:
            mime_type = self._detect_mime_type(file_path)

        with open(file_path, "rb") as image:
            image_content = image.read()

        raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)
        request = documentai.ProcessRequest(
            name=self.processor_name,
            raw_document=raw_document
        )
        result = self.docai_client.process_document(request=request)
        return result.document.text

    def _load_image(self, file_path: Optional[str]):
        """Open file_path as a PIL image; return None when missing or unreadable."""
        if not file_path or not Path(file_path).exists():
            return None
        try:
            # Imported lazily so PIL is only needed when an image is actually sent
            from PIL import Image
            import io

            with open(file_path, 'rb') as f:
                image_bytes = f.read()
            return Image.open(io.BytesIO(image_bytes))
        except Exception as e:
            # Best effort: anything PIL cannot open (e.g. a PDF) means text-only
            print(f"Warning: Could not load image, falling back to text-only: {e}")
            return None

    def process_with_gemini_vision(self, file_path: str, raw_text: Optional[str] = None) -> Dict[str, Any]:
        """
        Use Gemini AI to analyze receipt with both image and text
        (Image + Text mode for maximum accuracy)

        The request falls back to text-only when the image is missing, cannot
        be opened, or the image request itself fails.

        Args:
            file_path: Path to the receipt image
            raw_text: Raw OCR text from Document AI

        Returns:
            Structured receipt data as dictionary (or a dictionary with an
            "error" key when the reply could not be parsed)
        """
        prompt = self._create_gemini_prompt(raw_text)
        self._last_request_included_image = False

        response = None
        image = self._load_image(file_path)
        if image is not None:
            try:
                # Generate response with IMAGE + TEXT
                response = self.gemini_model.generate_content([prompt, image])
                self._last_request_included_image = True
            except Exception as e:
                # Keep the original best-effort behavior, but report the real
                # cause instead of blaming image loading.
                print(f"Warning: Gemini request with image failed, falling back to text-only: {e}")

        if response is None:
            # Text-only fallback
            response = self.gemini_model.generate_content(prompt)

        # Parse JSON from response
        return self._parse_gemini_response(response.text)

    def _create_gemini_prompt(self, raw_text: Optional[str] = None) -> str:
        """Create a detailed prompt for Gemini to extract receipt data"""
        prompt = """You are an expert at analyzing receipts and bills with 100% accuracy.
Analyze the receipt image AND the OCR text below to extract ALL information into a perfectly structured JSON format.
CRITICAL REQUIREMENTS:
1. Extract EVERY single item purchased - do not miss any items
2. For each item, extract: description, quantity, unit_price, and total_price
3. Do not confuse other numbers (like barcodes, phone numbers) with items
4. Extract merchant name, address, date, and time
5. Extract all financial details: subtotal, tax, discounts, total
6. If there are promotions or discounts, include them
7. Identify the currency used
8. Extract payment method if available
9. Use BOTH the image and the text below for maximum accuracy
"""
        if raw_text:
            prompt += f"""
RECEIPT TEXT (from OCR):
{raw_text}
Analyze BOTH the image and the text above carefully to ensure 100% accuracy.
Cross-reference the visual information with the OCR text.
"""
        else:
            prompt += """
RECEIPT TEXT (from OCR):
(No text provided - analyze the image only)
"""
        prompt += """
OUTPUT FORMAT (strict JSON, no markdown, no explanation):
{
"merchant_name": "name of store/restaurant/cafe",
"merchant_address": "full address",
"date": "YYYY-MM-DD",
"time": "HH:MM:SS",
"items": [
{
"description": "item name/description",
"quantity": 1,
"unit_price": 0.00,
"total_price": 0.00,
"notes": "any promotion or special note"
}
],
"subtotal": 0.00,
"tax": 0.00,
"tax_rate": "X%",
"discount": 0.00,
"total": 0.00,
"payment_method": "cash/card/etc",
"currency": "USD/EUR/SGD/etc",
"receipt_number": "receipt/invoice number",
"additional_info": {
"any": "other relevant information"
}
}
Return ONLY the JSON object, nothing else. Ensure all numbers are properly formatted (use . for decimals, not ,).
Convert any comma decimal separators (like 3,00) to period format (3.00).
"""
        return prompt

    def _parse_gemini_response(self, response_text: str) -> Dict[str, Any]:
        """
        Parse the JSON object contained in Gemini's reply.

        Markdown code fences are stripped first. If the remaining text is not
        valid JSON, the span from the first "{" to the last "}" is tried, so a
        reply with prose around the object still parses.

        Args:
            response_text: Raw text of the Gemini response

        Returns:
            The decoded JSON object, or a dictionary with "error" and
            "raw_response" keys when no JSON object could be decoded
        """
        response_text = (response_text or "").strip()

        # Remove ```json and ``` if present
        if response_text.startswith("```json"):
            response_text = response_text[7:]
        elif response_text.startswith("```"):
            response_text = response_text[3:]
        if response_text.endswith("```"):
            response_text = response_text[:-3]
        response_text = response_text.strip()

        try:
            parsed = json.loads(response_text)
        except json.JSONDecodeError as e:
            parsed = None
            # Second chance: isolate the outermost {...} span
            start = response_text.find("{")
            end = response_text.rfind("}")
            if 0 <= start < end:
                try:
                    parsed = json.loads(response_text[start:end + 1])
                except json.JSONDecodeError:
                    parsed = None
            if parsed is None:
                print(f"Error parsing Gemini response: {e}")
                print(f"Response text: {response_text[:500]}")
                return {"error": "Failed to parse Gemini response", "raw_response": response_text}

        if not isinstance(parsed, dict):
            # Callers treat the result as a mapping; a bare list/number/string
            # would break them later, so report it as a parse failure here.
            print(f"Response text: {response_text[:500]}")
            return {"error": "Gemini response was not a JSON object", "raw_response": response_text}
        return parsed

    def _detect_mime_type(self, file_path: str) -> str:
        """Detect MIME type from file extension (case-insensitive)"""
        extension = Path(file_path).suffix.lower()
        return self._MIME_TYPES.get(extension, 'application/octet-stream')

    def process_receipt(
        self,
        file_path: str,
        output_json_path: Optional[str] = None,
        save_json: bool = True
    ) -> Dict[str, Any]:
        """
        Complete pipeline: Document AI OCR + Gemini AI interpretation

        Args:
            file_path: Path to receipt image/PDF
            output_json_path: Optional path to save JSON output
            save_json: Whether to save JSON output (set False for API usage)

        Returns:
            Structured receipt data. On success it also carries a
            "_processing_metadata" entry with the raw OCR text, its length and
            whether the Gemini request included the image.
        """
        print(f"Processing receipt: {file_path}")

        # Step 1: Extract text with Document AI
        print("Step 1: Extracting text with Document AI...")
        raw_text = self.process_with_document_ai(file_path)
        print(f"Document AI extracted {len(raw_text)} characters")

        # Step 2: Analyze with Gemini AI
        print("Step 2: Analyzing with Gemini AI for perfect interpretation...")
        receipt_data = self.process_with_gemini_vision(file_path, raw_text)

        # Add processing metadata for cost tracking
        if "error" not in receipt_data:
            # "items" may be missing or null in the model's reply
            item_count = len(receipt_data.get("items") or [])
            print(f"✓ Extraction complete! Found {item_count} items.")
            # Add metadata (used for cost calculation)
            receipt_data["_processing_metadata"] = {
                "raw_text_length": len(raw_text),
                "raw_text": raw_text,
                # Reflects what was actually sent, not what was intended
                "includes_image": self._last_request_included_image
            }
        else:
            print("⚠ Extraction encountered issues.")

        # Save to JSON only if requested
        if save_json:
            # Auto-generate output filename if not provided
            if not output_json_path:
                file_stem = Path(file_path).stem
                # Use /tmp on Hugging Face, current directory otherwise
                if os.path.exists("/app"):
                    output_json_path = f"/tmp/{file_stem}_receipt.json"
                else:
                    output_json_path = f"{file_stem}_receipt.json"
            try:
                with open(output_json_path, 'w', encoding='utf-8') as f:
                    json.dump(receipt_data, f, indent=2, ensure_ascii=False)
                print(f"✓ Results saved to: {output_json_path}")
            except (PermissionError, OSError) as e:
                # Saving is a convenience; the parsed data is still returned
                print(f"⚠ Warning: Could not save JSON file: {e}")

        return receipt_data
def main():
    """
    Command-line entry point: process one receipt and print the result.

    Usage: python ocr_with_gemini.py <path_to_receipt_image> [output.json]

    Exits with status 1 when no input path is given or when processing fails.
    """
    import sys

    if len(sys.argv) < 2:
        print("Usage: python ocr_with_gemini.py <path_to_receipt_image> [output.json]")
        print("Example: python ocr_with_gemini.py receipt.jpg")
        print(" python ocr_with_gemini.py receipt.jpg custom_output.json")
        print("\nNote: If output file is not specified, it will auto-generate as <filename>_receipt.json")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else None

    # Initialize enhanced OCR processor
    ocr = EnhancedReceiptOCR(
        project_id=PROJECT_ID,
        location=LOCATION,
        processor_id=PROCESSOR_ID,
        gemini_api_key=GEMINI_API_KEY
    )

    try:
        # Process receipt (auto-saves JSON)
        receipt_data = ocr.process_receipt(input_file, output_json_path=output_file)

        # Display results
        print("\n" + "="*70)
        # Banner names the model the processor is actually configured with
        print("ENHANCED OCR RESULTS (Document AI + Gemini 2.5 Flash-Lite)")
        print("="*70)
        print(json.dumps(receipt_data, indent=2, ensure_ascii=False))
    except Exception as e:
        # Top-level boundary: report, show the traceback, and signal failure
        print(f"Error processing receipt: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()