# (scrape artifact: Hugging Face Spaces status banner — "Spaces: Running")
| """ | |
| OCR Service - Document Text Extraction via OpenRouter Vision Models | |
| Handles OCR for images and scanned documents using vision-capable models with fallback | |
| """ | |
| import requests | |
| import base64 | |
| from pathlib import Path | |
| from config import Config | |
class OCRService:
    """
    OCR via OpenRouter vision models with an ordered fallback chain.

    Images are sent base64-encoded (as data URLs) to vision-capable chat
    models with a strict transcription prompt. PDFs are rasterized page by
    page with PyMuPDF and each page image is OCR'd individually.
    """

    def __init__(self):
        self.api_key = Config.OPENROUTER_API_KEY
        self.base_url = Config.OPENROUTER_BASE_URL
        # Vision-capable models for OCR, tried in this order until one
        # succeeds. Only models that support image/vision input qualify.
        self.ocr_models = [
            "google/gemma-3-27b-it:free",   # Primary - largest Gemma 3
            "google/gemma-3-12b-it:free",   # Fallback 1
            "google/gemma-3-4b-it:free",    # Fallback 2
            "google/gemma-3n-e4b-it:free",  # Fallback 3
            "google/gemma-3n-e2b-it:free",  # Fallback 4 - smallest
        ]

    def _encode_image(self, image_path: str) -> str:
        """Read the file at ``image_path`` and return its base64 encoding."""
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def _get_mime_type(self, file_path: str) -> str:
        """Map a file extension to a MIME type (defaults to image/png)."""
        ext = Path(file_path).suffix.lower()
        mime_types = {
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.gif': 'image/gif',
            '.webp': 'image/webp',
            '.pdf': 'application/pdf'
        }
        return mime_types.get(ext, 'image/png')

    def _call_ocr_model(self, image_data: str, mime_type: str, model: str = None) -> dict:
        """
        Call a single OpenRouter vision model for OCR.

        Args:
            image_data: Base64-encoded image bytes.
            mime_type: MIME type embedded in the data URL.
            model: Model slug; defaults to the primary entry of self.ocr_models.

        Returns:
            dict with "success" plus either "text"/"model" on success or
            "error" on failure. Never raises; all errors are captured.
        """
        if not self.api_key:
            return {"success": False, "error": "OpenRouter API key not configured"}
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://notebooklm-fast.hf.space",
            "X-Title": "NotebookLM Fast OCR"
        }
        # OCR prompt for thorough text extraction
        # NOTE: fixed "extracted text, explanations" -> "no explanations";
        # the original wording contradicted the prompt's intent.
        ocr_prompt = """You are a precise OCR system. Extract EVERY SINGLE piece of text from this image/document with 100% accuracy.
CRITICAL INSTRUCTIONS:
1. Extract ALL text - do not skip or miss ANY section, heading, paragraph, or text block
2. Include ALL sections (e.g., Education, Experience, Skills, Contact, Summary, Projects, etc.)
3. Preserve the exact structure and hierarchy of the document
4. Include all names, dates, numbers, addresses, phone numbers, emails, URLs
5. Include text from headers, footers, sidebars, and any text boxes
6. For tables, use markdown table format with all rows and columns
7. For bullet points and lists, preserve the list structure
8. Include any small text, footnotes, or captions
OUTPUT FORMAT:
- Return ONLY the extracted text, no explanations
- Maintain the original reading order (top to bottom, left to right)
- Use markdown formatting for structure (headers, lists, tables)
- Separate sections clearly with line breaks
IMPORTANT: Do not summarize or paraphrase. Extract the EXACT text as it appears."""
        payload = {
            "model": model or self.ocr_models[0],
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_data}"
                            }
                        },
                        {
                            "type": "text",
                            "text": ocr_prompt
                        }
                    ]
                }
            ],
            "max_tokens": 4096,
            "temperature": 0.1  # Low temperature for accurate extraction
        }
        try:
            response = requests.post(
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=payload,
                timeout=120  # Longer timeout for OCR
            )
            if response.status_code == 200:
                data = response.json()
                text = data.get('choices', [{}])[0].get('message', {}).get('content', '')
                if text:
                    return {"success": True, "text": text, "model": model or self.ocr_models[0]}
                else:
                    return {"success": False, "error": "No text extracted from response"}
            else:
                return {
                    "success": False,
                    "error": f"OpenRouter API error: {response.status_code} - {response.text}"
                }
        except requests.exceptions.Timeout:
            return {"success": False, "error": "Request timed out. Please try again."}
        except Exception as e:
            return {"success": False, "error": str(e)}

    def _call_ocr_with_fallback(self, image_data: str, mime_type: str) -> dict:
        """Try each model in self.ocr_models in order; return first success."""
        last_error = None
        for model in self.ocr_models:
            print(f"Attempting OCR with {model}...")
            result = self._call_ocr_model(image_data, mime_type, model)
            if result['success']:
                print(f"OCR successful with {model}")
                return result
            else:
                last_error = result.get('error', 'Unknown error')
                print(f"OCR failed with {model}: {last_error}")
                continue
        return {"success": False, "error": f"All OCR models failed. Last error: {last_error}"}

    def extract_text_from_pdf(self, pdf_path: str) -> dict:
        """
        Extract text from an entire PDF using OpenRouter vision models.

        Each page is rendered to a PNG at 2x zoom and OCR'd with the
        fallback chain; per-page failures are recorded inline instead of
        aborting the whole document.
        """
        import fitz  # PyMuPDF
        try:
            doc = fitz.open(pdf_path)
            try:
                total_pages = len(doc)
                print(f"Processing {total_pages} page PDF with OpenRouter vision OCR...")
                all_text = []
                for page_num in range(total_pages):
                    print(f"Processing page {page_num + 1}/{total_pages}...")
                    page = doc.load_page(page_num)
                    # Render page to image at good resolution for OCR
                    mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
                    pix = page.get_pixmap(matrix=mat)
                    img_data = pix.tobytes("png")
                    # Encode to base64
                    image_base64 = base64.b64encode(img_data).decode('utf-8')
                    # OCR the page with fallback
                    result = self._call_ocr_with_fallback(image_base64, 'image/png')
                    if result['success']:
                        all_text.append(f"--- Page {page_num + 1} ---\n{result['text']}")
                    else:
                        all_text.append(f"--- Page {page_num + 1} ---\n[Error extracting text: {result['error']}]")
            finally:
                # Always release the document handle, even if rendering
                # or OCR of a page raises (original leaked it on error).
                doc.close()
            combined_text = "\n\n".join(all_text)
            return {
                "success": True,
                "text": combined_text,
                "model": "OpenRouter Vision OCR"
            }
        except Exception as e:
            return {"success": False, "error": f"Error processing PDF: {str(e)}"}

    def _process_pdf_in_batches(self, pdf_path: str, total_pages: int) -> dict:
        """Split PDF into chunks and process sequentially - kept for compatibility"""
        return self.extract_text_from_pdf(pdf_path)

    def _send_pdf_to_api(self, pdf_path: str) -> dict:
        """Process PDF by converting to images - OpenRouter doesn't have native PDF support"""
        return self.extract_text_from_pdf(pdf_path)

    def extract_text(self, image_path: str) -> dict:
        """
        Extract text from an image file using the vision-model fallback chain.

        Returns the result dict from _call_ocr_with_fallback unchanged.
        """
        image_data = self._encode_image(image_path)
        mime_type = self._get_mime_type(image_path)
        print("Attempting OCR with OpenRouter vision models...")
        result = self._call_ocr_with_fallback(image_data, mime_type)
        if result['success']:
            print(f"OCR successful with {result.get('model', 'OpenRouter')}")
        else:
            print(f"OCR failed: {result['error']}")
        return result

    def extract_text_from_pdf_page(self, page_image_data: bytes,
                                   page_num: int) -> dict:
        """OCR a single pre-rendered PDF page given as raw PNG bytes."""
        image_data = base64.b64encode(page_image_data).decode('utf-8')
        print(f"Extracting text from PDF page {page_num} with OpenRouter vision OCR...")
        result = self._call_ocr_with_fallback(image_data, 'image/png')
        return result
# Singleton instance, created at import time (reads OpenRouter settings
# from Config via OCRService.__init__).
ocr_service = OCRService()