import os
import base64

import cv2
import numpy as np
import fitz  # PyMuPDF
from dotenv import load_dotenv
from groq import Groq

# Load environment variables
load_dotenv()
API_KEY = os.getenv("GROQ_API_KEY")  # Fetch API key from environment


class OCRProcessor:
    """Helper around the Groq chat-completions client for image/PDF OCR.

    The processor can rasterize PDF pages to PNG files, base64-encode image
    files, and send an encoded image together with a text prompt to a
    vision-capable model.
    """

    # Shortest base64 payload accepted as a plausible image.
    _MIN_ENCODED_LENGTH = 50

    # Leading base64 characters produced by common image file signatures,
    # used to pick the MIME type of the data URL sent to the model.
    _MIME_BY_B64_PREFIX = (
        ("iVBORw0KGgo", "image/png"),   # \x89PNG\r\n\x1a\n
        ("/9j/", "image/jpeg"),         # \xff\xd8\xff
        ("R0lGOD", "image/gif"),        # GIF87a / GIF89a
        ("UklGR", "image/webp"),        # RIFF container (assumed WebP here)
    )
    _DEFAULT_MIME = "image/jpeg"

    def __init__(self, model="llama-3.2-90b-vision-preview"):
        """Create the Groq client.

        Args:
            model: Name of the model used for chat completions.

        Raises:
            ValueError: If the ``GROQ_API_KEY`` environment variable is unset.
        """
        if not API_KEY:
            raise ValueError("GROQ_API_KEY is missing! Please set it as an environment variable.")
        self.model = model
        self.client = Groq(api_key=API_KEY)

    def enhance_image(self, input_path, output_path):
        """Read an image with OpenCV and write it to ``output_path``.

        NOTE: no enhancement filter is applied at the moment; the image is
        decoded and re-encoded as-is (the output format follows the extension
        of ``output_path``).

        Args:
            input_path: Path of the image to read.
            output_path: Path the image is written to.

        Returns:
            ``output_path``.

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
            ValueError: If OpenCV cannot decode the input file.
            OSError: If OpenCV reports that the output could not be written.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"File not found: {input_path}")

        image = cv2.imread(input_path)
        if image is None:
            raise ValueError(f"Could not process image: {input_path}")

        # cv2.imwrite signals failure through its return value, not an
        # exception; surface it instead of returning a path to nothing.
        if not cv2.imwrite(output_path, image):
            raise OSError(f"Failed to save image: {output_path}")
        return output_path

    def convert_pdf_to_images(self, pdf_path, save_dir="./uploads"):
        """Render every page of a PDF to a PNG file.

        Pages are saved as ``page_<n>.png`` (1-based) inside ``save_dir``,
        which is created when missing.

        Args:
            pdf_path: Path of the PDF to convert.
            save_dir: Directory that receives the page images.

        Returns:
            List of the written image paths, in page order.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            Exception: If a page image is missing after saving.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        os.makedirs(save_dir, exist_ok=True)
        image_paths = []

        doc = fitz.open(pdf_path)
        try:
            for page_number, page in enumerate(doc, start=1):
                pixmap = page.get_pixmap()
                image_file = os.path.join(save_dir, f"page_{page_number}.png")
                pixmap.save(image_file)

                if not os.path.exists(image_file):
                    raise Exception(f"Failed to save image: {image_file}")
                image_paths.append(image_file)
        finally:
            # Release the document even when rendering or saving fails.
            doc.close()

        return image_paths

    def encode_image(self, img_path):
        """Return the contents of an image file as a base64 string.

        Args:
            img_path: Path of the image file to encode.

        Returns:
            The base64 text (UTF-8 decoded) of the file's bytes.

        Raises:
            FileNotFoundError: If ``img_path`` does not exist.
            Exception: If reading fails or the encoded data is shorter than
                50 characters; the underlying error is chained as the cause.
        """
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"File not found: {img_path}")

        try:
            with open(img_path, "rb") as img_file:
                encoded_data = base64.b64encode(img_file.read()).decode("utf-8")

            if not encoded_data or len(encoded_data) < self._MIN_ENCODED_LENGTH:
                raise ValueError("Encoded image data is too short, possibly corrupted.")
            return encoded_data
        except Exception as e:
            raise Exception(f"Failed to encode image: {e}") from e

    def _guess_mime_type(self, encoded_img):
        """Infer the image MIME type from the start of its base64 text."""
        for prefix, mime in self._MIME_BY_B64_PREFIX:
            if encoded_img.startswith(prefix):
                return mime
        return self._DEFAULT_MIME

    def extract_text_from_image(self, encoded_img, prompt_text, mime_type=None):
        """Send a base64 image and a prompt to the model.

        Args:
            encoded_img: Base64-encoded image data (at least 50 characters).
            prompt_text: Instruction sent alongside the image.
            mime_type: MIME type used in the data URL. When ``None`` it is
                inferred from the image signature, falling back to
                ``image/jpeg``.

        Returns:
            The ``message`` object of the first completion choice (not a
            plain string).

        Raises:
            ValueError: If ``encoded_img`` is empty or too short.
            Exception: If the API call fails; the original error is chained.
        """
        if not encoded_img or len(encoded_img) < self._MIN_ENCODED_LENGTH:  # Ensures valid base64 string
            raise ValueError("Invalid base64-encoded image data!")

        if mime_type is None:
            mime_type = self._guess_mime_type(encoded_img)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{encoded_img}"},
                    },
                ],
            }
        ]

        try:
            response = self.client.chat.completions.create(model=self.model, messages=messages)
            return response.choices[0].message
        except Exception as err:
            raise Exception(f"OCR extraction failed: {err}") from err