import base64
import json
import logging
import os
from typing import Any, Dict

import cv2
import requests

from .base import BaseVLM


class GroqVLMEngine(BaseVLM):
    """Vision-language extraction engine that calls Groq's chat-completions API.

    The API key is read from the ``GROQ_API_KEY`` environment variable when
    the engine is constructed. If the key is missing, a warning is logged and
    every extraction call returns an empty dict instead of contacting the API.
    """

    def __init__(self, model: str = "meta-llama/llama-4-scout-17b-16e-instruct") -> None:
        """Store the endpoint configuration and look up the API key.

        Args:
            model: Model identifier sent in the ``model`` field of each request.
        """
        self.api_key = os.getenv("GROQ_API_KEY")
        self.url = "https://api.groq.com/openai/v1/chat/completions"
        self.model = model
        if not self.api_key:
            logging.warning("GROQ_API_KEY missing from environment. VLM extraction will be skipped.")

    def image_to_base64(self, image_path: str) -> str:
        """Load an image, re-encode it as JPEG and return it base64-encoded.

        Args:
            image_path: Path of the image file to read with ``cv2.imread``.

        Returns:
            The base64 text of the JPEG bytes, or ``""`` if the image could
            not be read or encoded. Failures are logged, never raised.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                logging.error(f"VLM: Image not found at {image_path}")
                return ""

            # imencode reports failure through its first return value; without
            # this check an unusable buffer could be encoded and sent upstream.
            encoded_ok, buffer = cv2.imencode(".jpg", img)
            if not encoded_ok:
                logging.error(f"VLM: Failed to encode image as JPEG: {image_path}")
                return ""

            return base64.b64encode(buffer).decode("utf-8")
        except Exception as e:
            # Best-effort boundary: callers treat "" as "no image available".
            logging.error(f"VLM: Error converting image to base64: {e}")
            return ""

    def _build_payload(self, prompt: str, base64_image: str) -> Dict[str, Any]:
        """Build the chat-completions request body for one image and prompt."""
        system_message = {
            "role": "system",
            "content": (
                "You are a strict information extraction engine for business cards. "
                "Return only valid JSON. Do not include any other text."
            ),
        }
        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        # The image is inlined as a data URL.
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    },
                },
            ],
        }
        return {
            "model": self.model,
            "messages": [system_message, user_message],
            "response_format": {"type": "json_object"},
            "temperature": 0.1,
        }

    def extract_structured_data(self, image_path: str, prompt: str) -> Dict[str, Any]:
        """Send the image and prompt to the model and parse its JSON reply.

        Args:
            image_path: Path of the image to analyse.
            prompt: Instruction text sent alongside the image.

        Returns:
            The JSON object decoded from the model's reply. An empty dict is
            returned when the API key is missing, the image cannot be
            encoded, the HTTP status is not 200, the request times out, the
            reply cannot be parsed, or the reply is not a JSON object.
        """
        if not self.api_key:
            return {}

        image_name = os.path.basename(image_path)
        logging.info(f"VLM: Starting extraction for {image_name} using {self.model}")

        base64_image = self.image_to_base64(image_path)
        if not base64_image:
            return {}

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        payload = self._build_payload(prompt, base64_image)

        try:
            resp = requests.post(self.url, headers=headers, json=payload, timeout=60)
            if resp.status_code != 200:
                logging.error(f"VLM API Error: {resp.status_code} - {resp.text}")
                return {}

            content = resp.json()["choices"][0]["message"]["content"]
            data = json.loads(content)
        except requests.exceptions.Timeout:
            logging.error("VLM: Request timed out.")
            return {}
        except Exception as e:
            # Best-effort boundary: network, response-shape and JSON errors
            # are all reported and turned into an empty result.
            logging.error(f"VLM: Unexpected error: {e}")
            return {}

        # json.loads can legally yield a list, string, number or None; callers
        # are promised a dict, so anything else is treated as a failed extraction.
        if not isinstance(data, dict):
            logging.error(f"VLM: Expected a JSON object but got {type(data).__name__}")
            return {}

        logging.info(f"VLM: Successfully extracted structured data from {image_name}")
        return data

    def process(self, image_path: str) -> Dict[str, Any]:
        """Extract business-card fields from the image using the default prompt.

        Args:
            image_path: Path of the business-card image.

        Returns:
            The dict produced by ``extract_structured_data``; empty on failure.
        """
        prompt = (
            " Extract structured text from this business card and return ONLY valid JSON."
            " Fields: Name, Designation, Company, Contact, Address, Email, Link."
            " Every value must be a JSON array. If not found, use []. "
        )
        return self.extract_structured_data(image_path, prompt)