Spaces:
Running
Running
| import os | |
| import base64 | |
| import json | |
| import logging | |
| import requests | |
| import cv2 | |
| from typing import Dict, Any | |
| from .base import BaseVLM | |
class GroqVLMEngine(BaseVLM):
    """Business-card field extractor that sends an image to Groq's chat-completions endpoint.

    The API key is read from the ``GROQ_API_KEY`` environment variable at
    construction time. Every public method is best-effort: failures are
    logged and reported as an empty result (``""`` or ``{}``) rather than
    raised.
    """

    # Seconds to wait for the HTTP request before giving up. Override on a
    # subclass or an instance to tune it without touching call sites.
    request_timeout = 60

    def __init__(self, model="meta-llama/llama-4-scout-17b-16e-instruct"):
        """Store endpoint configuration and warn if no API key is available.

        Args:
            model: Model identifier sent in the request payload.
        """
        # NOTE(review): BaseVLM.__init__ is not invoked here - confirm the
        # base class needs no initialisation of its own.
        self.api_key = os.getenv("GROQ_API_KEY")
        self.url = "https://api.groq.com/openai/v1/chat/completions"
        self.model = model
        if not self.api_key:
            logging.warning("GROQ_API_KEY missing from environment. VLM extraction will be skipped.")

    def image_to_base64(self, image_path: str) -> str:
        """Load an image, re-encode it as JPEG and return it base64-encoded.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The base64 text of the JPEG bytes, or ``""`` if the image could
            not be read or encoded.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                logging.error("VLM: Image not found at %s", image_path)
                return ""
            # imencode reports failure through its first return value rather
            # than by raising, so it has to be checked explicitly.
            encoded_ok, buffer = cv2.imencode(".jpg", img)
            if not encoded_ok:
                logging.error("VLM: Failed to encode %s as JPEG", image_path)
                return ""
            return base64.b64encode(buffer).decode("utf-8")
        except Exception as e:
            logging.error("VLM: Error converting image to base64: %s", e)
            return ""

    def extract_structured_data(self, image_path: str, prompt: str) -> Dict[str, Any]:
        """Send the image and prompt to the model and return the parsed JSON object.

        Args:
            image_path: Path of the image to analyse.
            prompt: User instruction sent alongside the image.

        Returns:
            The JSON object produced by the model, or ``{}`` when the API key
            is missing, the image cannot be prepared, the request fails, or
            the reply is not a JSON object.
        """
        if not self.api_key:
            return {}

        image_name = os.path.basename(image_path)
        logging.info("VLM: Starting extraction for %s using %s", image_name, self.model)

        base64_image = self.image_to_base64(image_path)
        if not base64_image:
            return {}

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a strict information extraction engine for business cards. Return only valid JSON. Do not include any other text.",
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                            },
                        },
                    ],
                },
            ],
            "response_format": {"type": "json_object"},
            "temperature": 0.1,
        }

        try:
            resp = requests.post(
                self.url,
                headers=headers,
                json=payload,
                timeout=self.request_timeout,
            )
            if resp.status_code != 200:
                logging.error("VLM API Error: %s - %s", resp.status_code, resp.text)
                return {}
            content = resp.json()["choices"][0]["message"]["content"]
            data = json.loads(content)
        except requests.exceptions.Timeout:
            logging.error("VLM: Request timed out.")
            return {}
        except Exception as e:
            # Deliberate catch-all: this is the best-effort boundary, so
            # connection errors, malformed envelopes and invalid JSON all
            # degrade to an empty result instead of propagating.
            logging.error("VLM: Unexpected error: %s", e)
            return {}

        # json.loads happily returns lists, strings, numbers or None; callers
        # are promised a dict, so anything else is treated as a failure.
        if not isinstance(data, dict):
            logging.error(
                "VLM: Expected a JSON object from the model but got %s",
                type(data).__name__,
            )
            return {}

        logging.info("VLM: Successfully extracted structured data from %s", image_name)
        return data

    def process(self, image_path: str) -> Dict[str, Any]:
        """Extract the standard business-card fields from an image.

        Args:
            image_path: Path of the business-card image.

        Returns:
            The result of ``extract_structured_data`` for the built-in
            business-card prompt (``{}`` on any failure).
        """
        prompt = """
        Extract structured text from this business card and return ONLY valid JSON.
        Fields: Name, Designation, Company, Contact, Address, Email, Link.
        Every value must be a JSON array. If not found, use [].
        """
        return self.extract_structured_data(image_path, prompt)