# Scraped page header (not part of the module): Spaces: Running | File size: 3,442 Bytes | fad436e
import os
import base64
import json
import logging
import requests
import cv2
from typing import Dict, Any
from .base import BaseVLM
class GroqVLMEngine(BaseVLM):
    """Vision-language extraction engine that calls Groq's chat-completions API.

    An image is read from disk, re-encoded as JPEG, base64-embedded in a
    data URL and posted together with a text prompt. The model is asked to
    answer in JSON-object mode; the decoded object is returned to the caller.

    Every failure path (missing API key, unreadable image, HTTP error,
    timeout, malformed response) is logged and yields an empty dict, so
    callers never have to handle exceptions from this class.
    """

    # Instruction sent as the system message with every request.
    _SYSTEM_PROMPT = (
        "You are a strict information extraction engine for business cards. "
        "Return only valid JSON. Do not include any other text."
    )
    # Upper bound, in seconds, passed to requests.post(timeout=...).
    _TIMEOUT_SECONDS = 60

    def __init__(self, model="meta-llama/llama-4-scout-17b-16e-instruct"):
        """Store the endpoint configuration and read the API key.

        Args:
            model: Model identifier sent in the request payload.

        The key is read from the ``GROQ_API_KEY`` environment variable. A
        missing key is not an error here: a warning is logged and later
        extraction calls return ``{}``.
        """
        # NOTE(review): BaseVLM.__init__ is not invoked here - confirm the
        # base class needs no initialisation of its own.
        self.api_key = os.getenv("GROQ_API_KEY")
        self.url = "https://api.groq.com/openai/v1/chat/completions"
        self.model = model
        if not self.api_key:
            logging.warning("GROQ_API_KEY missing from environment. VLM extraction will be skipped.")

    def image_to_base64(self, image_path: str) -> str:
        """Return the image at *image_path* as a base64-encoded JPEG string.

        Args:
            image_path: Path of the image file to load with OpenCV.

        Returns:
            The base64 text of the JPEG-encoded image, or ``""`` when the
            image cannot be read or encoded (the reason is logged).
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                logging.error(f"VLM: Image not found at {image_path}")
                return ""
            # imencode reports failure through its first return value; do not
            # ship a buffer that was never successfully written.
            ok, buffer = cv2.imencode(".jpg", img)
            if not ok:
                logging.error(f"VLM: Failed to JPEG-encode image at {image_path}")
                return ""
            return base64.b64encode(buffer).decode("utf-8")
        except Exception as e:
            # Deliberate best-effort boundary: any failure means "no image".
            logging.error(f"VLM: Error converting image to base64: {e}")
            return ""

    def _build_payload(self, prompt: str, base64_image: str) -> Dict[str, Any]:
        """Assemble the chat-completions request body for one image and prompt."""
        return {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": self._SYSTEM_PROMPT,
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            },
                        },
                    ],
                },
            ],
            "response_format": {"type": "json_object"},
            "temperature": 0.1,
        }

    def extract_structured_data(self, image_path: str, prompt: str) -> Dict[str, Any]:
        """Send the image and *prompt* to the model and return the decoded JSON.

        Args:
            image_path: Path of the image to analyse.
            prompt: User instruction sent alongside the image.

        Returns:
            The JSON object produced by the model, or ``{}`` when the API key
            is missing, the image cannot be encoded, the request fails or
            times out, or the response is not a JSON object.
        """
        if not self.api_key:
            return {}

        logging.info(f"VLM: Starting extraction for {os.path.basename(image_path)} using {self.model}")
        base64_image = self.image_to_base64(image_path)
        if not base64_image:
            return {}

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        payload = self._build_payload(prompt, base64_image)

        try:
            resp = requests.post(
                self.url,
                headers=headers,
                json=payload,
                timeout=self._TIMEOUT_SECONDS,
            )
            if resp.status_code != 200:
                logging.error(f"VLM API Error: {resp.status_code} - {resp.text}")
                return {}
            content = resp.json()["choices"][0]["message"]["content"]
            data = json.loads(content)
        except requests.exceptions.Timeout:
            logging.error("VLM: Request timed out.")
            return {}
        except Exception as e:
            # Deliberate best-effort boundary: network errors and malformed
            # response envelopes all degrade to "nothing extracted".
            logging.error(f"VLM: Unexpected error: {e}")
            return {}

        # The declared return type is a dict; reject valid JSON of any other
        # shape (list, string, number, null) instead of leaking it to callers.
        if not isinstance(data, dict):
            logging.error(f"VLM: Expected a JSON object but got {type(data).__name__}")
            return {}

        logging.info(f"VLM: Successfully extracted structured data from {os.path.basename(image_path)}")
        return data

    def process(self, image_path: str) -> Dict[str, Any]:
        """Extract business-card fields from the image at *image_path*.

        Uses a fixed prompt requesting the fields Name, Designation, Company,
        Contact, Address, Email and Link, each as a JSON array.

        Returns:
            Whatever ``extract_structured_data`` returns for that prompt
            (``{}`` on any failure).
        """
        prompt = (
            "\n"
            "Extract structured text from this business card and return ONLY valid JSON.\n"
            "Fields: Name, Designation, Company, Contact, Address, Email, Link.\n"
            "Every value must be a JSON array. If not found, use [].\n"
        )
        return self.extract_structured_data(image_path, prompt)
|