ImageDataExtractor2

Running

App Files Files Community

ImageDataExtractor2 / core / vlm_engine.py

WebashalarForML

Upload 42 files

fad436e verified 4 days ago

raw

history blame contribute delete

3.44 kB

	import os
	import base64
	import json
	import logging
	import requests
	import cv2
	from typing import Dict, Any
	from .base import BaseVLM

	class GroqVLMEngine(BaseVLM):
	    """Vision-language extraction engine backed by Groq's chat completions endpoint.

	    The engine reads an image from disk, re-encodes it as JPEG, sends it
	    together with a text prompt to the configured model and returns the JSON
	    object the model replies with. Every failure path is logged and reported
	    to the caller as an empty dict rather than an exception.
	    """

	    def __init__(self, model="meta-llama/llama-4-scout-17b-16e-instruct"):
	        """Read the API key from ``GROQ_API_KEY`` and remember the model name.

	        Args:
	            model: Identifier of the model to put in the request payload.
	        """
	        self.api_key = os.getenv("GROQ_API_KEY")
	        self.url = "https://api.groq.com/openai/v1/chat/completions"
	        self.model = model
	        if not self.api_key:
	            # Not fatal: extract_structured_data() returns {} when the key is absent.
	            logging.warning("GROQ_API_KEY missing from environment. VLM extraction will be skipped.")

	    def image_to_base64(self, image_path: str) -> str:
	        """Return the image at *image_path* as a base64-encoded JPEG string.

	        Args:
	            image_path: Path of the image file to load with OpenCV.

	        Returns:
	            The base64 text of the JPEG-encoded image, or ``""`` when the
	            image cannot be read or encoded (the reason is logged).
	        """
	        try:
	            img = cv2.imread(image_path)
	            if img is None:
	                logging.error("VLM: Image not found at %s", image_path)
	                return ""

	            # imencode reports failure through its first return value; the
	            # buffer must not be used when that flag is false.
	            encoded_ok, buffer = cv2.imencode(".jpg", img)
	            if not encoded_ok:
	                logging.error("VLM: Failed to encode image as JPEG: %s", image_path)
	                return ""

	            return base64.b64encode(buffer).decode("utf-8")
	        except Exception as e:
	            # Best-effort boundary: callers only ever see "" on failure.
	            logging.error("VLM: Error converting image to base64: %s", e)
	            return ""

	    def extract_structured_data(self, image_path: str, prompt: str, timeout: float = 60) -> Dict[str, Any]:
	        """Send the image and *prompt* to the model and return its JSON object.

	        Args:
	            image_path: Path of the image to attach to the request.
	            prompt: User instruction sent alongside the image.
	            timeout: Seconds passed to ``requests.post`` as its timeout.

	        Returns:
	            The JSON object parsed from the model's reply, or ``{}`` when the
	            API key is missing, the image cannot be encoded, the request
	            fails, the API answers with a non-200 status, or the reply is not
	            a JSON object.
	        """
	        if not self.api_key:
	            return {}

	        image_name = os.path.basename(image_path)
	        logging.info("VLM: Starting extraction for %s using %s", image_name, self.model)

	        base64_image = self.image_to_base64(image_path)
	        if not base64_image:
	            return {}

	        headers = {
	            "Content-Type": "application/json",
	            "Authorization": f"Bearer {self.api_key}",
	        }

	        payload = {
	            "model": self.model,
	            "messages": [
	                {
	                    "role": "system",
	                    "content": "You are a strict information extraction engine for business cards. Return only valid JSON. Do not include any other text.",
	                },
	                {
	                    "role": "user",
	                    "content": [
	                        {"type": "text", "text": prompt},
	                        {
	                            "type": "image_url",
	                            "image_url": {
	                                "url": f"data:image/jpeg;base64,{base64_image}",
	                            },
	                        },
	                    ],
	                },
	            ],
	            "response_format": {"type": "json_object"},
	            "temperature": 0.1,
	        }

	        try:
	            resp = requests.post(self.url, headers=headers, json=payload, timeout=timeout)
	            if resp.status_code != 200:
	                logging.error("VLM API Error: %s - %s", resp.status_code, resp.text)
	                return {}

	            content = resp.json()["choices"][0]["message"]["content"]
	            data = json.loads(content)
	        except requests.exceptions.Timeout:
	            logging.error("VLM: Request timed out.")
	            return {}
	        except requests.exceptions.RequestException as e:
	            logging.error("VLM: Request failed: %s", e)
	            return {}
	        except (KeyError, IndexError, TypeError, ValueError) as e:
	            # Missing keys, an empty choices list, a null content field or
	            # text that is not valid JSON all end up here.
	            logging.error("VLM: Malformed API response: %s", e)
	            return {}
	        except Exception as e:
	            # Last-resort boundary so extraction never raises to the caller.
	            logging.error("VLM: Unexpected error: %s", e)
	            return {}

	        # json.loads can yield a list or scalar; callers are promised a dict.
	        if not isinstance(data, dict):
	            logging.error("VLM: Expected a JSON object but received %s", type(data).__name__)
	            return {}

	        logging.info("VLM: Successfully extracted structured data from %s", image_name)
	        return data

	    def process(self, image_path: str) -> Dict[str, Any]:
	        """Run extraction on *image_path* with the built-in business-card prompt.

	        Args:
	            image_path: Path of the business-card image.

	        Returns:
	            Whatever ``extract_structured_data`` returns for the fixed prompt.
	        """
	        # The literal's lines are kept exactly as they were so the prompt text
	        # sent to the model is unchanged.
	        prompt = """
	Extract structured text from this business card and return ONLY valid JSON.
	Fields: Name, Designation, Company, Contact, Address, Email, Link.
	Every value must be a JSON array. If not found, use [].
	"""
	        return self.extract_structured_data(image_path, prompt)