# File: decipherai-api/services/groq_vision_classifier.py
# Uploader: Akshay30
# Commit: 2f4af3f ("Initial DecipherAI backend deployment")
import base64
import json
import os
from io import BytesIO
from PIL import Image
from groq import Groq
class GroqVisionScriptClassifier:
    """Classify images of ancient scripts with a Groq-hosted vision model.

    The classifier sends a JPEG-encoded copy of the image to the Groq chat
    completions API and maps the model's JSON answer onto one of the
    supported script names ("egyptian", "greek", "latin", "cuneiform"), or
    "unknown" when the image cannot be processed or the answer is unusable.
    """

    # Script names this classifier can return (besides "unknown").
    _SUPPORTED_SCRIPTS = ("egyptian", "greek", "latin", "cuneiform")

    # Longest image side, in pixels, before the image is downscaled.
    _MAX_DIMENSION = 1200

    # Upper bound on the base64 payload length (4MB limit).
    _MAX_BASE64_BYTES = 4 * 1024 * 1024

    # JPEG qualities tried in order until the payload fits the size limit.
    _JPEG_QUALITIES = (90, 70, 50)

    # Keywords used by the plain-text fallback, checked in priority order.
    # Cuneiform comes first because its keywords are the most specific.
    _TEXT_FALLBACK_KEYWORDS = (
        ("cuneiform", ("CUNEIFORM", "WEDGE", "CLAY", "MESOPOTAMIAN",
                       "AKKADIAN", "SUMERIAN", "BABYLONIAN")),
        ("egyptian", ("EGYPTIAN", "HIEROGLYPH")),
        ("greek", ("GREEK",)),
        ("latin", ("LATIN", "ROMAN")),
    )

    # Prompt sent with every image; kept short to avoid token limit issues.
    _CLASSIFICATION_PROMPT = (
        "Analyze this image of ancient text/script as an expert paleographer.\n"
        "Classify it as ONE of these ancient script types:\n"
        "- EGYPTIAN: Hieroglyphic symbols (birds, eyes, human figures, cartouches)\n"
        "- GREEK: Ancient/medieval Greek alphabet (α,β,γ,δ,ε,ζ,η,θ) with diacritics\n"
        "- LATIN: Latin alphabet letters, Roman inscriptions, medieval manuscripts\n"
        "- CUNEIFORM: Wedge-shaped impressions on clay tablets (triangular marks)\n"
        "IMPORTANT: Cuneiform has geometric wedge patterns, NOT pictures like hieroglyphs.\n"
        "Respond ONLY with JSON:\n"
        '{"classification": "EGYPTIAN" or "GREEK" or "LATIN" or "CUNEIFORM", '
        '"confidence": 0.0-1.0}'
    )

    def __init__(self, groq_api_key):
        """Create the Groq client and select the vision model.

        Args:
            groq_api_key: API key passed to the ``Groq`` client.
        """
        self.groq_client = Groq(api_key=groq_api_key)
        self.vision_model = "meta-llama/llama-4-scout-17b-16e-instruct"
        print(f"[INFO] Groq Vision Classifier initialized with {self.vision_model}")

    def classify_script(self, image_path) -> str:
        """Classify the script shown in the image at ``image_path``.

        Args:
            image_path: Path of the image file to classify.

        Returns:
            One of the supported script names in lower case, or "unknown"
            when the image cannot be encoded, the API call fails, or the
            response cannot be interpreted. Never raises.
        """
        try:
            base64_image = self._image_to_base64(image_path)
            if not base64_image:
                return "unknown"

            response = self._query_groq_vision(base64_image)
            script_type = self._parse_classification_response(response)
            print(f"[INFO] Llama Vision classified script as: {script_type}")
            return script_type.lower()
        except Exception as e:
            print(f"[ERROR] Groq Vision script classification failed: {e}")
            return "unknown"

    def _image_to_base64(self, image_path):
        """Encode the image as a base64 JPEG string for the vision API.

        The image is converted to RGB (JPEG cannot store alpha or palette
        modes), downscaled so its longest side is at most
        ``_MAX_DIMENSION`` pixels, and saved at decreasing JPEG quality
        until the base64 text fits within ``_MAX_BASE64_BYTES``.

        Args:
            image_path: Path of the image file to encode.

        Returns:
            The base64 text, or ``None`` if the image cannot be read or
            encoded. If every quality level is still too large the smallest
            attempt is returned (best effort) after logging a warning.
        """
        try:
            # convert() loads the pixel data and returns a new image, so the
            # underlying file can be closed as soon as the block exits.
            with Image.open(image_path) as source:
                image = source.convert("RGB")

            if max(image.size) > self._MAX_DIMENSION:
                image.thumbnail(
                    (self._MAX_DIMENSION, self._MAX_DIMENSION),
                    Image.Resampling.LANCZOS,
                )

            image_b64 = None
            for quality in self._JPEG_QUALITIES:
                buffer = BytesIO()
                image.save(buffer, format="JPEG", quality=quality)
                image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
                if len(image_b64) <= self._MAX_BASE64_BYTES:
                    return image_b64

            print("[WARN] Encoded image still exceeds the 4MB base64 limit")
            return image_b64
        except Exception as e:
            print(f"[ERROR] Image to base64 conversion failed: {e}")
            return None

    def _query_groq_vision(self, base64_image):
        """Send the encoded image and prompt to the Groq vision model.

        Args:
            base64_image: Base64 text of a JPEG image.

        Returns:
            The message content of the first completion choice, or ``None``
            if the API call fails for any reason.
        """
        try:
            completion = self.groq_client.chat.completions.create(
                model=self.vision_model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": self._CLASSIFICATION_PROMPT},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                },
                            },
                        ],
                    }
                ],
                temperature=0.1,  # Low temperature for consistent classification
                max_completion_tokens=100,  # Kept small to avoid token errors
                top_p=0.9,
                stream=False,
                response_format={"type": "json_object"},
            )
            return completion.choices[0].message.content
        except Exception as e:
            print(f"[ERROR] Groq Vision API call failed: {e}")
            return None

    def _parse_classification_response(self, response) -> str:
        """Map the model's raw response onto a supported script name.

        The response is expected to be a JSON object with a
        ``classification`` string and an optional ``confidence`` number.
        If it is not valid JSON, a keyword search over the raw text is used
        instead.

        Args:
            response: Raw response text from the model, or ``None``.

        Returns:
            A supported script name in lower case, or "unknown".
        """
        if not response:
            return "unknown"

        try:
            data = json.loads(response)
        except json.JSONDecodeError:
            print(f"[WARN] Failed to parse JSON response, trying text parsing: {response}")
            return self._classify_from_text(response)
        except Exception as e:
            print(f"[ERROR] Response parsing failed: {e}")
            return "unknown"

        if not isinstance(data, dict):
            print(f"[ERROR] Response parsing failed: expected a JSON object, got {type(data).__name__}")
            return "unknown"

        # Confidence is only logged; a missing or malformed value must not
        # invalidate an otherwise usable classification.
        confidence = data.get('confidence', 0.0)
        try:
            print(f"[INFO] Vision model confidence: {float(confidence):.3f}")
        except (TypeError, ValueError):
            print(f"[WARN] Vision model returned non-numeric confidence: {confidence!r}")

        classification = data.get('classification', '')
        if not isinstance(classification, str):
            print(f"[WARN] Unknown classification: {classification}")
            return "unknown"

        label = classification.strip().upper()
        script = label.lower()
        if script in self._SUPPORTED_SCRIPTS:
            return script

        print(f"[WARN] Unknown classification: {label}")
        return "unknown"

    def _classify_from_text(self, response) -> str:
        """Guess the script from keywords in a non-JSON response."""
        response_upper = str(response).strip().upper()
        for script, keywords in self._TEXT_FALLBACK_KEYWORDS:
            if any(keyword in response_upper for keyword in keywords):
                return script
        return "unknown"

    def classify_with_fallback(self, image_path, max_retries=2) -> str:
        """Classify the image, retrying while the result is "unknown".

        Args:
            image_path: Path of the image file to classify.
            max_retries: Number of extra attempts after the first one.

        Returns:
            The first non-"unknown" result, or "unknown" if every attempt
            fails or returns "unknown".
        """
        for attempt in range(max_retries + 1):
            is_last_attempt = attempt >= max_retries
            try:
                result = self.classify_script(image_path)
            except Exception as e:
                if is_last_attempt:
                    print(f"[ERROR] All classification attempts failed: {e}")
                    return "unknown"
                print(f"[WARN] Classification attempt {attempt + 1} failed: {e}, retrying...")
                continue

            if result != "unknown":
                return result
            if is_last_attempt:
                print("[WARN] All classification attempts returned unknown")
                return "unknown"
            print(f"[INFO] Classification attempt {attempt + 1} returned unknown, retrying...")

        # Only reached when max_retries is negative and no attempt was made.
        return "unknown"

    def get_supported_scripts(self) -> list:
        """Return a new list of the supported script names."""
        return list(self._SUPPORTED_SCRIPTS)

    def validate_classification(self, script_type, confidence_threshold=0.7) -> bool:
        """Return whether ``script_type`` is one of the supported scripts.

        Args:
            script_type: Script name to check.
            confidence_threshold: Currently unused; accepted only so
                existing callers keep working.

        Returns:
            ``True`` for a supported script name, ``False`` otherwise.
        """
        if script_type not in self.get_supported_scripts():
            print(f"[WARN] Unsupported script type: {script_type}")
            return False
        # All classifications from Llama Vision are considered valid
        return True

    def get_model_info(self) -> dict:
        """Return a dict describing the model, provider and capabilities."""
        return {
            "model": self.vision_model,
            "provider": "Groq",
            "supported_scripts": self.get_supported_scripts(),
            "features": [
                "Ancient script classification",
                "Multi-script support",
                "Cuneiform wedge detection",
                "Clay tablet recognition",
                "High-resolution image processing",
            ],
        }

    def debug_classification(self, image_path, save_debug_info=False) -> str:
        """Classify the image while printing each intermediate step.

        Args:
            image_path: Path of the image file to classify.
            save_debug_info: When true, also write the collected details to
                a ``debug_classification_*.json`` file in the current
                working directory.

        Returns:
            The classification result, or "unknown" on failure. A failure
            to write the debug file does not change the returned result.
        """
        try:
            print(f"[DEBUG] Starting classification for: {image_path}")

            # Read the properties inside a context manager so the file
            # handle is released immediately.
            with Image.open(image_path) as image:
                image_size = image.size
                image_mode = image.mode
            print(f"[DEBUG] Image size: {image_size}")
            print(f"[DEBUG] Image mode: {image_mode}")

            base64_image = self._image_to_base64(image_path)
            response = None
            if base64_image:
                print(f"[DEBUG] Base64 size: {len(base64_image)} characters")
                # Only query the API when there is an image to send.
                response = self._query_groq_vision(base64_image)
            print(f"[DEBUG] Raw API response: {response}")

            result = self._parse_classification_response(response)
            print(f"[DEBUG] Final classification: {result}")
        except Exception as e:
            print(f"[ERROR] Debug classification failed: {e}")
            return "unknown"

        if save_debug_info:
            self._save_debug_info(
                {
                    "image_path": str(image_path),
                    "image_size": list(image_size),
                    "base64_length": len(base64_image) if base64_image else 0,
                    "raw_response": response,
                    "classification": result,
                },
                image_path,
            )
        return result

    def _save_debug_info(self, debug_info, image_path):
        """Write ``debug_info`` to a JSON file; log instead of raising."""
        # NOTE: str hashes are randomized per process, so the numeric suffix
        # is not stable across runs for the same path.
        debug_file = f"debug_classification_{debug_info['classification']}_{hash(image_path) % 10000}.json"
        try:
            with open(debug_file, 'w') as f:
                json.dump(debug_info, f, indent=2)
            print(f"[DEBUG] Debug info saved to: {debug_file}")
        except (OSError, TypeError, ValueError) as e:
            print(f"[WARN] Could not save debug info: {e}")