# Hugging Face Space: Akshay30/decipherai-api (status: Sleeping)
# File size: 10,331 bytes; revision 2f4af3f

import base64
import json
import os
from io import BytesIO
from PIL import Image
from groq import Groq


class GroqVisionScriptClassifier:
    """Classify the script shown in an image using a Groq-hosted vision model.

    The image is downscaled, re-encoded as a base64 JPEG and sent to the chat
    completions endpoint together with a prompt that asks for a JSON verdict.
    Every public method reports failure by returning the string ``"unknown"``
    instead of raising, and progress is reported with ``print``.
    """

    # Longest image edge (in pixels) kept before encoding.
    _MAX_IMAGE_EDGE = 1200
    # Upper bound on the base64 payload length (the 4MB limit the original
    # implementation targets for the Groq Vision API).
    _MAX_BASE64_CHARS = 4 * 1024 * 1024
    # JPEG qualities tried in order until the payload fits under the limit.
    _JPEG_QUALITIES = (90, 70, 50)
    # Labels the prompt asks the model to answer with -> values returned here.
    _LABEL_TO_SCRIPT = {
        "EGYPTIAN": "egyptian",
        "GREEK": "greek",
        "LATIN": "latin",
        "CUNEIFORM": "cuneiform",
    }
    # Keyword scan used when the answer is not usable JSON. Order matters:
    # cuneiform keywords are checked first because they are the most specific.
    _TEXT_FALLBACK_KEYWORDS = (
        ("cuneiform", ("CUNEIFORM", "WEDGE", "CLAY", "MESOPOTAMIAN",
                       "AKKADIAN", "SUMERIAN", "BABYLONIAN")),
        ("egyptian", ("EGYPTIAN", "HIEROGLYPH")),
        ("greek", ("GREEK",)),
        ("latin", ("LATIN", "ROMAN")),
    )

    def __init__(self, groq_api_key):
        """Create the Groq client and select the vision model.

        Args:
            groq_api_key: API key passed straight to ``Groq(api_key=...)``.
        """
        self.groq_client = Groq(api_key=groq_api_key)
        self.vision_model = "meta-llama/llama-4-scout-17b-16e-instruct"
        print(f"[INFO] Groq Vision Classifier initialized with {self.vision_model}")

    def classify_script(self, image_path):
        """Classify the script in one image with a single API round trip.

        Args:
            image_path: Anything ``PIL.Image.open`` accepts.

        Returns:
            ``"egyptian"``, ``"greek"``, ``"latin"``, ``"cuneiform"`` or
            ``"unknown"`` (also returned on any error).
        """
        try:
            base64_image = self._image_to_base64(image_path)
            if not base64_image:
                return "unknown"

            response = self._query_groq_vision(base64_image)
            script_type = self._parse_classification_response(response)

            print(f"[INFO] Llama Vision classified script as: {script_type}")
            return script_type.lower()

        except Exception as e:
            print(f"[ERROR] Groq Vision script classification failed: {e}")
            return "unknown"

    def _image_to_base64(self, image_path):
        """Encode an image as a base64 JPEG string small enough for the API.

        The image is shrunk so its longest edge is at most
        ``_MAX_IMAGE_EDGE`` pixels, then saved at decreasing JPEG quality
        until the base64 text fits within ``_MAX_BASE64_CHARS``.

        Returns:
            The base64 text, or ``None`` if the image cannot be read/encoded.
            If no quality level fits, the smallest attempt is returned.
        """
        try:
            # The context manager closes the underlying file handle, which
            # a bare Image.open() would leave open.
            with Image.open(image_path) as source:
                # JPEG cannot store alpha or palette data; without this
                # conversion RGBA/P/LA inputs (typical PNGs) fail on save().
                if source.mode in ("RGB", "L"):
                    image = source
                else:
                    image = source.convert("RGB")

                if max(image.size) > self._MAX_IMAGE_EDGE:
                    image.thumbnail(
                        (self._MAX_IMAGE_EDGE, self._MAX_IMAGE_EDGE),
                        Image.Resampling.LANCZOS,
                    )

                image_b64 = None
                for quality in self._JPEG_QUALITIES:
                    buffer = BytesIO()
                    image.save(buffer, format="JPEG", quality=quality)
                    image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
                    if len(image_b64) <= self._MAX_BASE64_CHARS:
                        break

            return image_b64

        except Exception as e:
            print(f"[ERROR] Image to base64 conversion failed: {e}")
            return None

    def _query_groq_vision(self, base64_image):
        """Send the encoded image plus the classification prompt to Groq.

        Args:
            base64_image: Base64 JPEG text from ``_image_to_base64``.

        Returns:
            The message content of the first completion choice, or ``None``
            when there is no image to send or the API call fails.
        """
        # Without this guard a failed encoding would be sent to the API as
        # the literal data URL "...;base64,None".
        if not base64_image:
            return None

        try:
            prompt = """Analyze this image of ancient text/script as an expert paleographer.

Classify it as ONE of these ancient script types:

- EGYPTIAN: Hieroglyphic symbols (birds, eyes, human figures, cartouches)
- GREEK: Ancient/medieval Greek alphabet (α,β,γ,δ,ε,ζ,η,θ) with diacritics
- LATIN: Latin alphabet letters, Roman inscriptions, medieval manuscripts
- CUNEIFORM: Wedge-shaped impressions on clay tablets (triangular marks)

IMPORTANT: Cuneiform has geometric wedge patterns, NOT pictures like hieroglyphs.

Respond ONLY with JSON:
{"classification": "EGYPTIAN" or "GREEK" or "LATIN" or "CUNEIFORM", "confidence": 0.0-1.0}"""

            completion = self.groq_client.chat.completions.create(
                model=self.vision_model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                temperature=0.1,  # Low temperature for consistent classification
                max_completion_tokens=100,  # The expected JSON answer is tiny
                top_p=0.9,
                stream=False,
                response_format={"type": "json_object"}
            )

            return completion.choices[0].message.content

        except Exception as e:
            print(f"[ERROR] Groq Vision API call failed: {e}")
            return None

    def _parse_classification_response(self, response):
        """Turn the model's raw answer into a supported script name.

        The answer is parsed as a JSON object with ``classification`` and
        ``confidence`` keys. If it is not valid JSON, or not a JSON object,
        a keyword scan of the raw text is used instead.

        Args:
            response: Raw message content, or ``None``.

        Returns:
            A supported script name, or ``"unknown"``.
        """
        if not response:
            return "unknown"

        try:
            data = json.loads(response)
        except json.JSONDecodeError:
            print(f"[WARN] Failed to parse JSON response, trying text parsing: {response}")
            return self._classify_from_text(response)
        except Exception as e:
            print(f"[ERROR] Response parsing failed: {e}")
            return "unknown"

        if not isinstance(data, dict):
            print(f"[WARN] JSON response is not an object, trying text parsing: {response}")
            return self._classify_from_text(response)

        # The confidence is only logged. A string or null value must not
        # abort parsing and throw away an otherwise valid classification.
        try:
            confidence = float(data.get('confidence', 0.0))
        except (TypeError, ValueError):
            confidence = 0.0
        print(f"[INFO] Vision model confidence: {confidence:.3f}")

        label = data.get('classification')
        classification = label.strip().upper() if isinstance(label, str) else ""

        script = self._LABEL_TO_SCRIPT.get(classification)
        if script is None:
            print(f"[WARN] Unknown classification: {classification}")
            return "unknown"
        return script

    def _classify_from_text(self, text):
        """Guess the script from free text by scanning for known keywords."""
        haystack = str(text).strip().upper()
        for script, keywords in self._TEXT_FALLBACK_KEYWORDS:
            if any(keyword in haystack for keyword in keywords):
                return script
        return "unknown"

    def classify_with_fallback(self, image_path, max_retries=2):
        """Classify an image, retrying while the result is ``"unknown"``.

        Args:
            image_path: Anything ``PIL.Image.open`` accepts.
            max_retries: Extra attempts after the first one, so up to
                ``max_retries + 1`` calls to ``classify_script`` are made.

        Returns:
            The first result that is not ``"unknown"``, else ``"unknown"``.
        """
        for attempt in range(max_retries + 1):
            is_last_attempt = attempt >= max_retries
            try:
                result = self.classify_script(image_path)
            except Exception as e:
                # classify_script already swallows its own errors; this only
                # guards against an overriding implementation that raises.
                if is_last_attempt:
                    print(f"[ERROR] All classification attempts failed: {e}")
                    return "unknown"
                print(f"[WARN] Classification attempt {attempt + 1} failed: {e}, retrying...")
                continue

            if result != "unknown":
                return result
            if is_last_attempt:
                print("[WARN] All classification attempts returned unknown")
                return "unknown"
            print(f"[INFO] Classification attempt {attempt + 1} returned unknown, retrying...")

        # Only reached when max_retries is negative (no attempt was made).
        return "unknown"

    def get_supported_scripts(self):
        """Return a new list of the script names this classifier can emit."""
        return list(self._LABEL_TO_SCRIPT.values())

    def validate_classification(self, script_type, confidence_threshold=0.7):
        """Report whether ``script_type`` is one of the supported scripts.

        Args:
            script_type: Script name to check (compared case-sensitively).
            confidence_threshold: Accepted for interface compatibility but
                currently unused, because no confidence value is passed in.

        Returns:
            ``True`` for a supported script name, ``False`` otherwise.
        """
        if script_type not in self.get_supported_scripts():
            print(f"[WARN] Unsupported script type: {script_type}")
            return False

        # Every supported label coming from the vision model is accepted.
        return True

    def get_model_info(self):
        """Return a dict describing the model, provider and supported scripts."""
        return {
            "model": self.vision_model,
            "provider": "Groq",
            "supported_scripts": self.get_supported_scripts(),
            "features": [
                "Ancient script classification",
                "Multi-script support",
                "Cuneiform wedge detection",
                "Clay tablet recognition",
                "High-resolution image processing"
            ]
        }

    def debug_classification(self, image_path, save_debug_info=False):
        """Run one classification while printing every intermediate value.

        Args:
            image_path: Anything ``PIL.Image.open`` accepts.
            save_debug_info: When true, also write the collected details to
                a ``debug_classification_<result>_<n>.json`` file in the
                current working directory.

        Returns:
            The classification result, or ``"unknown"`` on any error.
        """
        # Function-scope import: a checksum is only needed for this debug helper.
        import zlib

        try:
            print(f"[DEBUG] Starting classification for: {image_path}")

            # Read the properties inside a context manager so the file
            # handle is released before the image is opened again below.
            with Image.open(image_path) as image:
                image_size = image.size
                image_mode = image.mode
            print(f"[DEBUG] Image size: {image_size}")
            print(f"[DEBUG] Image mode: {image_mode}")

            base64_image = self._image_to_base64(image_path)
            if base64_image:
                print(f"[DEBUG] Base64 size: {len(base64_image)} characters")

            response = self._query_groq_vision(base64_image)
            print(f"[DEBUG] Raw API response: {response}")

            result = self._parse_classification_response(response)
            print(f"[DEBUG] Final classification: {result}")

            if save_debug_info:
                debug_info = {
                    "image_path": str(image_path),
                    "image_size": image_size,
                    "base64_length": len(base64_image) if base64_image else 0,
                    "raw_response": response,
                    "classification": result
                }

                # crc32 is stable across runs; the built-in hash() of a str
                # is salted per process, so it would give a different file
                # name for the same image on every run.
                path_tag = zlib.crc32(str(image_path).encode("utf-8")) % 10000
                debug_file = f"debug_classification_{result}_{path_tag}.json"
                with open(debug_file, 'w', encoding="utf-8") as f:
                    json.dump(debug_info, f, indent=2)
                print(f"[DEBUG] Debug info saved to: {debug_file}")

            return result

        except Exception as e:
            print(f"[ERROR] Debug classification failed: {e}")
            return "unknown"