import base64
import json
import mimetypes
import os
from typing import List, Dict, Optional, Tuple

from openai import OpenAI
import google.generativeai as genai


class ModelConfig:
    """Configuration loader for API keys and model preferences.

    Values come from environment variables first, then from an optional
    ``KEY=value`` config file (lines starting with ``#`` are comments).
    Environment variables take precedence for API keys.
    """

    def __init__(self, config_file: str = "config.env"):
        self.config_file = config_file
        self.config = self._load_config()

    def _load_config(self) -> Dict:
        """Load configuration from file and environment variables.

        Returns:
            Dict with API keys, model lists, and request settings.
            Malformed numeric values in the file keep their defaults
            instead of raising.
        """
        config = {
            'nvidia_api_key': os.getenv('NVIDIA_API_KEY'),
            'gemini_api_key': os.getenv('GEMINI_API_KEY'),
            'nvidia_models': [],
            'nvidia_vision_models': [],
            'nvidia_text_models': [],
            'gemini_models': [],
            'max_tokens': 500,
            'temperature': 0.2,
            'request_timeout': 30,
            'max_retries': 2,
            'enable_streaming': True,
            'verbose_logging': True,
            'enable_gemini_fallback': True,
        }

        if not os.path.exists(self.config_file):
            return config

        def _csv(value: str) -> List[str]:
            # Comma-separated model list; drop empty entries.
            return [m.strip() for m in value.split(',') if m.strip()]

        def _bool(value: str) -> bool:
            return value.lower() == 'true'

        # Dispatch table: file key -> (config key, parser).
        parsers = {
            'NVIDIA_MODELS': ('nvidia_models', _csv),
            'NVIDIA_VISION_MODELS': ('nvidia_vision_models', _csv),
            'NVIDIA_TEXT_MODELS': ('nvidia_text_models', _csv),
            'GEMINI_MODELS': ('gemini_models', _csv),
            'MAX_TOKENS': ('max_tokens', int),
            'TEMPERATURE': ('temperature', float),
            'REQUEST_TIMEOUT': ('request_timeout', int),
            'MAX_RETRIES': ('max_retries', int),
            'ENABLE_STREAMING': ('enable_streaming', _bool),
            'VERBOSE_LOGGING': ('verbose_logging', _bool),
            'ENABLE_GEMINI_FALLBACK': ('enable_gemini_fallback', _bool),
        }

        with open(self.config_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#') or '=' not in line:
                    continue
                key, value = line.split('=', 1)
                key = key.strip()
                value = value.strip()
                # API keys: environment variables win over the file.
                if key == 'NVIDIA_API_KEY' and not config['nvidia_api_key']:
                    config['nvidia_api_key'] = value
                elif key == 'GEMINI_API_KEY' and not config['gemini_api_key']:
                    config['gemini_api_key'] = value
                elif key in parsers:
                    config_key, parse = parsers[key]
                    try:
                        config[config_key] = parse(value)
                    except ValueError:
                        # Malformed numeric value (e.g. MAX_TOKENS=abc):
                        # keep the default rather than crashing at startup.
                        pass
        return config

    def get(self, key: str, default=None):
        """Get a configuration value, or *default* if the key is unknown."""
        return self.config.get(key, default)


class ImageAnalyzer:
    """Multi-model image analyzer with cascading fallback support.

    Tries each configured NVIDIA model in order; if all fail and the
    Gemini fallback is enabled, tries each configured Gemini model.
    """

    def __init__(self, config_file: str = "config.env"):
        self.config = ModelConfig(config_file)
        self.verbose = self.config.get('verbose_logging', True)

        # Initialize NVIDIA client (OpenAI-compatible endpoint).
        nvidia_key = self.config.get('nvidia_api_key')
        if nvidia_key:
            self.nvidia_client = OpenAI(
                base_url="https://integrate.api.nvidia.com/v1",
                api_key=nvidia_key
            )
        else:
            self.nvidia_client = None
            self._log("āš ļø NVIDIA API key not found", force=True)

        # Initialize Gemini fallback (only if a key is present AND the
        # fallback is enabled in config).
        gemini_key = self.config.get('gemini_api_key')
        if gemini_key and self.config.get('enable_gemini_fallback'):
            genai.configure(api_key=gemini_key)
            self.gemini_enabled = True
        else:
            self.gemini_enabled = False
            if self.config.get('enable_gemini_fallback'):
                self._log("āš ļø Gemini API key not found", force=True)

    def _log(self, message: str, force: bool = False):
        """Print *message* if verbose logging is enabled (or *force*)."""
        if self.verbose or force:
            print(message)

    def encode_image(self, image_path: str) -> str:
        """Encode the image file at *image_path* to a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _guess_mime_type(image_path: str) -> str:
        """Guess the image MIME type from the file extension.

        Falls back to ``image/png`` (the previous hard-coded value) when
        the extension is unknown or not an image type.
        """
        mime, _ = mimetypes.guess_type(image_path)
        return mime if mime and mime.startswith('image/') else 'image/png'

    def _try_nvidia_model(self, model_name: str, base64_image: str,
                          prompt: str,
                          mime_type: str = "image/png") -> Tuple[bool, Optional[str]]:
        """Try to analyze an image with a specific NVIDIA model.

        Args:
            model_name: NVIDIA model identifier.
            base64_image: Base64-encoded image payload.
            prompt: Analysis prompt text.
            mime_type: MIME type for the data URL (defaults to PNG for
                backward compatibility).

        Returns:
            Tuple of (success: bool, response: Optional[str]).
        """
        if not self.nvidia_client:
            return False, None

        self._log(f"\n{'='*60}")
        self._log(f"šŸ”„ Attempting NVIDIA Model: {model_name}")
        self._log(f"{'='*60}")

        try:
            completion = self.nvidia_client.chat.completions.create(
                model=model_name,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    # Was hard-coded to image/png; now uses
                                    # the caller-supplied MIME type.
                                    "url": f"data:{mime_type};base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=self.config.get('max_tokens', 500),
                temperature=self.config.get('temperature', 0.2),
                stream=self.config.get('enable_streaming', True)
            )

            self._log(f"\nāœ… Model Response:\n" + "-"*60)
            response_text = ""
            if self.config.get('enable_streaming', True):
                for chunk in completion:
                    content = chunk.choices[0].delta.content
                    if content is not None:
                        response_text += content
                        # Stream fragments inline; logging each fragment via
                        # print() inserted a spurious newline per chunk.
                        if self.verbose:
                            print(content, end="", flush=True)
                if self.verbose:
                    print()  # terminate the streamed line
            else:
                response_text = completion.choices[0].message.content
                self._log(response_text)
            self._log("\n" + "-"*60)
            return True, response_text
        except Exception as e:
            # Any API/transport failure means "fall through to next model".
            self._log(f"\nāŒ Model Failed: {e}")
            return False, None

    def _try_gemini_model(self, model_name: str, base64_image: str,
                          prompt: str,
                          mime_type: str = "image/png") -> Tuple[bool, Optional[str]]:
        """Try to analyze an image with a specific Gemini model.

        Args:
            model_name: Gemini model identifier.
            base64_image: Base64-encoded image payload.
            prompt: Analysis prompt text.
            mime_type: MIME type of the image (defaults to PNG for
                backward compatibility).

        Returns:
            Tuple of (success: bool, response: Optional[str]).
        """
        if not self.gemini_enabled:
            return False, None

        self._log(f"\n{'='*60}")
        self._log(f"šŸ”„ Attempting Gemini Model: {model_name}")
        self._log(f"{'='*60}")

        try:
            model = genai.GenerativeModel(model_name)

            # Gemini takes raw bytes, not base64.
            image_bytes = base64.b64decode(base64_image)
            image_part = {
                'mime_type': mime_type,
                'data': image_bytes
            }

            response = model.generate_content([prompt, image_part])
            response_text = response.text

            self._log(f"\nāœ… Model Response:\n" + "-"*60)
            self._log(response_text)
            self._log("\n" + "-"*60)
            return True, response_text
        except Exception as e:
            # Any API/transport failure means "fall through to next model".
            self._log(f"\nāŒ Model Failed: {e}")
            return False, None

    def analyze_image(self, image_path: str,
                      prompt: str = "Please summarize what you see in this image.") -> Dict:
        """Analyze an image with cascading fallback across multiple models.

        Args:
            image_path: Path to the image file.
            prompt: Analysis prompt.

        Returns:
            Dict with keys ``success``, ``error``, ``model_used``,
            ``response`` (plus ``provider`` on success and ``suggestions``
            when every model fails).
        """
        if not os.path.exists(image_path):
            return {
                'success': False,
                'error': f"Image not found: {image_path}",
                'model_used': None,
                'response': None
            }

        self._log(f"šŸ“ø Processing {image_path}...", force=True)

        try:
            base64_image = self.encode_image(image_path)
        except Exception as e:
            return {
                'success': False,
                'error': f"Error encoding image: {e}",
                'model_used': None,
                'response': None
            }

        # Determine the MIME type once for all providers.
        mime_type = self._guess_mime_type(image_path)

        # Try NVIDIA models first, in configured order.
        for model_name in self.config.get('nvidia_models', []):
            success, response = self._try_nvidia_model(
                model_name, base64_image, prompt, mime_type)
            if success:
                return {
                    'success': True,
                    'error': None,
                    'model_used': model_name,
                    'provider': 'NVIDIA',
                    'response': response
                }

        # Fall back to Gemini models.
        if self.config.get('enable_gemini_fallback', True):
            self._log("\nāš ļø All NVIDIA models failed. Trying Gemini fallback...",
                      force=True)
            for model_name in self.config.get('gemini_models', []):
                success, response = self._try_gemini_model(
                    model_name, base64_image, prompt, mime_type)
                if success:
                    return {
                        'success': True,
                        'error': None,
                        'model_used': model_name,
                        'provider': 'Gemini',
                        'response': response
                    }

        # Every provider/model failed.
        return {
            'success': False,
            'error': 'All models failed to analyze the image',
            'model_used': None,
            'provider': None,
            'response': None,
            'suggestions': [
                'Check your API key validity',
                'Verify your internet connection',
                'Ensure the image format is supported',
                'Check API service status'
            ]
        }


def main():
    """Main function for standalone testing."""
    IMAGE_PATH = "image.png"

    analyzer = ImageAnalyzer()
    result = analyzer.analyze_image(IMAGE_PATH)

    print(f"\n{'='*60}")
    print(f"šŸ“Š ANALYSIS RESULTS")
    print(f"{'='*60}")
    print(f"Success: {result['success']}")
    if result['success']:
        print(f"Provider: {result['provider']}")
        print(f"Model Used: {result['model_used']}")
        print(f"\nāœ… Image analysis completed successfully!")
    else:
        print(f"Error: {result['error']}")
        if 'suggestions' in result:
            print(f"\nšŸ’” Suggestions:")
            for suggestion in result['suggestions']:
                print(f" - {suggestion}")


if __name__ == "__main__":
    main()