import base64
import os

from openai import OpenAI

# Initialize NVIDIA Client.
# NOTE(review): the env var is named GEMINI_API_KEY_1 but is used as the
# NVIDIA API key -- confirm this is intentional; renamed here it would
# break existing deployments, so the name is kept as-is.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.getenv('GEMINI_API_KEY_1')
)

# Model configurations
PRIMARY_MODEL = "meta/llama-3.2-90b-vision-instruct"
FALLBACK_MODEL = "meta/llama-3.1-70b-instruct"  # Text-only fallback model
IMAGE_PATH = "image.png"


def encode_image(image_path):
    """Encode image to base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def summarize_with_vision_model(base64_image):
    """
    Attempt to summarize image using vision model.

    Args:
        base64_image: Base64 encoded image string

    Returns:
        str: Summary text or None if failed
    """
    try:
        print(f"šŸ” Attempting with primary vision model: {PRIMARY_MODEL}...")

        completion = client.chat.completions.create(
            model=PRIMARY_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Please summarize what you see in this image."},
                        {
                            "type": "image_url",
                            "image_url": {
                                # IMAGE_PATH is a .png, so the MIME type is hard-coded.
                                "url": f"data:image/png;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=500,
            temperature=0.2,
            stream=True
        )

        print("\nāœ… Image Summary (Vision Model):\n" + "-" * 50)
        summary = ""
        for chunk in completion:
            # Guard: streaming responses may emit chunks with an empty
            # choices list (e.g. final usage-only chunks).
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content is not None:
                print(content, end="", flush=True)
                summary += content
        print("\n" + "-" * 50)
        return summary

    except Exception as e:
        print(f"\nāš ļø Vision model failed: {e}")
        return None


def summarize_with_text_fallback():
    """
    Fallback method using text-only LLM.
    Provides a generic response when vision model fails.

    Returns:
        str: Fallback response
    """
    try:
        print(f"\nšŸ”„ Falling back to text model: {FALLBACK_MODEL}...")

        # Create a prompt that acknowledges the limitation
        prompt = """I attempted to analyze an image but the vision model is unavailable. 
Please provide a helpful response about what types of information can typically be extracted from images, and suggest alternative approaches for image analysis."""

        completion = client.chat.completions.create(
            model=FALLBACK_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            max_tokens=500,
            temperature=0.2,
            stream=True
        )

        print("\nšŸ’” Fallback Response (Text Model):\n" + "-" * 50)
        response = ""
        for chunk in completion:
            # Same empty-choices guard as the vision path.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content is not None:
                print(content, end="", flush=True)
                response += content
        print("\n" + "-" * 50)
        return response

    except Exception as e:
        print(f"\nāŒ Fallback model also failed: {e}")
        return None


def summarize_image():
    """
    Main function to summarize an image with fallback support.
    Attempts to use vision model first, falls back to text model if needed.
    """
    # Check if image exists
    if not os.path.exists(IMAGE_PATH):
        print(f"āŒ Error: {IMAGE_PATH} not found.")
        print(f"šŸ“ Current directory: {os.getcwd()}")
        print(f"šŸ“‹ Files in current directory: {os.listdir('.')}")
        return

    print(f"šŸ“ø Processing {IMAGE_PATH}...")
    print(f"šŸ“ File size: {os.path.getsize(IMAGE_PATH)} bytes\n")

    # Encode the image
    try:
        base64_image = encode_image(IMAGE_PATH)
    except Exception as e:
        print(f"āŒ Error encoding image: {e}")
        return

    # Try vision model first
    result = summarize_with_vision_model(base64_image)

    # If vision model failed, use fallback
    if result is None:
        print("\nšŸ”„ Primary model failed, attempting fallback...")
        result = summarize_with_text_fallback()

    # Final status
    if result is None:
        print("\nāŒ All methods failed. Please check:")
        print("   1. API key validity")
        print("   2. Network connection")
        print("   3. NVIDIA API service status")
    else:
        print("\nāœ… Image processing completed successfully!")


if __name__ == "__main__":
    summarize_image()