import base64
import os
from io import BytesIO

from openai import OpenAI
from PIL import Image


def resize_image(image_input, max_size=1024, preserve_aspect_ratio=True):
    """Resize an image so its largest dimension is at most ``max_size``.

    Args:
        image_input: Either a PIL Image object or a string file path to an image.
        max_size (int): Maximum width or height (whichever is larger).
        preserve_aspect_ratio (bool): Whether to preserve the aspect ratio.
            When False, the image is stretched to max_size x max_size.

    Returns:
        PIL.Image.Image: The resized image. The original object is returned
        unchanged when it already fits within ``max_size``.

    Raises:
        FileNotFoundError: If ``image_input`` is a path that does not exist.
    """
    # Load the image if it's a file path.
    if isinstance(image_input, str):
        if not os.path.exists(image_input):
            raise FileNotFoundError(f"Image file not found: {image_input}")
        img = Image.open(image_input)
    else:
        # Already a PIL Image.
        img = image_input

    width, height = img.size

    # Nothing to do if the image already fits within the bound.
    if width <= max_size and height <= max_size:
        return img

    if preserve_aspect_ratio:
        # Scale the larger dimension down to max_size and apply the same
        # ratio to the other dimension.
        if width > height:
            new_width = max_size
            new_height = int(height * (max_size / width))
        else:
            new_height = max_size
            new_width = int(width * (max_size / height))
    else:
        new_width = max_size
        new_height = max_size

    return img.resize((new_width, new_height), Image.LANCZOS)


def _pil_to_base64(img, format):
    """Serialize a PIL image to a base64 string in the given format.

    JPEG cannot store an alpha channel or palette; convert such images to
    RGB first to avoid "cannot write mode RGBA as JPEG" errors.
    """
    if format.upper() in ("JPEG", "JPG") and img.mode not in ("RGB", "L"):
        img = img.convert("RGB")
    buffered = BytesIO()
    img.save(buffered, format=format)
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def encode_image(image_input, format="JPEG", max_size=None):
    """Convert an image to a base64 encoded string, with optional resizing.

    Args:
        image_input: Either a PIL Image object or a string file path to an image.
        format: Image format for re-encoding (default: "JPEG"). Only used when
            the image is re-encoded (PIL input, or a path with ``max_size`` set);
            a path without ``max_size`` is read and encoded verbatim.
        max_size: Maximum size (width or height) for the image. If None,
            no resizing is done.

    Returns:
        str: Base64 encoded string of the image.

    Raises:
        FileNotFoundError: If ``image_input`` is a path that does not exist.
    """
    if isinstance(image_input, str):
        # Input is a file path.
        if not os.path.exists(image_input):
            raise FileNotFoundError(f"Image file not found: {image_input}")
        if max_size:
            # Load, resize, and re-encode.
            img = Image.open(image_input)
            return _pil_to_base64(resize_image(img, max_size=max_size), format)
        # No resizing requested: encode the raw file bytes directly, which
        # preserves the original format exactly.
        with open(image_input, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    # Input is a PIL Image object.
    if max_size:
        image_input = resize_image(image_input, max_size=max_size)
    return _pil_to_base64(image_input, format)


class OpenRouterAPI:
    """Thin wrapper around the OpenAI SDK pointed at the OpenRouter API."""

    def __init__(self, api_key=None, base_url="https://openrouter.ai/api/v1"):
        """Initialize the OpenRouter client.

        Args:
            api_key (str, optional): OpenRouter API key. If None, will try
                to get it from the OPENROUTER_API_KEY environment variable.
            base_url (str): API endpoint; defaults to OpenRouter's v1 URL.

        Raises:
            ValueError: If no API key is provided or found in the environment.
        """
        api_key = api_key or os.getenv("OPENROUTER_API_KEY")
        if not api_key:
            raise ValueError(
                "OpenRouter API key not provided and not found in environment variables"
            )
        self.client = OpenAI(api_key=api_key, base_url=base_url)

    def list_models(self):
        """List all available models on OpenRouter.

        Returns:
            list: List of model ID strings.
        """
        models = self.client.models.list()
        return [model.id for model in models.data]

    def generate_caption(self, image_path, model="anthropic/claude-3-7-sonnet",
                         prompt_dev="", prompt="Give a very brief description of this image.",
                         detail="high", temperature=0.7, max_image_size=1024):
        """Generate a caption for an image using OpenRouter models.

        Args:
            image_path (str): Path to the image file.
            model (str): Model to use (e.g., 'anthropic/claude-3-7-sonnet',
                'openai/gpt-4o').
            prompt_dev (str): System prompt or developer prompt.
            prompt (str): Text prompt to guide caption generation.
            detail (str): Level of detail for image analysis ('low', 'high') -
                only honoured by OpenAI-family models; others ignore it.
            temperature (float): Sampling temperature for generation.
            max_image_size (int): Maximum dimension of the image before
                encoding. Set to None to disable resizing.

        Returns:
            str: Generated caption.
        """
        # Get the base64 string with optional resizing.
        base64_image = encode_image(image_path, max_size=max_image_size)

        messages = []

        # Add system message if prompt_dev is provided.
        if prompt_dev:
            messages.append({"role": "system", "content": prompt_dev})

        # Add user message with text and image. Previously the `detail`
        # parameter was accepted but never forwarded; it now goes into the
        # image_url payload as the API expects.
        content = [
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}",
                    "detail": detail,
                },
            },
        ]
        messages.append({"role": "user", "content": content})

        # Provider routing preferences are a top-level request field on
        # OpenRouter, not part of a message, so they must be sent through
        # extra_body (the previous in-message placement was silently ignored).
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            extra_body={"provider": {"data_collection": "deny"}},
        )
        return response.choices[0].message.content

    def generate_text_response(self, text_prompt, model="anthropic/claude-3-5-haiku",
                               prompt_dev="", temperature=0.7):
        """Generate a response to text input using OpenRouter models.

        Args:
            text_prompt (str): The text to analyze or respond to.
            model (str): Model to use (e.g., 'anthropic/claude-3-5-haiku',
                'openai/gpt-4o-mini', 'google/gemini-pro').
            prompt_dev (str): System prompt or developer prompt.
            temperature (float): Sampling temperature for generation.

        Returns:
            str: Generated response.
        """
        messages = []

        # Add system message if prompt_dev is provided.
        if prompt_dev:
            messages.append({"role": "system", "content": prompt_dev})

        messages.append({"role": "user", "content": text_prompt})

        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
        )
        return response.choices[0].message.content

    def classify_objs(self, image_path, categories=None,
                      model="openai/gpt-4o-mini", detail="low",
                      max_image_size=512):
        """Classify objects in an image into predefined categories.

        Args:
            image_path (str): Path to the image file.
            categories (list, optional): List of categories for classification.
                Defaults to a museum-object category set. (Default is None
                rather than a list literal to avoid a mutable default argument.)
            model (str): Model to use for classification.
            detail (str): Level of detail for image analysis ('low', 'high') -
                only applies to OpenAI models.
            max_image_size (int): Maximum dimension for the image. A smaller
                size (512) is used by default since classification needs less
                detail than captioning.

        Returns:
            str: Classification result (a category name).
        """
        if categories is None:
            categories = ["Painting/sketch", "Statue/Bust", "Clothing",
                          "Porcelain/Ceramic tableware", "Text based Document",
                          "Other"]
        prompt = (
            f"This is an image of a museum object. Classify it into one of these "
            f"categories: {categories}. Only classify it if you are confident it "
            f"belongs in that category and the category represents the main portion "
            f"of the image, otherwise return 'Other'. Respond with only the category name."
        )
        return self.generate_caption(image_path, model=model, prompt=prompt,
                                     detail=detail, max_image_size=max_image_size)

    def estimate_cost(self, model, tokens_in=1000, tokens_out=200,
                      image=False, detail="low"):
        """Estimate the cost of using a specific model based on token counts.

        Args:
            model (str): Model identifier.
            tokens_in (int): Number of input tokens.
            tokens_out (int): Number of output tokens.
            image (bool): Whether the request includes an image.
            detail (str): Image detail level ('low', 'high').

        Returns:
            dict: Cost estimate with model, token counts, and USD costs.
        """
        # This is a simplified approach - a real implementation might use
        # OpenRouter's pricing API or maintain a complete pricing table.
        # Prices are in USD per 1M tokens; these are example values.
        pricing = {
            "anthropic/claude-3-7-sonnet": {"input": 15.0, "output": 75.0},
            "anthropic/claude-3-5-haiku": {"input": 1.0, "output": 5.0},
            "openai/gpt-4o": {"input": 10.0, "output": 30.0},
            "openai/gpt-4o-mini": {"input": 0.2, "output": 0.6},
            "google/gemini-pro": {"input": 0.5, "output": 1.5},
        }
        # Default to a moderate pricing if the model is not found.
        model_pricing = pricing.get(model, {"input": 5.0, "output": 15.0})

        # Rough image token estimation; actual counts vary per provider.
        image_tokens = 0
        if image:
            image_tokens = 1200 if detail == "low" else 4000

        input_cost = (tokens_in + image_tokens) * model_pricing["input"] / 1_000_000
        output_cost = tokens_out * model_pricing["output"] / 1_000_000

        return {
            "model": model,
            "input_tokens": tokens_in + image_tokens,
            "output_tokens": tokens_out,
            "input_cost": input_cost,
            "output_cost": output_cost,
            "total_cost": input_cost + output_cost,
        }