File size: 11,087 Bytes
9883bdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a3681c
 
 
 
 
 
9883bdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46f2212
 
9883bdb
 
 
46f2212
9883bdb
 
 
 
 
 
 
 
 
 
 
 
3a3681c
9883bdb
46f2212
9883bdb
 
46f2212
9883bdb
46f2212
 
 
 
4fc1a2f
9883bdb
46f2212
 
 
 
 
9883bdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
import base64
from openai import OpenAI
from PIL import Image
from io import BytesIO
import os

def resize_image(image_input, max_size=1024, preserve_aspect_ratio=True):
    """
    Resize an image, optionally preserving its aspect ratio.

    Args:
        image_input: Either a PIL Image object or a string file path to an image
        max_size (int): Maximum width or height (whichever is larger)
        preserve_aspect_ratio (bool): If True, scale the longer side down to
            max_size and the other side proportionally; if False, force an
            exact max_size x max_size output.

    Returns:
        PIL.Image: Resized image (the original object is returned unchanged
        when no resizing is needed).

    Raises:
        FileNotFoundError: If image_input is a path that does not exist.
    """
    # Load the image if it's a file path
    if isinstance(image_input, str):
        if not os.path.exists(image_input):
            raise FileNotFoundError(f"Image file not found: {image_input}")
        img = Image.open(image_input)
    else:
        # Already a PIL Image
        img = image_input

    # Get original dimensions
    width, height = img.size

    if preserve_aspect_ratio:
        # Skip if already within bounds. This short-circuit is only valid in
        # aspect-preserving mode: with preserve_aspect_ratio=False the caller
        # asked for an exact max_size x max_size image, so a small input must
        # still be resized (the original code returned it unchanged — bug).
        if width <= max_size and height <= max_size:
            return img
        # Scale the longer side to max_size; max(1, ...) guards against a
        # zero dimension for extreme aspect ratios (int() truncates).
        if width > height:
            new_width = max_size
            new_height = max(1, int(height * (max_size / width)))
        else:
            new_height = max_size
            new_width = max(1, int(width * (max_size / height)))
    else:
        new_width = max_size
        new_height = max_size

    # Resize the image with a high-quality downsampling filter
    resized_img = img.resize((new_width, new_height), Image.LANCZOS)
    return resized_img

def encode_image(image_input, format="JPEG", max_size=None):
    """
    Convert an image to a base64 encoded string, with optional resizing.

    Args:
        image_input: Either a PIL Image object or a string file path to an image
        format: Image format for re-encoding (default: "JPEG"). Only used when
            the image is (re-)saved; the no-resize file path case returns the
            file's original bytes untouched.
        max_size: Maximum size for the image (width or height). If None, no
            resizing is done.

    Returns:
        str: base64 encoded string of the image.

    Raises:
        FileNotFoundError: If image_input is a path that does not exist.
    """
    if isinstance(image_input, str):
        # Input is a file path
        if not os.path.exists(image_input):
            raise FileNotFoundError(f"Image file not found: {image_input}")

        if not max_size:
            # Fast path: encode the raw file bytes directly. This deliberately
            # preserves the file's original container format (e.g. a PNG stays
            # a PNG) instead of re-encoding through `format`.
            with open(image_input, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")

        # Fall through to the common resize-and-encode path below.
        image_input = Image.open(image_input)

    # Input is now a PIL Image object (loaded or passed in directly).
    if max_size:
        image_input = resize_image(image_input, max_size=max_size)

    buffered = BytesIO()
    image_input.save(buffered, format=format)
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

class OpenRouterAPI:
    """Thin wrapper around the OpenAI SDK pointed at the OpenRouter API.

    Provides helpers for listing models, image captioning, text generation,
    simple image classification, and rough cost estimation.
    """

    def __init__(self, api_key=None, base_url="https://openrouter.ai/api/v1"):
        """
        Initialize the OpenRouter client.

        Args:
            api_key (str, optional): OpenRouter API key. If None, will try to
                get from the OPENROUTER_API_KEY environment variable.
            base_url (str): API endpoint; defaults to OpenRouter's v1 URL.

        Raises:
            ValueError: If no API key is provided or found in the environment.
        """
        api_key = api_key or os.getenv("OPENROUTER_API_KEY")
        if not api_key:
            raise ValueError("OpenRouter API key not provided and not found in environment variables")

        self.client = OpenAI(
            api_key=api_key,
            base_url=base_url
        )

    def list_models(self):
        """
        List all available models on OpenRouter.

        Returns:
            list: List of model ID strings.
        """
        models = self.client.models.list()
        return [model.id for model in models.data]

    def generate_caption(self, image_path,
                     model="anthropic/claude-3-7-sonnet",
                     prompt_dev="",
                     prompt="Give a very brief description of this image.",
                     detail="high",
                     temperature=0.7,
                     max_image_size=1024):
        """
        Generate captions for an image using OpenRouter models.

        Args:
            image_path (str): Path to the image file
            model (str): Model to use (e.g., 'anthropic/claude-3-7-sonnet', 'openai/gpt-4o')
            prompt_dev (str): System prompt or developer prompt
            prompt (str): Text prompt to guide caption generation
            detail (str): Level of detail for image analysis ('low', 'high', etc.) - only applies to OpenAI models
            temperature (float): Sampling temperature for generation
            max_image_size (int): Maximum dimension of the image before encoding. Set to None to disable resizing.

        Returns:
            str: Generated caption
        """
        # Getting the Base64 string with optional resizing
        base64_image = encode_image(image_path, max_size=max_image_size)

        # Prepare messages based on OpenRouter's format
        messages = []

        # Add system message if prompt_dev is provided
        if prompt_dev:
            messages.append({
                "role": "system",
                "content": prompt_dev
            })

        # Add user message with text and image.
        # Fix: `detail` was previously accepted but never sent; it belongs
        # inside the image_url object per the OpenAI vision message format.
        content = [
            {
                "type": "text",
                "text": prompt,
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}",
                    "detail": detail,
                },
            },
        ]

        messages.append({
            "role": "user",
            "content": content
        })

        # Fix: OpenRouter provider preferences are a top-level request-body
        # field, not a message field (where they were silently ignored).
        # extra_body merges them into the JSON payload via the OpenAI SDK.
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            extra_body={"provider": {"data_collection": "deny"}}
        )

        return response.choices[0].message.content

    def generate_text_response(self, text_prompt,
                              model="anthropic/claude-3-5-haiku",
                              prompt_dev="",
                              temperature=0.7):
        """
        Generate responses based on text input using OpenRouter models.

        Args:
            text_prompt (str): The text to analyze or respond to
            model (str): Model to use (e.g., 'anthropic/claude-3-5-haiku', 'openai/gpt-4o-mini', 'google/gemini-pro')
            prompt_dev (str): System prompt or developer prompt
            temperature (float): Sampling temperature for generation

        Returns:
            str: Generated response
        """
        # Prepare messages based on OpenRouter's format
        messages = []

        # Add system message if prompt_dev is provided
        if prompt_dev:
            messages.append({
                "role": "system",
                "content": prompt_dev
            })

        # Add user message with text
        messages.append({
            "role": "user",
            "content": text_prompt
        })

        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
        )

        return response.choices[0].message.content

    def classify_objs(self, image_path,
                     categories=["Painting/sketch", "Statue/Bust", "Clothing", "Porcelain/Ceramic tableware", "Text based Document", "Other"],
                     model="openai/gpt-4o-mini",
                     detail="low",
                     max_image_size=512):  # Smaller size for classification since less detail is needed
        """
        Classify objects in an image into predefined categories.

        Note: `categories` uses a mutable (list) default; it is never mutated
        here, and the list repr is baked into the prompt, so it is kept as-is
        for backward compatibility.

        Args:
            image_path (str): Path to the image file
            categories (list): List of categories for classification
            model (str): Model to use for classification
            detail (str): Level of detail for image analysis ('low', 'high') - only applies to OpenAI models
            max_image_size (int): Maximum dimension for the image. Can be smaller for classification tasks.

        Returns:
            str: Classification result (one of `categories`, per the prompt contract)
        """
        prompt = f"This is an image of a museum object. Classify it into one of these categories: {categories}. Only classify it if you are confident it belongs in that category and the category represents the main portion of the image, otherwise return 'Other'. Respond with only the category name."
        return self.generate_caption(image_path, model=model, prompt=prompt, detail=detail, max_image_size=max_image_size)

    def estimate_cost(self, model, tokens_in=1000, tokens_out=200, image=False, detail="low"):
        """
        Estimate the cost of using a specific model based on input/output tokens.

        Args:
            model (str): Model identifier
            tokens_in (int): Number of input tokens
            tokens_out (int): Number of output tokens
            image (bool): Whether the request includes an image
            detail (str): Image detail level ('low', 'high')

        Returns:
            dict: Cost estimate with keys model, input_tokens, output_tokens,
                input_cost, output_cost, total_cost (costs in USD).
        """
        # This is a simplified approach - in a real implementation,
        # you might want to use OpenRouter's pricing API or maintain
        # a more complete pricing table.

        # Simplified pricing mapping (in USD per 1M tokens).
        # These are example values - please update with actual OpenRouter pricing.
        pricing = {
            "anthropic/claude-3-7-sonnet": {"input": 15.0, "output": 75.0},
            "anthropic/claude-3-5-haiku": {"input": 1.0, "output": 5.0},
            "openai/gpt-4o": {"input": 10.0, "output": 30.0},
            "openai/gpt-4o-mini": {"input": 0.2, "output": 0.6},
            "google/gemini-pro": {"input": 0.5, "output": 1.5},
        }

        # Default to a moderate pricing if model not found
        model_pricing = pricing.get(model, {"input": 5.0, "output": 15.0})

        # Rough image token estimate; counts toward input tokens
        image_tokens = 0
        if image:
            if detail == "low":
                image_tokens = 1200
            else:  # high
                image_tokens = 4000

        # Calculate costs (pricing table is per 1M tokens)
        input_cost = (tokens_in + image_tokens) * model_pricing["input"] / 1000000
        output_cost = tokens_out * model_pricing["output"] / 1000000
        total_cost = input_cost + output_cost

        return {
            "model": model,
            "input_tokens": tokens_in + image_tokens,
            "output_tokens": tokens_out,
            "input_cost": input_cost,
            "output_cost": output_cost,
            "total_cost": total_cost
        }