Final_Assignment_Template / vision_llm.py
silasyl's picture
Initial commit with LFS-tracked files
ecbc0b3
Raw
History Blame Contribute Delete
2.34 kB
import io
import base64
import os
import requests
from PIL import Image
from smolagents import tool, OpenAIServerModel
from tools import get_file_content
def encode_image(image_bytes: bytes, new_size=512):
    """Scale an image so its longer side is ``new_size`` px and return it as base64 JPEG.

    The aspect ratio is preserved. The same ratio is applied whether the image
    is larger or smaller than ``new_size``, so small images are scaled up.

    Args:
        image_bytes: Raw bytes of an image file that Pillow can open.
        new_size: Target length, in pixels, of the longer side after resizing.

    Returns:
        The resized image re-encoded as JPEG, base64-encoded and decoded to a
        UTF-8 ``str``.
    """
    # Close the source image once the pixels have been copied into the RGB
    # image; convert("RGB") normalises the mode before the JPEG save below.
    with Image.open(io.BytesIO(image_bytes)) as source:
        image = source.convert("RGB")

    original_width, original_height = image.size
    ratio = new_size / max(original_width, original_height)

    # int() truncates, so a very elongated image (e.g. 2000x1) would otherwise
    # get a 0-pixel short side, which resize() rejects. Keep at least 1 pixel.
    new_width = max(1, int(original_width * ratio))
    new_height = max(1, int(original_height * ratio))

    resized_image = image.resize((new_width, new_height))
    buffered = io.BytesIO()
    resized_image.save(buffered, format='JPEG')
    return base64.b64encode(buffered.getvalue()).decode('utf-8')
def download_image(task_id: str, api_url: str) -> str:
    """Download the file for a task and return it as a base64-encoded image.

    Args:
        task_id: Identifier forwarded to ``get_file_content`` to select the file.
        api_url: API URL forwarded to ``get_file_content``.

    Returns:
        The result of ``encode_image`` applied to the downloaded bytes
        (``response.content``), i.e. a base64 string.
    """
    # NOTE(review): presumably get_file_content returns a requests.Response;
    # only its ``.content`` attribute is relied on here.
    response = get_file_content(task_id, api_url)
    return encode_image(response.content)
@tool
def call_vision_llm(user_query: str, file_id: str, file_url: str) -> str:
    """
    Downloads the image using the file_id and file_url, then analyzes it using a vision-based LLM, following user query.
    Args:
        user_query: User request on image.
        file_id: metadata required to download the image.
        file_url: metadata required to download the image.
    """
    # NOTE(review): the docstring above is kept verbatim on the understanding
    # that smolagents' @tool uses it as the tool description - confirm before rewording.

    # Fetch the image first so a failed download never reaches model setup.
    image_b64 = download_image(file_id, file_url)

    llm = OpenAIServerModel(
        api_key=os.getenv('OPENAI_API_KEY'),
        model_id='gpt-4o-mini',
        temperature=0,
    )

    # One user turn with two parts: the question, then the image as a data URL.
    text_part = {"type": "text", "text": user_query}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{image_b64}",
            "detail": "low",
        },
    }
    conversation = [{"role": "user", "content": [text_part, image_part]}]

    return llm(conversation).content