"""Tool definitions for a multi-agent workflow integrated from LlamaIndex."""
import requests
import os
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import AutoModelForCausalLM, AutoTokenizer
from llama_index.core.tools import FunctionTool
hf_token = os.environ.get("HF_TOKEN")
# Load processor and model once (outside the function for efficiency)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
math_model_id = "Qwen/Qwen2.5-Math-1.5B"
math_tokenizer = AutoTokenizer.from_pretrained(math_model_id, token=hf_token)
math_model = AutoModelForCausalLM.from_pretrained(
    math_model_id,
    torch_dtype=torch.float16,  # half precision; use torch_dtype for broad transformers compatibility
    device_map="auto",
    token=hf_token,  # `use_auth_token` is deprecated in favor of `token`
)
def math_tool_func(problem: str) -> str:
    """Solve a math problem with Qwen2.5-Math and return the generated text."""
    inputs = math_tokenizer(problem, return_tensors="pt").to(math_model.device)
    outputs = math_model.generate(**inputs, max_new_tokens=128)
    return math_tokenizer.decode(outputs[0], skip_special_tokens=True)
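# The FunctionTool import above is otherwise unused here; a minimal sketch of
# how this function would typically be exposed to a LlamaIndex agent. The tool
# name and description below are assumptions, not taken from the original file.
math_tool = FunctionTool.from_defaults(
    fn=math_tool_func,
    name="math_solver",
    description="Solve a math problem given as plain text using Qwen2.5-Math.",
)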
def init_image_to_text(img_url: str) -> dict:
    """
    Convert an image URL into text captions using BLIP.

    Args:
        img_url (str): URL of the image to caption.

    Returns:
        dict: Contains both conditional and unconditional captions.
    """
    response = requests.get(img_url, stream=True, timeout=30)
    response.raise_for_status()  # fail fast on bad URLs instead of passing junk to PIL
    raw_image = Image.open(response.raw).convert("RGB")
    # Conditional captioning: the caption continues the given prompt
    conditional_prompt = "a photography of"
    inputs_cond = processor(raw_image, conditional_prompt, return_tensors="pt")
    out_cond = model.generate(**inputs_cond)
    conditional_caption = processor.decode(out_cond[0], skip_special_tokens=True)

    # Unconditional captioning: the caption is generated from the image alone
    inputs_uncond = processor(raw_image, return_tensors="pt")
    out_uncond = model.generate(**inputs_uncond)
    unconditional_caption = processor.decode(out_uncond[0], skip_special_tokens=True)

    return {
        "conditional_caption": conditional_caption,
        "unconditional_caption": unconditional_caption,
    }
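
# Companion wrapper for the captioning function, in the same sketch style as
# math_tool above (the name and description are assumptions):
image_caption_tool = FunctionTool.from_defaults(
    fn=init_image_to_text,
    name="image_captioner",
    description="Generate conditional and unconditional BLIP captions for an image URL.",
)

# These tools could then be handed to an agent elsewhere in the workflow,
# e.g. (illustrative only; agent class and llm are assumptions):
#     from llama_index.core.agent import ReActAgent
#     agent = ReActAgent.from_tools([math_tool, image_caption_tool], llm=llm)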