import os

import requests
import torch
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BlipForConditionalGeneration,
    BlipProcessor,
)
from llama_index.core.tools import FunctionTool

hf_token = os.environ.get("HF_TOKEN")

# Load the BLIP processor and model once, at module level, so repeated tool
# calls do not reload the weights.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)

# Load the Qwen math model once as well. `token` replaces the deprecated
# `use_auth_token` argument, and `torch_dtype` is the widely supported name
# for the dtype argument (newer transformers releases also accept `dtype`).
math_model_id = "Qwen/Qwen2.5-Math-1.5B"
math_tokenizer = AutoTokenizer.from_pretrained(math_model_id, token=hf_token)
math_model = AutoModelForCausalLM.from_pretrained(
    math_model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    token=hf_token,
)


def math_tool_func(problem: str) -> str:
    """Solve a math problem by generating a completion from Qwen2.5-Math."""
    inputs = math_tokenizer(problem, return_tensors="pt").to(math_model.device)
    outputs = math_model.generate(**inputs, max_new_tokens=128)
    return math_tokenizer.decode(outputs[0], skip_special_tokens=True)


def init_image_to_text(img_url: str) -> dict:
    """
    Convert an image URL into text captions using BLIP.

    Args:
        img_url (str): URL of the image to caption.

    Returns:
        dict: Contains both conditional and unconditional captions.
    """
    raw_image = Image.open(
        requests.get(img_url, stream=True, timeout=30).raw
    ).convert("RGB")

    # Conditional captioning: the prompt steers the start of the caption.
    conditional_prompt = "a photography of"
    inputs_cond = processor(raw_image, conditional_prompt, return_tensors="pt")
    out_cond = model.generate(**inputs_cond)
    conditional_caption = processor.decode(out_cond[0], skip_special_tokens=True)

    # Unconditional captioning: no prompt, BLIP describes the image freely.
    inputs_uncond = processor(raw_image, return_tensors="pt")
    out_uncond = model.generate(**inputs_uncond)
    unconditional_caption = processor.decode(out_uncond[0], skip_special_tokens=True)

    return {
        "conditional_caption": conditional_caption,
        "unconditional_caption": unconditional_caption,
    }
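

# The FunctionTool import above is otherwise unused; below is a minimal sketch
# of wrapping both functions as llama_index tools so an agent can call them.
# The tool names and descriptions are illustrative choices, not fixed by the
# code above.
math_tool = FunctionTool.from_defaults(
    fn=math_tool_func,
    name="math_solver",
    description="Solve a math problem stated in plain text using Qwen2.5-Math.",
)
image_caption_tool = FunctionTool.from_defaults(
    fn=init_image_to_text,
    name="image_captioner",
    description="Caption an image given its URL using BLIP.",
)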
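

# Quick smoke test when the script is run directly. The image URL is a
# placeholder (an assumption for illustration), so that call is left
# commented out; substitute a real, reachable image URL before enabling it.
if __name__ == "__main__":
    print(math_tool_func("What is the derivative of x^2 + 3x?"))
    # print(init_image_to_text("https://example.com/photo.jpg"))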