# NOTE(review): the three lines below were Hugging Face Spaces page-status
# residue ("Spaces: Sleeping") from a scrape, not code; kept as a comment.
| import requests | |
| import os | |
| import torch | |
| from PIL import Image | |
| from transformers import BlipProcessor, BlipForConditionalGeneration | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from llama_index.core.tools import FunctionTool | |
# Hugging Face access token for gated/private model downloads (None is fine
# for public models).
hf_token = os.environ.get("HF_TOKEN")

# Load the BLIP captioning processor and model once at import time so every
# tool invocation reuses them instead of reloading the weights.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# Math model: half precision, auto-placed on available device(s) via
# accelerate's device_map.
math_model_id = "Qwen/Qwen2.5-Math-1.5B"
# `use_auth_token` is deprecated in recent transformers releases (removed in
# v5); `token` is the supported parameter name.
math_tokenizer = AutoTokenizer.from_pretrained(math_model_id, token=hf_token)
math_model = AutoModelForCausalLM.from_pretrained(
    math_model_id,
    dtype=torch.float16,
    device_map="auto",
    token=hf_token,
)
def math_tool_func(problem: str) -> str:
    """Solve a math problem with the Qwen2.5-Math model.

    Args:
        problem: The math problem / prompt text.

    Returns:
        The decoded generation (note: includes the prompt followed by the
        model's continuation, since the full output sequence is decoded).
    """
    inputs = math_tokenizer(problem, return_tensors="pt").to(math_model.device)
    # Inference only: disable autograd to avoid building a gradient graph
    # and wasting memory during generation.
    with torch.no_grad():
        outputs = math_model.generate(**inputs, max_new_tokens=128)
    return math_tokenizer.decode(outputs[0], skip_special_tokens=True)
def init_image_to_text(img_url: str) -> dict:
    """
    Convert an image URL into text captions using BLIP.

    Args:
        img_url (str): URL of the image to caption.

    Returns:
        dict: Contains both conditional and unconditional captions under the
        keys "conditional_caption" and "unconditional_caption".

    Raises:
        requests.HTTPError: If the image download returns an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Fail fast on bad URLs / server errors instead of handing an HTML error
    # page to PIL; the timeout prevents hanging forever on a stalled server.
    response = requests.get(img_url, stream=True, timeout=30)
    response.raise_for_status()
    raw_image = Image.open(response.raw).convert("RGB")

    # Conditional captioning: the text prompt primes the decoder.
    conditional_prompt = "a photography of"
    inputs_cond = processor(raw_image, conditional_prompt, return_tensors="pt")
    with torch.no_grad():  # inference only — no gradient graph needed
        out_cond = model.generate(**inputs_cond)
    conditional_caption = processor.decode(out_cond[0], skip_special_tokens=True)

    # Unconditional captioning: no prompt, model captions freely.
    inputs_uncond = processor(raw_image, return_tensors="pt")
    with torch.no_grad():
        out_uncond = model.generate(**inputs_uncond)
    unconditional_caption = processor.decode(out_uncond[0], skip_special_tokens=True)

    return {
        "conditional_caption": conditional_caption,
        "unconditional_caption": unconditional_caption,
    }