# Visual question answering helper: turns detected object labels and an image
# caption into a short natural-language answer using FLAN-T5.

from typing import Iterable, Union

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Loaded once at import time so every call to reason() reuses the same
# tokenizer and model instances.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# Few-shot prompt; the {objects}, {caption} and {question} placeholders are
# filled in by reason().
_PROMPT_TEMPLATE = """
You are a visual reasoning system.

Use ONLY the given objects and scene.
Do NOT invent new events or actions.

If an action is visible, describe it.
If no clear action is visible, describe the scene simply.

Example:
Objects: person, dog
Scene: a man walking a dog on a path
Question: What is happening in this image?
Answer: A person is walking a dog outdoors.

Objects: car
Scene: a car on a race track
Question: What is happening in this image?
Answer: A car is driving on a race track.

Now answer:
Objects: {objects}
Scene: {caption}
Question: {question}
Answer:
"""


def _format_objects(objects) -> str:
    """Render *objects* as the comma-separated list used in the prompt examples."""
    if isinstance(objects, str):
        return objects
    if isinstance(objects, (list, tuple, set, frozenset)):
        # Avoid the Python repr (``['person', 'dog']``) so the prompt matches
        # the ``person, dog`` format shown in the few-shot examples.
        return ", ".join(str(item) for item in objects)
    return str(objects)


def _clean_answer(raw: str) -> str:
    """Return the first line of the text following the last ``Answer:`` marker."""
    # Keep only what follows the final "Answer:" in case the generated text
    # repeats part of the prompt format.
    answer = raw.split("Answer:")[-1].strip()
    # Remove accidental extra parts: anything after the first line break.
    return answer.split("\n", 1)[0].rstrip()


def reason(
    objects: Union[str, Iterable[str]],
    caption: str,
    question: str,
) -> str:
    """Answer *question* about an image from its detected objects and caption.

    Args:
        objects: Object labels for the image, either as a ready-made string
            (e.g. ``"person, dog"``) or as a list/tuple/set of labels, which
            is joined with ``", "``.
        caption: Scene description inserted after ``Scene:`` in the prompt.
        question: Question inserted after ``Question:`` in the prompt.

    Returns:
        The generated answer, reduced to a single stripped line. May be an
        empty string if the model generates no text.
    """
    prompt = _PROMPT_TEMPLATE.format(
        objects=_format_objects(objects),
        caption=caption,
        question=question,
    )

    inputs = tokenizer(prompt, return_tensors="pt")

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=40,  # cap the answer at 40 generated tokens
        )

    raw = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return _clean_answer(raw)