from consts import REASONING_START, REASONING_END, SOLUTION_START, SOLUTION_END
from transformers import TextStreamer
from unsloth import FastVisionModel


def inference(idx: int, model, dataset, tokenizer):
    """Run streamed generation for one vision-QA sample of *dataset*.

    Builds a single-turn chat prompt from the sample's question plus the
    project's reasoning/solution sentinel tokens, feeds the sample image and
    the rendered prompt through *tokenizer*, and generates up to 128 new
    tokens, streaming them to stdout as they arrive.

    Args:
        idx: Index of the sample in *dataset*.
        model: An unsloth FastVisionModel (switched to inference mode here).
        dataset: Indexable dataset whose items expose ``"decoded_image"`` and
            ``"question"`` keys.
        tokenizer: The matching multimodal processor/tokenizer.

    Returns:
        The tensor of generated token ids from ``model.generate``.
    """
    # Enable unsloth's fast inference mode (disables training-only paths).
    FastVisionModel.for_inference(model)

    image = dataset[idx]["decoded_image"]
    # Hoisted into a local: a double-quoted subscript inside a double-quoted
    # f-string is a SyntaxError on Python < 3.12 (PEP 701 relaxed this only
    # from 3.12 onward).
    question = dataset[idx]["question"]
    instruction = (
        f"{question}, provide your reasoning between {REASONING_START} and {REASONING_END} "
        f"and then your final answer between {SOLUTION_START} and (put a float here) {SOLUTION_END}"
    )

    messages = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": instruction}],
        }
    ]

    # Render the chat template into the model's prompt format.
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to("cuda")  # NOTE(review): assumes a CUDA device is available — confirm

    # Stream decoded tokens to stdout as they are produced; skip_prompt avoids
    # echoing the input prompt back.
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
    result = model.generate(
        **inputs,
        streamer=text_streamer,
        max_new_tokens=128,
        use_cache=True,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
    )
    return result