import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from huggingface_hub import login
import gradio as gr
import os
import gc

# ----------------------------
# AUTHENTICATION
# ----------------------------
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    print("No HF_TOKEN found. Please log in manually.")
    login()

# ----------------------------
# CONFIG
# ----------------------------
MODEL_NAME = "reverseforward/inferencemodel"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16

# Clear cache before loading
gc.collect()
if DEVICE == "cuda":
    torch.cuda.empty_cache()

# ----------------------------
# LOAD MODEL (with error handling)
# ----------------------------
print(f"Loading model on {DEVICE}...")
try:
    model = AutoModelForVision2Seq.from_pretrained(
        MODEL_NAME,
        torch_dtype=DTYPE,
        device_map="auto",
        token=HF_TOKEN,
        low_cpu_mem_usage=True,  # Reduce memory usage
    )
    processor = AutoProcessor.from_pretrained(
        MODEL_NAME,
        token=HF_TOKEN,
    )
    print("✓ Model loaded successfully.")
except Exception as e:
    print(f"✗ Error loading model: {e}")
    raise

# ----------------------------
# INFERENCE FUNCTION
# ----------------------------
def chat_with_image(image, text):
    try:
        if image is None or text.strip() == "":
            return "Please provide both an image and text input."

        # Clear memory before inference
        gc.collect()
        if DEVICE == "cuda":
            torch.cuda.empty_cache()

        # Prepare inputs
        inputs = processor(
            text=[text],
            images=[image],
            return_tensors="pt",
        ).to(DEVICE, DTYPE)

        # Generate output
        with torch.inference_mode():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                do_sample=True,
            )

        output = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0]

        # Clean up
        del inputs, generated_ids
        gc.collect()

        return output.strip()

    except Exception as e:
        return f"Error during inference: {str(e)}"

# ----------------------------
# GRADIO UI
# ----------------------------
title = "🧠 Qwen3-VL-8B Fine-tuned (Image + Text)"
description = """
Upload an image and enter a text prompt. The model will reason visually and respond.
"""

demo = gr.Interface(
    fn=chat_with_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Enter Instruction or Question", lines=3),
    ],
    outputs=gr.Textbox(label="Model Output", lines=5),
    title=title,
    description=description,
    allow_flagging="never",  # Disable flagging to reduce overhead
)

if __name__ == "__main__":
    demo.launch(show_error=True)
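
# ----------------------------
# OPTIONAL: LOCAL SMOKE TEST (sketch)
# ----------------------------
# A minimal sketch for exercising chat_with_image without the web UI, e.g. from
# a REPL after importing this module. "sample.jpg" is a hypothetical image path,
# not part of this repo.
#
#   from PIL import Image
#
#   image = Image.open("sample.jpg")
#   print(chat_with_image(image, "What is in this image?"))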
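
# ----------------------------
# OPTIONAL: PROGRAMMATIC ACCESS (sketch)
# ----------------------------
# A minimal sketch of querying the running app from another process with
# gradio_client, assuming the demo is serving on Gradio's default local URL
# (http://127.0.0.1:7860). "sample.jpg" is again a hypothetical image path.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict(
#       handle_file("sample.jpg"),   # maps to the gr.Image input
#       "Describe this image.",      # maps to the gr.Textbox input
#       api_name="/predict",         # default endpoint name for gr.Interface
#   )
#   print(result)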