import gradio as gr import torch from transformers import AutoModelForCausalLM, AutoTokenizer from peft import PeftModel # 1. Load the Base Model and your Adapters model_id = "Qwen/Qwen2.5-Coder-7B-Instruct" # Base model adapter_id = "SALEETAI/coding-agent-qwen-sft" # Your trained adapters print("Loading model... this may take a few minutes on CPU.") tokenizer = AutoTokenizer.from_pretrained(model_id) # Load base model in 8-bit to save RAM (CPU friendly) base_model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.float32, device_map="cpu" ) # Merge your trained SFT weights model = PeftModel.from_pretrained(base_model, adapter_id) print("Model Loaded!") def chat(message, history): # Prepare the prompt inputs = tokenizer(message, return_tensors="pt").to("cpu") # Generate with torch.no_grad(): outputs = model.generate(**inputs, max_new_tokens=200) response = tokenizer.decode(outputs[0], skip_special_tokens=True) # Remove the prompt from the response return response.replace(message, "").strip() # Build UI demo = gr.ChatInterface(fn=chat, title="Coding Agent (CPU Mode)") if __name__ == "__main__": demo.launch()