"""Gradio chat UI serving the Ved-Code-7B model on a ZeroGPU Space."""

# `spaces` is required for ZeroGPU. It is imported first so it can hook CUDA
# before any GPU-aware package (torch, unsloth) gets a chance to initialise it.
import spaces

import gradio as gr
import torch
from unsloth import FastLanguageModel

model_id = "anupbth1/Ved-Code-7B"

# Context window requested when loading the model.
MAX_SEQ_LENGTH = 2048
# Upper bound on the number of tokens generated per reply.
MAX_NEW_TOKENS = 512

# Load the model in 4-bit to save memory.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)


# Decorator that grants ZeroGPU access while the function runs.
@spaces.GPU(duration=60)
def generate(message, history):
    """Generate the assistant's reply to a single user message.

    Args:
        message: The latest user message typed into the chat box.
        history: Previous turns supplied by ``gr.ChatInterface``. It is
            accepted to satisfy the callback signature but is not used:
            every request is answered as a fresh single-turn conversation.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        stripped), with special tokens removed.
    """
    messages = [{"role": "user", "content": message}]

    # return_dict=True yields a mapping (input_ids + attention_mask). Without
    # it a bare tensor may be returned, and `model.generate(**inputs)` would
    # fail because a tensor cannot be unpacked as keyword arguments.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to("cuda")

    # Number of prompt tokens; everything after this offset is new output.
    prompt_length = inputs["input_ids"].shape[-1]

    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    return tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    )


# UI layout.
demo = gr.ChatInterface(
    generate,
    title="Ved-Code-7B 🚀",
    description="Custom Coding Assistant by anupbth1",
)
demo.launch()