# app.py - ZeroGPU compatible version (NO Unsloth)
import gradio as gr
import spaces  # ← important! The ZeroGPU decorator lives in the standalone `spaces` package, not huggingface_hub
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Your model paths
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"

print("Loading model on CPU first... (will use GPU only during @spaces.GPU)")

bnb_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(model, LORA_PATH)  # attach the fine-tuned LoRA adapter
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model.eval()


@spaces.GPU  # ← requests a GPU slice only while this function runs
def generate_sql(prompt: str):
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    with torch.inference_mode():
        outputs = model.generate(
            inputs,
            max_new_tokens=180,
            do_sample=False,  # greedy decoding; don't also pass temperature=0.0 (invalid in recent transformers)
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the assistant's reply: strip Phi-3 chat markers if they survive decoding
    if "<|assistant|>" in response:
        response = response.split("<|assistant|>", 1)[-1].strip()
    return response.split("<|end|>")[0].strip()


demo = gr.Interface(
    fn=generate_sql,
    inputs=gr.Textbox(label="Your SQL question"),
    outputs="text",
    title="SQL Chatbot (ZeroGPU)",
    description="Free but limited daily GPU time",
)

if __name__ == "__main__":
    demo.launch()