Spaces:
Sleeping
Sleeping
| # app.py - ZeroGPU compatible version (NO Unsloth) | |
# The ZeroGPU helper ships as the standalone `spaces` package (it is not a
# member of `huggingface_hub`). It is listed first, following the ZeroGPU
# documentation's example.
import spaces

import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Hub identifiers: the 4-bit base checkpoint and the LoRA adapter that is
# layered on top of it.
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"

print("Loading model on CPU first... (will use GPU only during @spaces.GPU)")

# Request 4-bit loading through bitsandbytes.
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

# Load the base causal LM first, then wrap it with the fine-tuned adapter.
_base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,  # NOTE(review): executes code shipped with the checkpoint
    device_map="auto",
    quantization_config=bnb_config,
)
model = PeftModel.from_pretrained(_base_model, LORA_PATH)

# The tokenizer comes from the base checkpoint, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Inference only: switch off dropout and other training-time behaviour.
model.eval()
# ZeroGPU attaches a GPU slice only while the decorated function is running.
@spaces.GPU
def generate_sql(prompt: str) -> str:
    """Generate a reply to a natural-language SQL question.

    The question is wrapped in the tokenizer's chat template as a single user
    turn, the model produces up to 180 new tokens with greedy decoding, and
    only the newly generated text is returned.

    Args:
        prompt: The user's question as plain text.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when ``prompt`` is empty or blank.
    """
    # Nothing to answer: skip the (comparatively expensive) generation call.
    if not prompt or not prompt.strip():
        return ""

    messages = [{"role": "user", "content": prompt}]

    # return_dict=True yields both input_ids and attention_mask, so generate()
    # does not have to guess the mask when the pad token equals the EOS token.
    encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)  # follow the model's device instead of hard-coding "cuda"
    prompt_length = encoded["input_ids"].shape[-1]

    with torch.inference_mode():
        outputs = model.generate(
            **encoded,
            max_new_tokens=180,
            do_sample=False,  # greedy decoding; a temperature would be ignored
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + completion. Decode only the tokens after the
    # prompt so the user's question is not echoed back in the answer.
    completion = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    )

    # Defensive: cut at an end-of-turn marker if one survived decoding.
    return completion.split("<|end|>", 1)[0].strip()
# Single-textbox UI: the question goes to generate_sql and the returned string
# is shown as plain text.
question_input = gr.Textbox(label="Your SQL question")

demo = gr.Interface(
    title="SQL Chatbot (ZeroGPU)",
    description="Free but limited daily GPU time",
    fn=generate_sql,
    inputs=question_input,
    outputs="text",
)

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()