# CPU SAFE HuggingFace Space (2026 stable)
import warnings

warnings.filterwarnings("ignore")

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# reduce CPU overload on free tier
torch.set_num_threads(1)

# ─────────────────────────
# Config
# ─────────────────────────
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct"
LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
MAX_NEW_TOKENS = 180

print("Loading model...")

# ─────────────────────────
# Load base model (CPU-only, fp32 — free-tier Spaces have no GPU)
# ─────────────────────────
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="cpu",
    torch_dtype=torch.float32,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

print("Loading LoRA...")
model = PeftModel.from_pretrained(model, LORA_PATH)

print("Merging LoRA...")
# Fold adapter weights into the base model so inference pays no PEFT
# dispatch overhead and the PeftModel wrapper can be dropped.
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model.eval()
print("Model ready")


# ─────────────────────────
# Inference
# ─────────────────────────
def generate_sql(question):
    """Generate SQL for a natural-language *question* with the merged model.

    Applies the Phi-3 chat template, runs greedy decoding, and returns
    only the newly generated continuation (the echoed prompt is sliced
    off before decoding), with any residual chat-template markers removed.

    Returns a short usage message for empty / whitespace-only input.
    """
    # Guard both None/empty and whitespace-only questions.
    if not question or not question.strip():
        return "Enter a SQL question."

    messages = [{"role": "user", "content": question}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            # Greedy decoding. NOTE: `temperature=0` was removed — it is
            # invalid (must be > 0) in recent transformers and is ignored
            # when do_sample=False anyway.
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # BUG FIX: decode only the generated tokens; decoding output[0] in
    # full would echo the user's prompt back in the answer.
    new_tokens = output[0][input_ids.shape[-1]:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # clean any chat-template artifacts that survive decoding
    for marker in ["<|assistant|>", "<|user|>", "<|end|>"]:
        text = text.replace(marker, "")

    return text.strip()


# ─────────────────────────
# UI
# ─────────────────────────
demo = gr.Interface(
    fn=generate_sql,
    inputs=gr.Textbox(lines=3, label="SQL Question"),
    outputs=gr.Textbox(lines=8, label="Generated SQL"),
    title="SQL Chat – Phi-3 mini",
    description="Free CPU Space. First response may take ~90s",
    cache_examples=False,
)

demo.launch()