# app.py
# Stable CPU-only Hugging Face Space
# Phi-3-mini + LoRA (NO bitsandbytes, NO SSR issues)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
# ─────────────────────────────────────────────
# Config
# ─────────────────────────────────────────────
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct"  # Hugging Face hub id of the base checkpoint
LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"  # LoRA adapter fine-tuned for SQL generation
MAX_NEW_TOKENS = 180  # cap on tokens generated per request
TEMPERATURE = 0.0  # NOTE(review): with DO_SAMPLE=False this is ignored (and 0.0 is invalid for sampling) — transformers may warn
DO_SAMPLE = False  # greedy decoding -> deterministic output
# ─────────────────────────────────────────────
# Load model & tokenizer (CPU SAFE)
# ─────────────────────────────────────────────
print("Loading base model on CPU...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="cpu",  # force CPU placement — this Space has no GPU
    torch_dtype=torch.float32,  # full precision: avoids bitsandbytes/quantization entirely
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(model, LORA_PATH)
print("Merging LoRA weights...")
# Fold the adapter weights into the base model and drop the PEFT wrapper,
# so inference runs on a plain transformers model with no PEFT overhead.
model = model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model.eval()  # inference mode: disables dropout etc.
print("Model & tokenizer loaded successfully")
# βββββββββββββββββββββββββββββββββββββββββββββ
# Inference
# βββββββββββββββββββββββββββββββββββββββββββββ
def generate_sql(question: str) -> str:
    """Generate a SQL answer for *question* with the merged Phi-3 model.

    Args:
        question: Natural-language SQL question from the UI. May be empty
            or whitespace-only, in which case no generation is attempted.

    Returns:
        The model's decoded response (stripped), ``"(empty response)"`` if
        the model produced nothing, or a prompt-for-input message when the
        question is empty.
    """
    if not question or not question.strip():
        return "Please enter a SQL-related question."
    messages = [
        {"role": "user", "content": question.strip()}
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    # Only pass temperature when sampling is enabled: transformers warns on
    # (and rejects temperature=0.0 for) sampling args under greedy decoding.
    sample_kwargs = {"temperature": TEMPERATURE} if DO_SAMPLE else {}
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=DO_SAMPLE,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
            **sample_kwargs,
        )
    # BUG FIX: decode only the newly generated tokens. Previously the whole
    # sequence was decoded with skip_special_tokens=True, which strips the
    # role markers — so the cleanup loop below never matched and the user's
    # prompt was echoed back as part of the response.
    new_tokens = output_ids[0][input_ids.shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    # Defensive cleanup in case any chat artifacts survive decoding.
    for token in ["<|assistant|>", "<|user|>", "<|end|>"]:
        if token in response:
            response = response.split(token)[-1]
    return response.strip() or "(empty response)"
# ─────────────────────────────────────────────
# Gradio UI
# ─────────────────────────────────────────────
# Single-turn text-in / text-out interface wired to generate_sql.
demo = gr.Interface(
    fn=generate_sql,
    inputs=gr.Textbox(
        label="SQL Question",
        placeholder="Find duplicate emails in users table",
        lines=3,
    ),
    outputs=gr.Textbox(
        label="Generated SQL",
        lines=8,
    ),
    # NOTE(review): "β" below looks like a mis-encoded dash/en-dash — confirm
    # and fix the source encoding (runtime string, left untouched here).
    title="SQL Chat β Phi-3-mini (CPU)",
    description=(
        "CPU-only Hugging Face Space.\n"
        "First response may take 60β180 seconds. "
        "Subsequent requests are faster."
    ),
    examples=[
        ["Find duplicate emails in users table"],
        ["Top 5 highest paid employees"],
        ["Count orders per customer last month"],
        ["Delete duplicate rows based on email"],
    ],
    # Don't pre-run examples at startup — each would cost a full CPU generation.
    cache_examples=False,
)
# ─────────────────────────────────────────────
# Launch
# ─────────────────────────────────────────────
if __name__ == "__main__":
    print("Launching Gradio interface...")
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces so the Space proxy can reach it
        ssr_mode=False,  # important: avoids asyncio FD bug
        show_error=True,  # surface tracebacks in the UI for easier debugging
    )
|