File size: 2,537 Bytes
e95c2d3
 
e62bece
7f3026b
c7c0d53
7f424d1
02976e0
e62bece
7f424d1
00c8a57
e95c2d3
 
 
 
 
 
e62bece
4bc3e8b
02976e0
 
4bc3e8b
e95c2d3
 
 
 
 
 
 
 
 
 
 
 
7f3026b
e95c2d3
 
7f3026b
e95c2d3
 
8b67be0
e95c2d3
8b67be0
e95c2d3
 
02976e0
e95c2d3
 
 
7f3026b
e95c2d3
 
a2f39c6
7f3026b
4bc3e8b
e62bece
 
 
 
 
 
22df2c5
e95c2d3
 
 
e62bece
e95c2d3
7f3026b
e62bece
7f424d1
22df2c5
e95c2d3
02976e0
e95c2d3
7f3026b
e95c2d3
84031c5
e95c2d3
8b67be0
e95c2d3
 
 
84031c5
e62bece
7f3026b
e95c2d3
 
 
 
84031c5
1344c31
e95c2d3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# CPU SAFE HuggingFace Space (2026 stable)
#
# Loads a Phi-3-mini base model on CPU, applies a fine-tuned LoRA adapter,
# merges the adapter into the base weights, and serves a Gradio UI
# (defined below) that turns natural-language questions into SQL.

import warnings
warnings.filterwarnings("ignore")

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# reduce CPU overload on free tier
torch.set_num_threads(1)

# ─────────────────────────
# Config
# ─────────────────────────
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct"   # HF Hub id of the base model
LORA_PATH  = "saadkhi/SQL_Chat_finetuned_model" # HF Hub id of the LoRA adapter

# Cap on generated tokens per request; keeps CPU latency bounded.
MAX_NEW_TOKENS = 180

print("Loading model...")

# ─────────────────────────
# Load base model
# ─────────────────────────
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="cpu",            # free-tier Space: no GPU available
    torch_dtype=torch.float32,   # fp32: CPU has no native fp16/bf16 fast path
    trust_remote_code=True,      # NOTE(review): executes model-repo code — only safe for trusted repos
    low_cpu_mem_usage=True,      # stream weights in to reduce peak RAM
)

print("Loading LoRA...")
model = PeftModel.from_pretrained(model, LORA_PATH)

print("Merging LoRA...")
# Fold adapter weights into the base model so inference runs without the
# PEFT wrapper's per-layer adapter overhead.
model = model.merge_and_unload()

# Tokenizer comes from the base model repo (adapter repo ships no tokenizer here).
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

model.eval()
print("Model ready")

# ─────────────────────────
# Inference
# ─────────────────────────
def generate_sql(question):
    """Generate a SQL answer for *question* with the merged Phi-3 model.

    Args:
        question: Natural-language question from the UI textbox; may be
            empty/None, in which case a usage prompt is returned instead.

    Returns:
        str: The model's greedy-decoded completion (chat-template marker
        tokens stripped), or a short prompt string for empty input.
    """
    if not question:
        return "Enter a SQL question."

    messages = [{"role": "user", "content": question}]

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        # Greedy decoding. `temperature` is intentionally NOT passed:
        # it is ignored when do_sample=False, and temperature=0 is
        # rejected (must be > 0) by recent transformers releases.
        output = model.generate(
            input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens — decoding output[0] in full
    # would echo the prompt (the user's question) back into the answer.
    generated = output[0][input_ids.shape[-1]:]
    text = tokenizer.decode(generated, skip_special_tokens=True)

    # Defensive cleanup: chat-template markers occasionally survive decoding.
    for t in ["<|assistant|>", "<|user|>", "<|end|>"]:
        text = text.replace(t, "")

    return text.strip()

# ─────────────────────────
# UI
# ─────────────────────────
# Single-function UI: one textbox in, one textbox out, wired to generate_sql.
demo = gr.Interface(
    fn=generate_sql,
    inputs=gr.Textbox(lines=3, label="SQL Question"),
    outputs=gr.Textbox(lines=8, label="Generated SQL"),
    title="SQL Chat – Phi-3 mini",
    description="Free CPU Space. First response may take ~90s",
    cache_examples=False,  # no examples defined; avoid caching work at startup
)

# Blocking call: starts the Gradio server (Spaces expects this at module level).
demo.launch()