File size: 4,314 Bytes
4bc3e8b
e62bece
 
 
 
 
c7c0d53
7f424d1
02976e0
e62bece
7f424d1
00c8a57
e62bece
4bc3e8b
e62bece
 
4bc3e8b
02976e0
 
4bc3e8b
 
a2f39c6
e62bece
 
 
 
 
 
 
 
 
 
 
 
4bc3e8b
e62bece
 
8b67be0
e62bece
 
8b67be0
e62bece
02976e0
e62bece
 
a2f39c6
e62bece
 
 
 
 
 
a2f39c6
e62bece
 
 
4bc3e8b
e62bece
 
 
 
 
 
22df2c5
e62bece
 
 
 
 
 
 
 
7f424d1
22df2c5
e62bece
 
 
 
02976e0
e62bece
 
 
 
84031c5
e62bece
8b67be0
e62bece
4bc3e8b
e62bece
84031c5
e62bece
 
 
 
 
8b67be0
e62bece
 
 
8b67be0
e62bece
 
 
 
 
02976e0
e62bece
02976e0
4bc3e8b
 
e62bece
32343cc
e62bece
84031c5
1344c31
e62bece
 
 
84031c5
e62bece
9ae2c39
e62bece
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# app.py
# Stable CPU-only Hugging Face Space
# Phi-3-mini + LoRA (NO bitsandbytes, NO SSR issues)

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# ─────────────────────────────────────────────
# Config
# ─────────────────────────────────────────────
# Base model repo and the LoRA adapter trained on top of it.
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct"
LORA_PATH  = "saadkhi/SQL_Chat_finetuned_model"

# Generation settings: greedy decoding (DO_SAMPLE=False), capped at 180 new
# tokens to keep CPU latency tolerable. NOTE(review): TEMPERATURE is unused
# when do_sample is False — transformers warns if it is still passed through.
MAX_NEW_TOKENS = 180
TEMPERATURE    = 0.0
DO_SAMPLE      = False

# ─────────────────────────────────────────────
# Load model & tokenizer (CPU SAFE)
# ─────────────────────────────────────────────
# Order matters here: load base → attach LoRA → merge → tokenizer → eval.
print("Loading base model on CPU...")

# float32 on CPU: no bitsandbytes / quantization, which is the point of this
# Space (see header comment). low_cpu_mem_usage streams weights to keep peak
# RAM down during load.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="cpu",
    torch_dtype=torch.float32,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(model, LORA_PATH)

print("Merging LoRA weights...")
# Fold the adapter into the base weights so inference runs on a plain
# transformers model (no PEFT wrapper overhead per forward pass).
model = model.merge_and_unload()

# Tokenizer comes from the base repo; the adapter does not change vocab.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Inference mode: disables dropout etc.
model.eval()
print("Model & tokenizer loaded successfully")

# ─────────────────────────────────────────────
# Inference
# ─────────────────────────────────────────────
def generate_sql(question: str) -> str:
    """Generate a SQL answer for *question* with the merged Phi-3 model.

    Returns a placeholder message for blank input and "(empty response)"
    when the model produces nothing; never raises on normal user input.
    """
    if not question or not question.strip():
        return "Please enter a SQL-related question."

    messages = [
        {"role": "user", "content": question.strip()}
    ]

    # Tokenize via the Phi-3 chat template; add_generation_prompt appends
    # the assistant header so the model continues as the assistant.
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    # Only pass sampling parameters when sampling is enabled: transformers
    # warns — and newer versions reject temperature <= 0 — when temperature
    # is supplied alongside do_sample=False.
    gen_kwargs = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=DO_SAMPLE,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )
    if DO_SAMPLE:
        gen_kwargs["temperature"] = TEMPERATURE

    with torch.inference_mode():
        output_ids = model.generate(input_ids=input_ids, **gen_kwargs)

    # BUG FIX: decode only the newly generated tokens. The previous version
    # decoded the full sequence (prompt included) and then split on chat
    # marker strings — but skip_special_tokens=True had already stripped
    # those markers, so the split never matched and the user's question
    # leaked into the returned text.
    new_tokens = output_ids[0][input_ids.shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Defensive cleanup in case any chat artifacts survive decoding.
    for token in ["<|assistant|>", "<|user|>", "<|end|>"]:
        if token in response:
            response = response.split(token)[-1]

    return response.strip() or "(empty response)"

# ─────────────────────────────────────────────
# Gradio UI
# ─────────────────────────────────────────────
# Build the two text components up front, then wire them into the Interface.
_question_box = gr.Textbox(
    label="SQL Question",
    placeholder="Find duplicate emails in users table",
    lines=3,
)
_answer_box = gr.Textbox(
    label="Generated SQL",
    lines=8,
)

# Example prompts shown below the input; caching is off because generation
# on CPU at build time would be prohibitively slow.
_example_questions = [
    ["Find duplicate emails in users table"],
    ["Top 5 highest paid employees"],
    ["Count orders per customer last month"],
    ["Delete duplicate rows based on email"],
]

demo = gr.Interface(
    fn=generate_sql,
    inputs=_question_box,
    outputs=_answer_box,
    title="SQL Chat – Phi-3-mini (CPU)",
    description=(
        "CPU-only Hugging Face Space.\n"
        "First response may take 60–180 seconds. "
        "Subsequent requests are faster."
    ),
    examples=_example_questions,
    cache_examples=False,
)

# ─────────────────────────────────────────────
# Launch
# ─────────────────────────────────────────────
if __name__ == "__main__":
    print("Launching Gradio interface...")
    # Bind on all interfaces so the Space's proxy can reach the server.
    launch_options = {
        "server_name": "0.0.0.0",
        "ssr_mode": False,  # important: avoids asyncio FD bug
        "show_error": True,
    }
    demo.launch(**launch_options)