Update app.py
Browse files
app.py
CHANGED
|
@@ -12,8 +12,9 @@ print(f"Loading {MODEL_ID} to CPU...")
|
|
| 12 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 13 |
model = AutoModelForCausalLM.from_pretrained(
|
| 14 |
MODEL_ID,
|
| 15 |
-
dtype=
|
| 16 |
-
device_map="cpu"
|
|
|
|
| 17 |
)
|
| 18 |
|
| 19 |
def get_system_stats():
|
|
@@ -22,11 +23,22 @@ def get_system_stats():
|
|
| 22 |
return f"Available RAM: {available_gb:.2f} GB"
|
| 23 |
|
| 24 |
def chat(history, system_prompt, temp, top_p, max_tokens, rep_penalty):
|
| 25 |
-
#
|
| 26 |
-
messages = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
for msg in history:
|
| 28 |
-
messages.append(
|
|
|
|
|
|
|
|
|
|
| 29 |
|
|
|
|
|
|
|
| 30 |
model_inputs = tokenizer.apply_chat_template(
|
| 31 |
messages,
|
| 32 |
tokenize=True,
|
|
@@ -40,11 +52,11 @@ def chat(history, system_prompt, temp, top_p, max_tokens, rep_penalty):
|
|
| 40 |
generation_kwargs = dict(
|
| 41 |
**model_inputs,
|
| 42 |
streamer=streamer,
|
| 43 |
-
max_new_tokens=max_tokens,
|
| 44 |
do_sample=True if temp > 0 else False,
|
| 45 |
-
temperature=temp,
|
| 46 |
-
top_p=top_p,
|
| 47 |
-
repetition_penalty=rep_penalty,
|
| 48 |
)
|
| 49 |
|
| 50 |
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
|
@@ -62,40 +74,27 @@ def chat(history, system_prompt, temp, top_p, max_tokens, rep_penalty):
|
|
| 62 |
stats = f"**Stats:** {tps:.2f} tokens/sec | {get_system_stats()}"
|
| 63 |
yield generated_text, stats
|
| 64 |
|
| 65 |
-
with gr.Blocks(
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
gr.Markdown("### 🛠️ ML Engineer Controls")
|
| 69 |
-
|
| 70 |
system_input = gr.Textbox(
|
| 71 |
-
value="You are
|
| 72 |
label="System Prompt",
|
| 73 |
-
lines=
|
| 74 |
)
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
rep_penalty_slider = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1, label="Repetition Penalty")
|
| 80 |
-
|
| 81 |
-
with gr.Accordion("Response Limits", open=False):
|
| 82 |
-
max_tokens_slider = gr.Slider(minimum=64, maximum=2048, value=512, step=64, label="Max New Tokens")
|
| 83 |
-
|
| 84 |
gr.Markdown("---")
|
| 85 |
stats_output = gr.Markdown("Stats: System Ready")
|
| 86 |
|
| 87 |
-
# -
|
| 88 |
-
gr.
|
| 89 |
-
|
| 90 |
-
chatbot = gr.Chatbot(label="Qwen 0.8B (CPU)")
|
| 91 |
|
| 92 |
with gr.Row():
|
| 93 |
-
msg = gr.Textbox(
|
| 94 |
-
|
| 95 |
-
label="Input",
|
| 96 |
-
scale=4
|
| 97 |
-
)
|
| 98 |
-
clear = gr.Button("🗑️", scale=1)
|
| 99 |
|
| 100 |
def user_action(user_message, history):
|
| 101 |
if history is None: history = []
|
|
@@ -104,22 +103,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 104 |
|
| 105 |
def bot_action(history, sys_prompt, temp, top_p, max_t, rep_p):
|
| 106 |
history.append({"role": "assistant", "content": ""})
|
| 107 |
-
|
| 108 |
-
# Pulling all settings into the chat function
|
| 109 |
for partial_text, stats in chat(history[:-1], sys_prompt, temp, top_p, max_t, rep_p):
|
| 110 |
history[-1]["content"] = partial_text
|
| 111 |
yield history, stats
|
| 112 |
|
| 113 |
-
|
| 114 |
-
msg.submit(
|
| 115 |
-
user_action, [msg, chatbot], [msg, chatbot], queue=False
|
| 116 |
-
).then(
|
| 117 |
bot_action,
|
| 118 |
[chatbot, system_input, temp_slider, top_p_slider, max_tokens_slider, rep_penalty_slider],
|
| 119 |
[chatbot, stats_output]
|
| 120 |
)
|
| 121 |
-
|
| 122 |
clear.click(lambda: [], None, chatbot, queue=False)
|
| 123 |
|
| 124 |
if __name__ == "__main__":
|
| 125 |
-
|
|
|
|
|
|
| 12 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 13 |
model = AutoModelForCausalLM.from_pretrained(
|
| 14 |
MODEL_ID,
|
| 15 |
+
dtype="auto", # Recommended by Phi-4 README
|
| 16 |
+
device_map="cpu",
|
| 17 |
+
trust_remote_code=True
|
| 18 |
)
|
| 19 |
|
| 20 |
def get_system_stats():
|
|
|
|
| 23 |
return f"Available RAM: {available_gb:.2f} GB"
|
| 24 |
|
| 25 |
def chat(history, system_prompt, temp, top_p, max_tokens, rep_penalty):
|
| 26 |
+
# Phi-4 requires a very specific list format
|
| 27 |
+
messages = []
|
| 28 |
+
|
| 29 |
+
# 1. Add System Prompt
|
| 30 |
+
if system_prompt:
|
| 31 |
+
messages.append({"role": "system", "content": str(system_prompt)})
|
| 32 |
+
|
| 33 |
+
# 2. Add History (ensuring all content is strictly string type)
|
| 34 |
for msg in history:
|
| 35 |
+
messages.append({
|
| 36 |
+
"role": msg["role"],
|
| 37 |
+
"content": str(msg["content"])
|
| 38 |
+
})
|
| 39 |
|
| 40 |
+
# Phi-4 templates in transformers 4.49.0+ are strict about 'return_full_text'
|
| 41 |
+
# and the jinja rendering. We use the tokenizer's built-in template logic:
|
| 42 |
model_inputs = tokenizer.apply_chat_template(
|
| 43 |
messages,
|
| 44 |
tokenize=True,
|
|
|
|
| 52 |
generation_kwargs = dict(
|
| 53 |
**model_inputs,
|
| 54 |
streamer=streamer,
|
| 55 |
+
max_new_tokens=int(max_tokens),
|
| 56 |
do_sample=True if temp > 0 else False,
|
| 57 |
+
temperature=float(temp) if temp > 0 else 1.0, # Avoid 0.0 temp error in some torch versions
|
| 58 |
+
top_p=float(top_p),
|
| 59 |
+
repetition_penalty=float(rep_penalty),
|
| 60 |
)
|
| 61 |
|
| 62 |
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
|
|
|
| 74 |
stats = f"**Stats:** {tps:.2f} tokens/sec | {get_system_stats()}"
|
| 75 |
yield generated_text, stats
|
| 76 |
|
| 77 |
+
with gr.Blocks() as demo:
|
| 78 |
+
with gr.Sidebar(label="ML Settings", open=False):
|
| 79 |
+
gr.Markdown("### 🛠️ Persona & Engine")
|
|
|
|
|
|
|
| 80 |
system_input = gr.Textbox(
|
| 81 |
+
value="You are an individual named Arudra. You follow instructions strictly.",
|
| 82 |
label="System Prompt",
|
| 83 |
+
lines=4
|
| 84 |
)
|
| 85 |
+
temp_slider = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
|
| 86 |
+
top_p_slider = gr.Slider(0.0, 1.0, 0.9, step=0.05, label="Top-P")
|
| 87 |
+
rep_penalty_slider = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
|
| 88 |
+
max_tokens_slider = gr.Slider(64, 2048, 512, step=64, label="Max Tokens")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
gr.Markdown("---")
|
| 90 |
stats_output = gr.Markdown("Stats: System Ready")
|
| 91 |
|
| 92 |
+
gr.Markdown("# Phi-4 Mini Engineering Console")
|
| 93 |
+
chatbot = gr.Chatbot(label="Phi-4 Mini")
|
|
|
|
|
|
|
| 94 |
|
| 95 |
with gr.Row():
|
| 96 |
+
msg = gr.Textbox(placeholder="Enter message...", scale=4, label="Input")
|
| 97 |
+
clear = gr.Button("Clear", scale=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
def user_action(user_message, history):
|
| 100 |
if history is None: history = []
|
|
|
|
| 103 |
|
| 104 |
def bot_action(history, sys_prompt, temp, top_p, max_t, rep_p):
|
| 105 |
history.append({"role": "assistant", "content": ""})
|
| 106 |
+
# History minus the empty slot we just added
|
|
|
|
| 107 |
for partial_text, stats in chat(history[:-1], sys_prompt, temp, top_p, max_t, rep_p):
|
| 108 |
history[-1]["content"] = partial_text
|
| 109 |
yield history, stats
|
| 110 |
|
| 111 |
+
msg.submit(user_action, [msg, chatbot], [msg, chatbot], queue=False).then(
|
|
|
|
|
|
|
|
|
|
| 112 |
bot_action,
|
| 113 |
[chatbot, system_input, temp_slider, top_p_slider, max_tokens_slider, rep_penalty_slider],
|
| 114 |
[chatbot, stats_output]
|
| 115 |
)
|
|
|
|
| 116 |
clear.click(lambda: [], None, chatbot, queue=False)
|
| 117 |
|
| 118 |
if __name__ == "__main__":
|
| 119 |
+
# Theme is passed here for Gradio 6 compatibility
|
| 120 |
+
demo.launch(theme=gr.themes.Soft())
|