import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
# Hugging Face Hub id of the fine-tuned Qwen 2.5 (7B) math model this app serves.
MODEL_ID = "DimasMP3/qwen2.5-math-finetuned-7b"
print(f"System: Loading model {MODEL_ID} on CPU...")
# Tokenizer and model are loaded once at import time and shared by predict().
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # full precision: CPU-only deployment, no fp16/bf16 assumed
    low_cpu_mem_usage=True      # stream weights in to reduce peak RAM during load
)
print("System: Model loaded!")
def format_prompt(user_query):
    """Wrap *user_query* in the Alpaca-style instruction template the model expects.

    Returns the prompt string ending with the ``### Response:`` header (plus a
    trailing newline) so the model continues with the solution.
    """
    preamble = ("Below is an instruction that describes a task. "
                "Write a response that appropriately completes the request.")
    segments = [
        preamble,
        "### Instruction:",
        "Solve the following math problem step-by-step:",
        user_query,
        "### Response:",
        "",  # keeps the trailing newline after the Response header
    ]
    return "\n".join(segments)
def predict(message, history):
    """Stream a step-by-step solution for *message* (Gradio ChatInterface callback).

    Args:
        message: The user's math question.
        history: Prior chat turns supplied by ChatInterface; unused — the
            prompt is rebuilt from the current message only.

    Yields:
        The accumulated response text, growing as each new token arrives.
    """
    prompt = format_prompt(message)
    inputs = tokenizer([prompt], return_tensors="pt")
    # The streamer hands decoded text to this thread as generate() produces
    # tokens on a worker thread; the timeout guards against a stalled
    # generation on a slow CPU.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
        timeout=60.0
    )
    generation_kwargs = dict(
        inputs,  # unpacks input_ids / attention_mask as generate() kwargs
        streamer=streamer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.3,  # low temperature keeps math derivations focused
        top_p=0.9,
        repetition_penalty=1.1
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    try:
        partial_text = ""
        for new_text in streamer:
            partial_text += new_text
            yield partial_text
    finally:
        # Fix: the original never joined the worker thread, leaking it (and
        # silently dropping any exception it raised) when the client
        # disconnected mid-stream or the streamer timed out. Generation is
        # bounded by max_new_tokens, so the join terminates.
        thread.join()
# Chat UI: each user message is routed through predict(), which streams the
# model's answer back token-by-token.
demo = gr.ChatInterface(
    fn=predict,
    title="Sultan Math AI Solver (CPU Mode)",
    description="Qwen 2.5 (7B) running on CPU. Might be slow!",
    examples=[
        "Solve 3x + 10 = 25",
    ],
    cache_examples=False,  # caching would run the (slow) CPU model at startup
)
if __name__ == "__main__":
    demo.launch()