# Author: DimasMP3
# refactor: Configure model for CPU-only execution by removing 4-bit
# quantization, setting float32 dtype, and updating UI descriptions and
# generation parameters.
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# Hugging Face Hub repository of the fine-tuned Qwen 2.5 math model.
MODEL_ID = "DimasMP3/qwen2.5-math-finetuned-7b"

print(f"System: Loading model {MODEL_ID} on CPU...")

# Loaded once at import time and shared by every chat request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # full precision for CPU-only execution (no 4-bit quantization)
    low_cpu_mem_usage=True  # stream weights in to keep peak RAM down while loading
)
print("System: Model loaded!")
def format_prompt(user_query):
    """Wrap *user_query* in the instruction-tuned prompt template.

    Returns the full Alpaca-style prompt string the model was fine-tuned
    on, ending right after the "### Response:" marker so generation
    continues from there.
    """
    template = (
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n"
        "### Instruction:\n"
        "Solve the following math problem step-by-step:\n"
        "{query}\n"
        "### Response:\n"
    )
    return template.format(query=user_query)
def predict(message, history):
    """Stream a step-by-step solution for the user's math problem.

    Args:
        message: The user's question from the chat box.
        history: Prior chat turns supplied by gr.ChatInterface (unused;
            each question is answered independently).

    Yields:
        The progressively accumulated response text, so the UI updates
        as tokens are produced.
    """
    prompt = format_prompt(message)
    inputs = tokenizer([prompt], return_tensors="pt")

    # The streamer hands decoded text fragments to this thread as the
    # worker thread generates them.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,  # don't echo the instruction template back
        skip_special_tokens=True,
        timeout=60.0  # raises in the consumer if the CPU model stalls
    )
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.3,  # low temperature: keep math output mostly deterministic
        top_p=0.9,
        repetition_penalty=1.1
    )

    # model.generate blocks, so it runs on a worker thread while we
    # consume the streamer here.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    try:
        partial_text = ""
        for new_text in streamer:
            partial_text += new_text
            yield partial_text
    finally:
        # Fix: the original never joined the worker, orphaning the
        # generation thread when the client disconnected mid-stream or
        # the streamer timed out. Ensure it has finished before the
        # request is considered done.
        thread.join()
# Chat UI: every submitted message is streamed through predict().
demo = gr.ChatInterface(
    fn=predict,
    title="Sultan Math AI Solver (CPU Mode)",
    description="Qwen 2.5 (7B) running on CPU. Might be slow!",
    examples=[
        "Solve 3x + 10 = 25",
    ],
    cache_examples=False,  # caching would run the 7B model on every example at startup
)

if __name__ == "__main__":
    demo.launch()