DimasMP3 commited on
Commit
c6791fe
·
1 Parent(s): 4e3af76

Deploy Sultan Math AI: Add Gradio interface and model configuration

Browse files

This commit sets up the inference engine for the fine-tuned Qwen 2.5 Math model.
Key changes:
1. Enabled 4-bit quantization (bitsandbytes) for memory efficiency on a CUDA GPU such as the T4 (bitsandbytes 4-bit does not run on CPU).
2. Configured an English Alpaca-style instruction prompt for better reasoning.
3. Added threading support for streaming responses.

Files changed (2) hide show
  1. app.py +70 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import torch
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

# Hugging Face Hub ID of the fine-tuned Qwen 2.5 Math model served by this app.
MODEL_ID = "DimasMP3/qwen2.5-math-finetuned-7b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# FIX: passing `load_in_4bit=True` directly to `from_pretrained` is deprecated
# in recent transformers releases; the supported path is a BitsAndBytesConfig
# passed via `quantization_config`.
# NOTE(review): bitsandbytes 4-bit needs a CUDA GPU — on a CPU-only host this
# load will fail; confirm the target hardware.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,          # half precision for the non-quantized parts
    device_map="auto",                  # let accelerate place weights on available devices
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    ),
    low_cpu_mem_usage=True,             # stream weights in instead of a full CPU copy
)
16
+
17
def format_prompt(user_query):
    """Embed *user_query* in the Alpaca-style instruction template the
    fine-tuned model was trained on, ending at the ``### Response:`` marker
    so generation continues with the solution."""
    header = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
    )
    task = "### Instruction:\nSolve the following math problem step-by-step:"
    return f"{header}\n\n{task}\n{user_query}\n\n### Response:\n"
26
+
27
def predict(message, history):
    """Stream a step-by-step solution for *message*.

    Yields progressively longer prefixes of the model's reply so the Gradio
    chat UI renders tokens as they arrive. *history* is supplied by
    ``gr.ChatInterface`` but is not folded back into the prompt.
    """
    encoded = tokenizer([format_prompt(message)], return_tensors="pt").to(model.device)

    # The streamer hands decoded text back on this thread while generation
    # runs on a worker thread; skip_prompt drops the echoed input.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10.0
    )

    worker = Thread(
        target=model.generate,
        kwargs=dict(
            encoded,
            streamer=streamer,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.3,        # low temperature keeps math reasoning focused
            top_p=0.9,
            repetition_penalty=1.1,
        ),
    )
    worker.start()

    accumulated = ""
    for fragment in streamer:
        accumulated += fragment
        yield accumulated
55
+
56
# Sample problems shown as clickable examples under the chat box.
_EXAMPLES = [
    "Solve the equation 3x + 10 = 25",
    "Calculate the derivative of f(x) = 4x^3 - 2x",
    "A triangle has a base of 10cm and a height of 5cm, what is its area?",
]

# Chat UI wired to the streaming `predict` generator above.
demo = gr.ChatInterface(
    predict,
    title="Sultan Math AI Solver",
    description="Qwen 2.5 (7B Parameters) Fine-Tuned Model for Mathematical Reasoning",
    examples=_EXAMPLES,
    theme="soft",
    cache_examples=False,  # examples would otherwise trigger generation at startup
)

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch>=2.2.0
2
+ transformers>=4.40.0
3
+ accelerate>=0.29.0
4
+ bitsandbytes>=0.43.0
5
+ scipy
6
+ gradio>=4.0.0
7
+ sentencepiece
8
+ protobuf