DimasMP3 committed on
Commit
cb9e216
·
1 Parent(s): cc1cfc8

refactor: Configure model for CPU-only execution by removing 4-bit quantization, setting float32 dtype, and updating UI descriptions and generation parameters.

Browse files
Files changed (1) hide show
  1. app.py +9 -20
app.py CHANGED
@@ -1,26 +1,17 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
4
  from threading import Thread
5
 
6
  MODEL_ID = "DimasMP3/qwen2.5-math-finetuned-7b"
7
 
8
- bnb_config = BitsAndBytesConfig(
9
- load_in_4bit=True,
10
- bnb_4bit_use_double_quant=True,
11
- bnb_4bit_quant_type="nf4",
12
- bnb_4bit_compute_dtype=torch.float16
13
- )
14
-
15
- print(f"System: Loading model {MODEL_ID}...")
16
 
17
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
18
 
19
- # 2. Load Model dengan Config Baru
20
  model = AutoModelForCausalLM.from_pretrained(
21
  MODEL_ID,
22
- quantization_config=bnb_config,
23
- device_map="auto",
24
  low_cpu_mem_usage=True
25
  )
26
 
@@ -38,19 +29,19 @@ Solve the following math problem step-by-step:
38
 
39
  def predict(message, history):
40
  prompt = format_prompt(message)
41
- inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
42
 
43
  streamer = TextIteratorStreamer(
44
  tokenizer,
45
  skip_prompt=True,
46
  skip_special_tokens=True,
47
- timeout=10.0
48
  )
49
 
50
  generation_kwargs = dict(
51
  inputs,
52
  streamer=streamer,
53
- max_new_tokens=1024,
54
  do_sample=True,
55
  temperature=0.3,
56
  top_p=0.9,
@@ -67,12 +58,10 @@ def predict(message, history):
67
 
68
  demo = gr.ChatInterface(
69
  fn=predict,
70
- title="LLM Math AI Solver",
71
- description="Qwen 2.5 (7B Parameters) Fine-Tuned Model for Mathematical Reasoning",
72
  examples=[
73
- "Solve the equation 3x + 10 = 25",
74
- "Calculate the derivative of f(x) = 4x^3 - 2x",
75
- "A triangle has a base of 10cm and a height of 5cm, what is its area?"
76
  ],
77
  cache_examples=False,
78
  )
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
4
  from threading import Thread
5
 
6
  MODEL_ID = "DimasMP3/qwen2.5-math-finetuned-7b"
7
 
8
+ print(f"System: Loading model {MODEL_ID} on CPU...")
 
 
 
 
 
 
 
9
 
10
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
11
 
 
12
  model = AutoModelForCausalLM.from_pretrained(
13
  MODEL_ID,
14
+ torch_dtype=torch.float32,
 
15
  low_cpu_mem_usage=True
16
  )
17
 
 
29
 
30
  def predict(message, history):
31
  prompt = format_prompt(message)
32
+ inputs = tokenizer([prompt], return_tensors="pt")
33
 
34
  streamer = TextIteratorStreamer(
35
  tokenizer,
36
  skip_prompt=True,
37
  skip_special_tokens=True,
38
+ timeout=60.0
39
  )
40
 
41
  generation_kwargs = dict(
42
  inputs,
43
  streamer=streamer,
44
+ max_new_tokens=512,
45
  do_sample=True,
46
  temperature=0.3,
47
  top_p=0.9,
 
58
 
59
  demo = gr.ChatInterface(
60
  fn=predict,
61
+ title="Sultan Math AI Solver (CPU Mode)",
62
+ description="Qwen 2.5 (7B) running on CPU. Might be slow!",
63
  examples=[
64
+ "Solve 3x + 10 = 25",
 
 
65
  ],
66
  cache_examples=False,
67
  )