FractalAIR committed on
Commit 9b6a25b · verified · 1 Parent(s): f50673c

Update app.py

Files changed (1):
  1. app.py +174 -117
app.py CHANGED
@@ -1,145 +1,202 @@
-import gradio as gr
 import spaces
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch

-MODEL_ID = "FractalAIResearch/Fathom-R1-14B"
-
-# Load model and tokenizer OUTSIDE the GPU function (following official docs)
-print("Loading model and tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    trust_remote_code=True
-)

 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token

-# Move to GPU (following official docs pattern)
-model.to('cuda')
-print("Model loaded and moved to GPU")

-@spaces.GPU
-def generate_response(message, history, max_tokens, temperature):
-    try:
-        # Simple prompt format
-        prompt = f"User: {message}\nAssistant:"
-
-        # Tokenize
-        inputs = tokenizer(prompt, return_tensors="pt")
-        inputs = {k: v.to('cuda') for k, v in inputs.items()}

-        # Generate
-        with torch.no_grad():
             outputs = model.generate(
-                **inputs,
-                max_new_tokens=max_tokens,
                 temperature=temperature,
                 do_sample=True,
                 pad_token_id=tokenizer.eos_token_id,
-                eos_token_id=tokenizer.eos_token_id,
             )
-
-        # Decode response
-        response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
-
-        # Update history
-        history.append([message, response])
-        return history, history, ""
-
-    except Exception as e:
-        error_msg = f"Error: {str(e)}"
-        history.append([message, error_msg])
-        return history, history, ""
-
-# Create Gradio interface
-with gr.Blocks(title="Fathom R1 14B Chatbot") as demo:
-    gr.HTML("<h1>🤖 Fathom R1 14B Chatbot</h1>")
-
-    with gr.Row():
-        with gr.Column(scale=3):
-            chatbot = gr.Chatbot(height=500, label="Conversation")

-            with gr.Row():
-                msg = gr.Textbox(
-                    placeholder="Type your message here...",
-                    label="Message",
-                    lines=3,
-                    scale=4
-                )
-                send_btn = gr.Button("Send", variant="primary", scale=1)

-            clear_btn = gr.Button("Clear Chat")

-        with gr.Column(scale=1):
-            gr.Markdown("### Settings")
             max_tokens = gr.Slider(
-                minimum=50,
-                maximum=1024,
-                value=256,
-                step=50,
                 label="Max Tokens"
             )
             temperature = gr.Slider(
-                minimum=0.1,
-                maximum=2.0,
-                value=0.7,
-                step=0.1,
                 label="Temperature"
             )
-
-            gr.Markdown("### Examples")
-            gr.Examples(
-                examples=[
-                    "Solve: 2x + 5 = 15",
-                    "Explain quantum mechanics simply",
-                    "What is the derivative of x²?",
-                ],
-                inputs=msg
             )

-    # Chat history state
-    history = gr.State([])
-
-    # Event handlers
-    def user_submit(message, hist):
-        return hist + [[message, None]], hist + [[message, None]], ""
-
-    def bot_respond(hist, max_tok, temp):
-        if hist and hist[-1][1] is None:
-            message = hist[-1][0]
-            _, updated_hist, _ = generate_response(message, hist[:-1], max_tok, temp)
-            return updated_hist, updated_hist
-        return hist, hist
-
-    # Submit message
-    msg.submit(
-        user_submit,
-        [msg, history],
-        [chatbot, history, msg]
-    ).then(
-        bot_respond,
-        [history, max_tokens, temperature],
-        [chatbot, history]
-    )
-
-    send_btn.click(
-        user_submit,
-        [msg, history],
-        [chatbot, history, msg]
-    ).then(
-        bot_respond,
-        [history, max_tokens, temperature],
-        [chatbot, history]
-    )
-
-    # Clear chat
-    clear_btn.click(
-        lambda: ([], []),
-        outputs=[chatbot, history]
-    )

 if __name__ == "__main__":
     demo.launch()
 
 
 import spaces
+import gradio as gr
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import time

+# Load model and tokenizer
+model_name = "FractalAIResearch/Fathom-R1-14B"

+# Initialize tokenizer (can be done outside GPU decorator)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token

+# Global model variable
+model = None

+def load_model():
+    """Load model on GPU"""
+    global model
+    if model is None:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+    return model
+
+@spaces.GPU  #(duration=120) # Allow up to 2 minutes for generation
+def generate_response(message, history, max_tokens=1024, temperature=0.7, top_p=0.9):
+    """Generate response using Fathom-R1-14B"""
+
+    # Load model on GPU
+    model = load_model()
+
+    # Format conversation history
+    conversation = []
+    for exchange in history:
+        if exchange['role'] == 'user':
+            conversation.append(f"User: {exchange['content']}")
+        else:
+            conversation.append(f"Assistant: {exchange['content']}")
+
+    # Add current message
+    conversation.append(f"User: {message}")
+    conversation.append("Assistant:")
+
+    # Create prompt
+    prompt = "\n".join(conversation)
+
+    # Tokenize
+    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
+
+    # Generate with streaming
+    with torch.no_grad():
+        streamer_output = ""

+        # Generate tokens one by one for streaming effect
+        for _ in range(max_tokens):
             outputs = model.generate(
+                inputs,
+                max_new_tokens=1,
                 temperature=temperature,
+                top_p=top_p,
                 do_sample=True,
                 pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id
             )

+            # Get new token
+            new_token = outputs[0, -1:]
+            new_text = tokenizer.decode(new_token, skip_special_tokens=True)
+
+            # Check for end of sequence
+            if new_token.item() == tokenizer.eos_token_id:
+                break
+
+            streamer_output += new_text
+            inputs = outputs

+            # Yield partial response for streaming
+            yield streamer_output

+            # Small delay for streaming effect
+            time.sleep(0.05)
+
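Note on the streaming loop above: each iteration calls model.generate with max_new_tokens=1 and feeds the full output back in as the next input, so the whole growing sequence is re-processed on every step and generation cost grows quadratically with output length. A minimal sketch of the library's built-in alternative, transformers' TextIteratorStreamer, is shown below; the stream_reply helper name is hypothetical, and model/tokenizer are assumed to be loaded as in this commit.

# Sketch: stream from a single generate() call, which reuses the KV cache
# instead of re-running the prompt for every token. stream_reply is a
# hypothetical helper; model and tokenizer as loaded above.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(prompt, max_tokens=1024, temperature=0.7, top_p=0.9):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        streamer=streamer,
    )
    # generate() blocks, so run it in a background thread and consume
    # decoded text chunks from the streamer as they become available
    Thread(target=model.generate, kwargs=kwargs).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial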
+# Alternative non-streaming version for faster response
+@spaces.GPU(duration=60)
+def generate_response_fast(message, history, max_tokens=1024, temperature=0.7, top_p=0.9):
+    """Generate response quickly without streaming"""
+
+    # Load model on GPU
+    model = load_model()
+
+    # Format conversation history
+    conversation = []
+    for exchange in history:
+        if exchange['role'] == 'user':
+            conversation.append(f"User: {exchange['content']}")
+        else:
+            conversation.append(f"Assistant: {exchange['content']}")
+
+    # Add current message
+    conversation.append(f"User: {message}")
+    conversation.append("Assistant:")
+
+    # Create prompt
+    prompt = "\n".join(conversation)
+
+    # Tokenize
+    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
+
+    # Generate response
+    with torch.no_grad():
+        outputs = model.generate(
+            inputs,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id
+        )
+
+    # Decode response
+    response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
+    return response.strip()
+
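Both generation functions build the prompt by hand as "User: ... / Assistant:" lines, which may not match the format the model was fine-tuned on. If the Fathom-R1-14B tokenizer ships a chat template (not verified here), tokenizer.apply_chat_template would produce the trained format directly from the type="messages" history, as in this sketch:

# Sketch, assuming the tokenizer ships a chat template (not verified for
# this checkpoint): build the prompt from the type="messages" history
# instead of hand-rolled "User:/Assistant:" lines.
messages = history + [{"role": "user", "content": message}]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # append the assistant-turn marker
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)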
+# Create Gradio interface
+def create_interface():
+    with gr.Blocks(title="Fathom-R1-14B Chatbot") as demo:
+        gr.Markdown("""
+        # 🧠 Fathom-R1-14B Reasoning Chatbot
+
+        Powered by **FractalAI Research's Fathom-R1-14B** - a 14B parameter model optimized for mathematical and scientific reasoning tasks.
+
+        This model excels at:
+        - Complex mathematical problems
+        - Scientific reasoning
+        - Step-by-step problem solving
+        - Logical analysis
+        """)
+
+        with gr.Tab("Streaming Chat"):
+            chat_streaming = gr.ChatInterface(
+                fn=generate_response,
+                type="messages",
+                title="Streaming Response",
+                description="Get streaming responses (slower but shows progress)",
+                examples=[
+                    "Solve this math problem: If a train travels 120 km in 2 hours, what's its average speed?",
+                    "Explain the concept of photosynthesis step by step",
+                    "What is the derivative of x^3 + 2x^2 - 5x + 3?",
+                    "How do you calculate the area of a circle with radius 7?"
+                ]
+            )
+
+        with gr.Tab("Fast Chat"):
+            chat_fast = gr.ChatInterface(
+                fn=generate_response_fast,
+                type="messages",
+                title="Quick Response",
+                description="Get faster responses without streaming",
+                examples=[
+                    "What is 15% of 240?",
+                    "Explain Newton's first law of motion",
+                    "How do you solve quadratic equations?",
+                    "What is the Pythagorean theorem?"
+                ]
+            )
+
+        with gr.Tab("Settings"):
+            gr.Markdown("### Generation Parameters")
             max_tokens = gr.Slider(
+                minimum=64,
+                maximum=2048,
+                value=1024,
+                step=64,
                 label="Max Tokens"
             )
             temperature = gr.Slider(
+                minimum=0.1,
+                maximum=2.0,
+                value=0.7,
+                step=0.1,
                 label="Temperature"
             )
+            top_p = gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                value=0.9,
+                step=0.05,
+                label="Top P"
             )

+    return demo

+# Launch the interface
 if __name__ == "__main__":
+    demo = create_interface()
     demo.launch()
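Worth noting: the sliders in the Settings tab are created after both gr.ChatInterface blocks and are never passed to them, so generation always runs with the function defaults (max_tokens=1024, temperature=0.7, top_p=0.9). A hedged sketch of one way to wire them up, using ChatInterface's additional_inputs parameter, which passes the slider values as extra arguments to fn after (message, history); the sliders would then render under the chat rather than in a separate tab:

# Sketch: pass generation controls straight into the chat so slider
# values actually reach generate_response(message, history, max_tokens,
# temperature, top_p). ChatInterface renders additional_inputs in an
# accordion below the chat box.
chat_streaming = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    additional_inputs=[
        gr.Slider(64, 2048, value=1024, step=64, label="Max Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top P"),
    ],
)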