Fathom

Sleeping

App Files Files Community

FractalAIR committed on Aug 25

Commit

3b68d21

verified ·

1 Parent(s): c34c8d5

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -22

app.py CHANGED Viewed

@@ -8,36 +8,29 @@ MODEL_ID = "FractalAIResearch/Fathom-R1-14B"
 @spaces.GPU
 def chat_with_model(message, history, max_tokens, temperature):
     try:
-        print("🔥 GPU allocated, loading model...")
-        # Load model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
             torch_dtype=torch.bfloat16,
-            trust_remote_code=True
         )
-        # EXPLICITLY move model to GPU
-        model = model.cuda()
-        print(f"✅ Model loaded on device: {model.device}")
-        print(f"🔥 GPU available: {torch.cuda.is_available()}")
-        print(f"🔥 GPU device count: {torch.cuda.device_count()}")
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
         # Simple prompt format
         prompt = f"User: {message}\nAssistant:"
-        # Tokenize and move to GPU
         inputs = tokenizer(prompt, return_tensors="pt")
-        inputs = {k: v.cuda() for k, v in inputs.items()}
-        print(f"✅ Inputs moved to: {inputs['input_ids'].device}")
-        # Generate
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
@@ -51,15 +44,15 @@ def chat_with_model(message, history, max_tokens, temperature):
         # Decode response
         response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
-        print(f"✅ Generated response: {response[:100]}...")
         # Update history
         history.append([message, response])
         return history, history, ""
     except Exception as e:
-        error_msg = f"❌ Error: {str(e)}"
-        print(error_msg)
         history.append([message, error_msg])
         return history, history, ""
@@ -86,8 +79,8 @@ with gr.Blocks(title="Fathom R1 14B Chatbot") as demo:
             gr.Markdown("### Settings")
             max_tokens = gr.Slider(
                 minimum=50,
-                maximum=2048,
-                value=512,
                 step=50,
                 label="Max Tokens"
             )
@@ -103,7 +96,7 @@ with gr.Blocks(title="Fathom R1 14B Chatbot") as demo:
             gr.Examples(
                 examples=[
                     "Solve: 2x + 5 = 15",
-                    "Explain quantum mechanics simply",
                     "What is the derivative of x²?",
                 ],
                 inputs=msg

 @spaces.GPU
 def chat_with_model(message, history, max_tokens, temperature):
     try:
+        print("Loading model...")
+        # Load model and tokenizer - let ZeroGPU handle device placement
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
             torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+            device_map="auto"  # Let transformers handle GPU placement
         )
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
+        print(f"Model loaded successfully on device: {next(model.parameters()).device}")
         # Simple prompt format
         prompt = f"User: {message}\nAssistant:"
+        # Tokenize - let the model handle device placement
         inputs = tokenizer(prompt, return_tensors="pt")
+        # Generate - the model will automatically handle device placement
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
         # Decode response
         response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
         # Update history
         history.append([message, response])
         return history, history, ""
     except Exception as e:
+        error_msg = f"Error: {str(e)}"
+        print(f"Full error: {e}")
+        import traceback
+        traceback.print_exc()
         history.append([message, error_msg])
         return history, history, ""
             gr.Markdown("### Settings")
             max_tokens = gr.Slider(
                 minimum=50,
+                maximum=1024,
+                value=256,
                 step=50,
                 label="Max Tokens"
             )
             gr.Examples(
                 examples=[
                     "Solve: 2x + 5 = 15",
+                    "Explain quantum mechanics simply",
                     "What is the derivative of x²?",
                 ],
                 inputs=msg