Update app.py

app.py CHANGED
@@ -5,41 +5,31 @@ import torch
 
 MODEL_ID = "FractalAIResearch/Fathom-R1-14B"
 
+# Load model and tokenizer OUTSIDE the GPU function (following official docs)
+print("Loading model and tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True
+)
+
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+
+# Move to GPU (following official docs pattern)
+model.to('cuda')
+print("Model loaded and moved to GPU")
+
 @spaces.GPU
-def chat_with_model(message, history, max_tokens, temperature):
+def generate_response(message, history, max_tokens, temperature):
     try:
-        print("Loading model...")
-
-        # Load tokenizer
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-
-        # Load model WITHOUT device_map to avoid CPU placement
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            torch_dtype=torch.bfloat16,
-            trust_remote_code=True,
-            # Remove device_map="auto" - it's causing CPU placement
-        )
-
-        # AFTER loading, move to GPU explicitly
-        if torch.cuda.is_available():
-            model = model.to('cuda')
-            print(f"✅ Model moved to GPU: {next(model.parameters()).device}")
-        else:
-            print("❌ No GPU available")
-
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-
         # Simple prompt format
         prompt = f"User: {message}\nAssistant:"
 
-        # Tokenize
+        # Tokenize
         inputs = tokenizer(prompt, return_tensors="pt")
-
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-
-        print(f"✅ Inputs on device: {inputs['input_ids'].device}")
+        inputs = {k: v.to('cuda') for k, v in inputs.items()}
 
         # Generate
         with torch.no_grad():
@@ -55,17 +45,12 @@ def chat_with_model(message, history, max_tokens, temperature):
         # Decode response
         response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
 
-        print(f"✅ Generated: {response[:50]}...")
-
         # Update history
         history.append([message, response])
         return history, history, ""
 
     except Exception as e:
         error_msg = f"Error: {str(e)}"
-        print(f"Full error: {e}")
-        import traceback
-        traceback.print_exc()
         history.append([message, error_msg])
         return history, history, ""
 
@@ -125,7 +110,7 @@ with gr.Blocks(title="Fathom R1 14B Chatbot") as demo:
         def bot_respond(hist, max_tok, temp):
             if hist and hist[-1][1] is None:
                 message = hist[-1][0]
-                _, updated_hist, _ = chat_with_model(message, hist[:-1], max_tok, temp)
+                _, updated_hist, _ = generate_response(message, hist[:-1], max_tok, temp)
                 return updated_hist, updated_hist
             return hist, hist
 
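The substance of the commit is the ZeroGPU usage pattern from the Spaces documentation, which the added comments cite: load the model and tokenizer once at module import, move the model to 'cuda' there, and decorate only the inference function with @spaces.GPU so a GPU is attached just for the duration of each call. The removed code also reloaded the 14B checkpoint on every chat message and moved inputs to a `device` name that is not defined anywhere in the visible lines. A minimal self-contained sketch of the target pattern, with a small placeholder model ("gpt2") standing in for Fathom-R1-14B:

```python
# Minimal ZeroGPU sketch; an illustration, not this Space's full app.py.
# "gpt2" is a placeholder model, not FractalAIResearch/Fathom-R1-14B.
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load once at module import, outside any @spaces.GPU function
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16)
model.to('cuda')  # safe at import time on ZeroGPU; the device is attached lazily

@spaces.GPU  # a ZeroGPU device is allocated only while this function runs
def generate(prompt: str) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
```

Loading at module level also means the checkpoint is read once per process rather than on every message, which was the main cost of the old layout.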
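One region the diff leaves collapsed is the body of the generation step (new lines 36-44, between `with torch.no_grad():` and the decode line in the second hunk). The actual arguments are not shown in this commit, so the following is only a plausible shape for that block inside generate_response, inferred from the function's max_tokens and temperature parameters and the standard transformers generate() API:

```python
# Hypothetical reconstruction of the elided generation block; the commit
# does not show these lines, and every argument choice here is an assumption.
with torch.no_grad():
    outputs = model.generate(
        **inputs,                             # tensors already moved to 'cuda' above
        max_new_tokens=max_tokens,            # slider value from the Gradio UI
        temperature=temperature,              # slider value from the Gradio UI
        do_sample=temperature > 0,            # assumed: greedy decoding at temperature 0
        pad_token_id=tokenizer.pad_token_id,  # pad_token was set to eos_token at load time
    )
```

The slice in the decode line, `outputs[0][inputs['input_ids'].shape[-1]:]`, then strips the prompt tokens so only the newly generated assistant text is returned.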