Update app.py
app.py CHANGED
@@ -10,27 +10,38 @@ def chat_with_model(message, history, max_tokens, temperature):
     try:
         print("Loading model...")
 
-        # Load
+        # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+
+        # Load model WITHOUT device_map to avoid CPU placement
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
             torch_dtype=torch.bfloat16,
             trust_remote_code=True,
-            device_map="auto"
+            # Remove device_map="auto" - it's causing CPU placement
         )
 
+        # AFTER loading, move to GPU explicitly
+        if torch.cuda.is_available():
+            model = model.to('cuda')
+            print(f"✅ Model moved to GPU: {next(model.parameters()).device}")
+        else:
+            print("❌ No GPU available")
+
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        print(f"Model loaded successfully on device: {next(model.parameters()).device}")
-
         # Simple prompt format
         prompt = f"User: {message}\nAssistant:"
 
-        # Tokenize
+        # Tokenize and move inputs to same device as model
        inputs = tokenizer(prompt, return_tensors="pt")
+        device = next(model.parameters()).device
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
-
+        print(f"✅ Inputs on device: {inputs['input_ids'].device}")
+
+        # Generate
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
@@ -44,6 +55,8 @@ def chat_with_model(message, history, max_tokens, temperature):
         # Decode response
         response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
 
+        print(f"✅ Generated: {response[:50]}...")
+
         # Update history
         history.append([message, response])
         return history, history, ""
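For context, the pattern this commit converges on can be shown as a minimal self-contained sketch. MODEL_ID, the prompt, and max_new_tokens below are placeholders for illustration, not the Space's actual values; the real app.py also passes trust_remote_code=True and returns chat history for what appears to be a Gradio handler.

# Minimal sketch of the device-placement fix, assuming a single GPU (or CPU fallback).
# MODEL_ID and the prompt are placeholders; any causal-LM checkpoint would do.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "gpt2"  # placeholder, not the Space's model

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load without device_map, then move the whole model to one device explicitly,
# instead of letting device_map="auto" decide (and possibly pick CPU).
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

prompt = "User: Hello\nAssistant:"
# Inputs must sit on the same device as the model's weights.
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64)

# Decode only the newly generated tokens, slicing off the prompt.
response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:],
                            skip_special_tokens=True)
print(response)

The trade-off behind the change: device_map="auto" (via accelerate) can shard or offload a model that does not fit on one GPU, but on a single-device Space an explicit .to(device) keeps every weight and every input together, which is exactly what the added ✅/❌ prints verify at runtime.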