FractalAIR committed on
Commit
18c4432
·
verified ·
1 Parent(s): 3b68d21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -6
app.py CHANGED
@@ -10,27 +10,38 @@ def chat_with_model(message, history, max_tokens, temperature):
10
  try:
11
  print("Loading model...")
12
 
13
- # Load model and tokenizer - let ZeroGPU handle device placement
14
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 
15
  model = AutoModelForCausalLM.from_pretrained(
16
  MODEL_ID,
17
  torch_dtype=torch.bfloat16,
18
  trust_remote_code=True,
19
- device_map="auto" # Let transformers handle GPU placement
20
  )
21
 
 
 
 
 
 
 
 
22
  if tokenizer.pad_token is None:
23
  tokenizer.pad_token = tokenizer.eos_token
24
 
25
- print(f"Model loaded successfully on device: {next(model.parameters()).device}")
26
-
27
  # Simple prompt format
28
  prompt = f"User: {message}\nAssistant:"
29
 
30
- # Tokenize - let the model handle device placement
31
  inputs = tokenizer(prompt, return_tensors="pt")
 
 
32
 
33
- # Generate - the model will automatically handle device placement
 
 
34
  with torch.no_grad():
35
  outputs = model.generate(
36
  **inputs,
@@ -44,6 +55,8 @@ def chat_with_model(message, history, max_tokens, temperature):
44
  # Decode response
45
  response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
46
 
 
 
47
  # Update history
48
  history.append([message, response])
49
  return history, history, ""
 
10
  try:
11
  print("Loading model...")
12
 
13
+ # Load tokenizer
14
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
15
+
16
+ # Load model WITHOUT device_map to avoid CPU placement
17
  model = AutoModelForCausalLM.from_pretrained(
18
  MODEL_ID,
19
  torch_dtype=torch.bfloat16,
20
  trust_remote_code=True,
21
+ # Remove device_map="auto" - it's causing CPU placement
22
  )
23
 
24
+ # AFTER loading, move to GPU explicitly
25
+ if torch.cuda.is_available():
26
+ model = model.to('cuda')
27
+ print(f"✅ Model moved to GPU: {next(model.parameters()).device}")
28
+ else:
29
+ print("❌ No GPU available")
30
+
31
  if tokenizer.pad_token is None:
32
  tokenizer.pad_token = tokenizer.eos_token
33
 
 
 
34
  # Simple prompt format
35
  prompt = f"User: {message}\nAssistant:"
36
 
37
+ # Tokenize and move inputs to same device as model
38
  inputs = tokenizer(prompt, return_tensors="pt")
39
+ device = next(model.parameters()).device
40
+ inputs = {k: v.to(device) for k, v in inputs.items()}
41
 
42
+ print(f"✅ Inputs on device: {inputs['input_ids'].device}")
43
+
44
+ # Generate
45
  with torch.no_grad():
46
  outputs = model.generate(
47
  **inputs,
 
55
  # Decode response
56
  response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
57
 
58
+ print(f"✅ Generated: {response[:50]}...")
59
+
60
  # Update history
61
  history.append([message, response])
62
  return history, history, ""