Update app.py
app.py CHANGED
@@ -1,6 +1,9 @@
 import torch
 from transformers import TextIteratorStreamer
 import threading
+import spaces
+
+print("Is CUDA available?", torch.cuda.is_available())
 
 class ModelWrapper:
     def __init__(self):
@@ -9,16 +12,18 @@ class ModelWrapper:
     @spaces.GPU
     def generate(self, prompt):
         if self.model is None:
-            #
+            # Explicitly set device_map to 'cuda'
             self.model = AutoGPTQForCausalLM.from_quantized(
                 model_id,
-                device_map='
+                device_map={'': 'cuda:0'},
                 trust_remote_code=True,
             )
-
+
+            print("Model is on device:", next(self.model.parameters()).device)
 
         # Tokenize the input prompt
         inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
+        print("Inputs are on device:", inputs['input_ids'].device)
 
         # Set up the streamer
         streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
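The diff cuts off right after the streamer is constructed. For context, a TextIteratorStreamer is normally consumed by running model.generate on a background thread (the reason for the threading import above) and iterating the streamer for partial text. A minimal sketch of that continuation, assuming the self.model, tokenizer, inputs, and streamer names from the file above; it is not part of this commit, and max_new_tokens=256 is an arbitrary placeholder:

        # Sketch only, not from this commit: run generation on a background
        # thread so the streamer can be drained concurrently.
        generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=256)
        thread = threading.Thread(target=self.model.generate, kwargs=generation_kwargs)
        thread.start()

        # TextIteratorStreamer yields decoded text chunks as tokens are produced.
        for new_text in streamer:
            yield new_text

        thread.join()

Written this way, generate becomes a generator, so a caller (for example a Gradio chat callback in a Space) can stream tokens to the UI as they are produced instead of waiting for the full completion.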