Spaces:
Sleeping
Sleeping
Force all CUDA operations to cuda:0 and use device_map to prevent multi-GPU distribution
Browse files- app.py +12 -8
- gradio_app.py +6 -6
app.py
CHANGED
|
@@ -59,8 +59,8 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
|
|
| 59 |
if "flan-t5" in model_name.lower() or "t5" in model_name.lower():
|
| 60 |
model = AutoModelForCausalLM.from_pretrained(
|
| 61 |
model_name,
|
| 62 |
-
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
| 63 |
-
device_map=
|
| 64 |
trust_remote_code=True,
|
| 65 |
low_cpu_mem_usage=True,
|
| 66 |
token=hf_token
|
|
@@ -68,8 +68,8 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
|
|
| 68 |
else:
|
| 69 |
model = AutoModelForCausalLM.from_pretrained(
|
| 70 |
model_name,
|
| 71 |
-
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
| 72 |
-
device_map=
|
| 73 |
trust_remote_code=True,
|
| 74 |
low_cpu_mem_usage=True,
|
| 75 |
use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
|
|
@@ -95,10 +95,14 @@ async def load_model():
|
|
| 95 |
logger.info("Starting model loading...")
|
| 96 |
|
| 97 |
# Check if CUDA is available
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
logger.info(f"Using device: {device}")
|
| 100 |
|
| 101 |
-
if device == "cuda":
|
| 102 |
logger.info(f"GPU: {torch.cuda.get_device_name()}")
|
| 103 |
logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
|
| 104 |
|
|
@@ -117,7 +121,7 @@ async def load_model():
|
|
| 117 |
|
| 118 |
tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
|
| 119 |
|
| 120 |
-
if device == "cuda":
|
| 121 |
model = model.to(device)
|
| 122 |
|
| 123 |
logger.info("Model loaded successfully with transformers!")
|
|
@@ -274,7 +278,7 @@ async def generate_questions(request: QuestionGenerationRequest):
|
|
| 274 |
|
| 275 |
# Generate response using transformers
|
| 276 |
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
| 277 |
-
if device == "cuda":
|
| 278 |
inputs = inputs.to(device)
|
| 279 |
# Ensure all model parameters are on the same device
|
| 280 |
if model is not None:
|
|
|
|
| 59 |
if "flan-t5" in model_name.lower() or "t5" in model_name.lower():
|
| 60 |
model = AutoModelForCausalLM.from_pretrained(
|
| 61 |
model_name,
|
| 62 |
+
torch_dtype=torch.float16 if device == "cuda:0" else torch.float32,
|
| 63 |
+
device_map={"": 0},  # Force all parameters to GPU 0
|
| 64 |
trust_remote_code=True,
|
| 65 |
low_cpu_mem_usage=True,
|
| 66 |
token=hf_token
|
|
|
|
| 68 |
else:
|
| 69 |
model = AutoModelForCausalLM.from_pretrained(
|
| 70 |
model_name,
|
| 71 |
+
torch_dtype=torch.float16 if device == "cuda:0" else torch.float32,
|
| 72 |
+
device_map={"": 0},  # Force all parameters to GPU 0
|
| 73 |
trust_remote_code=True,
|
| 74 |
low_cpu_mem_usage=True,
|
| 75 |
use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
|
|
|
|
| 95 |
logger.info("Starting model loading...")
|
| 96 |
|
| 97 |
# Check if CUDA is available
|
| 98 |
+
if torch.cuda.is_available():
|
| 99 |
+
torch.cuda.set_device(0)
|
| 100 |
+
device = "cuda:0"
|
| 101 |
+
else:
|
| 102 |
+
device = "cpu"
|
| 103 |
logger.info(f"Using device: {device}")
|
| 104 |
|
| 105 |
+
if device == "cuda:0":
|
| 106 |
logger.info(f"GPU: {torch.cuda.get_device_name()}")
|
| 107 |
logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
|
| 108 |
|
|
|
|
| 121 |
|
| 122 |
tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
|
| 123 |
|
| 124 |
+
if device == "cuda:0":
|
| 125 |
model = model.to(device)
|
| 126 |
|
| 127 |
logger.info("Model loaded successfully with transformers!")
|
|
|
|
| 278 |
|
| 279 |
# Generate response using transformers
|
| 280 |
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
| 281 |
+
if device == "cuda:0":
|
| 282 |
inputs = inputs.to(device)
|
| 283 |
# Ensure all model parameters are on the same device
|
| 284 |
if model is not None:
|
gradio_app.py
CHANGED
|
@@ -33,10 +33,10 @@ class ModelManager:
|
|
| 33 |
logger.info("Starting model loading...")
|
| 34 |
|
| 35 |
# Check if CUDA is available
|
| 36 |
-
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 37 |
logger.info(f"Using device: {self.device}")
|
| 38 |
|
| 39 |
-
if self.device == "cuda":
|
| 40 |
logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 41 |
logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
|
| 42 |
|
|
@@ -55,14 +55,14 @@ class ModelManager:
|
|
| 55 |
|
| 56 |
self.model = AutoModelForCausalLM.from_pretrained(
|
| 57 |
base_model_name,
|
| 58 |
-
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
|
| 59 |
-
device_map="auto" if self.device == "cuda" else None,
|
| 60 |
trust_remote_code=True,
|
| 61 |
low_cpu_mem_usage=True,
|
| 62 |
token=hf_token
|
| 63 |
)
|
| 64 |
|
| 65 |
-
if self.device == "cuda":
|
| 66 |
self.model = self.model.to(self.device)
|
| 67 |
|
| 68 |
self.model_loaded = True
|
|
@@ -100,7 +100,7 @@ Questions:"""
|
|
| 100 |
|
| 101 |
# Generate questions
|
| 102 |
inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
| 103 |
-
if self.device == "cuda":
|
| 104 |
inputs = inputs.to(self.device)
|
| 105 |
# Ensure all model parameters are on the same device
|
| 106 |
if hasattr(self.model, 'device'):
|
|
|
|
| 33 |
logger.info("Starting model loading...")
|
| 34 |
|
| 35 |
# Check if CUDA is available
|
| 36 |
+
self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 37 |
logger.info(f"Using device: {self.device}")
|
| 38 |
|
| 39 |
+
if self.device == "cuda:0":
|
| 40 |
logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 41 |
logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
|
| 42 |
|
|
|
|
| 55 |
|
| 56 |
self.model = AutoModelForCausalLM.from_pretrained(
|
| 57 |
base_model_name,
|
| 58 |
+
torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
|
| 59 |
+
device_map={"": 0} if self.device == "cuda:0" else None,  # force all parameters to GPU 0 (prevents multi-GPU distribution)
|
| 60 |
trust_remote_code=True,
|
| 61 |
low_cpu_mem_usage=True,
|
| 62 |
token=hf_token
|
| 63 |
)
|
| 64 |
|
| 65 |
+
if self.device == "cuda:0":
|
| 66 |
self.model = self.model.to(self.device)
|
| 67 |
|
| 68 |
self.model_loaded = True
|
|
|
|
| 100 |
|
| 101 |
# Generate questions
|
| 102 |
inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
| 103 |
+
if self.device == "cuda:0":
|
| 104 |
inputs = inputs.to(self.device)
|
| 105 |
# Ensure all model parameters are on the same device
|
| 106 |
if hasattr(self.model, 'device'):
|