Spaces:
Sleeping
Sleeping
Force all CUDA operations to cuda:0 and use device_map to prevent multi-GPU distribution
Browse files- app.py +12 -8
- gradio_app.py +6 -6
app.py
CHANGED
|
@@ -59,8 +59,8 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
|
|
| 59 |
if "flan-t5" in model_name.lower() or "t5" in model_name.lower():
|
| 60 |
model = AutoModelForCausalLM.from_pretrained(
|
| 61 |
model_name,
|
| 62 |
-
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
| 63 |
-
device_map=
|
| 64 |
trust_remote_code=True,
|
| 65 |
low_cpu_mem_usage=True,
|
| 66 |
token=hf_token
|
|
@@ -68,8 +68,8 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
|
|
| 68 |
else:
|
| 69 |
model = AutoModelForCausalLM.from_pretrained(
|
| 70 |
model_name,
|
| 71 |
-
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
| 72 |
-
device_map=
|
| 73 |
trust_remote_code=True,
|
| 74 |
low_cpu_mem_usage=True,
|
| 75 |
use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
|
|
@@ -95,10 +95,14 @@ async def load_model():
|
|
| 95 |
logger.info("Starting model loading...")
|
| 96 |
|
| 97 |
# Check if CUDA is available
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
logger.info(f"Using device: {device}")
|
| 100 |
|
| 101 |
-
if device == "cuda":
|
| 102 |
logger.info(f"GPU: {torch.cuda.get_device_name()}")
|
| 103 |
logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
|
| 104 |
|
|
@@ -117,7 +121,7 @@ async def load_model():
|
|
| 117 |
|
| 118 |
tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
|
| 119 |
|
| 120 |
-
if device == "cuda":
|
| 121 |
model = model.to(device)
|
| 122 |
|
| 123 |
logger.info("Model loaded successfully with transformers!")
|
|
@@ -274,7 +278,7 @@ async def generate_questions(request: QuestionGenerationRequest):
|
|
| 274 |
|
| 275 |
# Generate response using transformers
|
| 276 |
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
| 277 |
-
if device == "cuda":
|
| 278 |
inputs = inputs.to(device)
|
| 279 |
# Ensure all model parameters are on the same device
|
| 280 |
if model is not None:
|
|
|
|
| 59 |
if "flan-t5" in model_name.lower() or "t5" in model_name.lower():
|
| 60 |
model = AutoModelForCausalLM.from_pretrained(
|
| 61 |
model_name,
|
| 62 |
+
torch_dtype=torch.float16 if device == "cuda:0" else torch.float32,
|
| 63 |
+
device_map={"": 0},  # Force all parameters to GPU 0
|
| 64 |
trust_remote_code=True,
|
| 65 |
low_cpu_mem_usage=True,
|
| 66 |
token=hf_token
|
|
|
|
| 68 |
else:
|
| 69 |
model = AutoModelForCausalLM.from_pretrained(
|
| 70 |
model_name,
|
| 71 |
+
torch_dtype=torch.float16 if device == "cuda:0" else torch.float32,
|
| 72 |
+
device_map={"": 0},  # Force all parameters to GPU 0
|
| 73 |
trust_remote_code=True,
|
| 74 |
low_cpu_mem_usage=True,
|
| 75 |
use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
|
|
|
|
| 95 |
logger.info("Starting model loading...")
|
| 96 |
|
| 97 |
# Check if CUDA is available
|
| 98 |
+
if torch.cuda.is_available():
|
| 99 |
+
torch.cuda.set_device(0)
|
| 100 |
+
device = "cuda:0"
|
| 101 |
+
else:
|
| 102 |
+
device = "cpu"
|
| 103 |
logger.info(f"Using device: {device}")
|
| 104 |
|
| 105 |
+
if device == "cuda:0":
|
| 106 |
logger.info(f"GPU: {torch.cuda.get_device_name()}")
|
| 107 |
logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
|
| 108 |
|
|
|
|
| 121 |
|
| 122 |
tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
|
| 123 |
|
| 124 |
+
if device == "cuda:0":
|
| 125 |
model = model.to(device)
|
| 126 |
|
| 127 |
logger.info("Model loaded successfully with transformers!")
|
|
|
|
| 278 |
|
| 279 |
# Generate response using transformers
|
| 280 |
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
| 281 |
+
if device == "cuda:0":
|
| 282 |
inputs = inputs.to(device)
|
| 283 |
# Ensure all model parameters are on the same device
|
| 284 |
if model is not None:
|
gradio_app.py
CHANGED
|
@@ -33,10 +33,10 @@ class ModelManager:
|
|
| 33 |
logger.info("Starting model loading...")
|
| 34 |
|
| 35 |
# Check if CUDA is available
|
| 36 |
-
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 37 |
logger.info(f"Using device: {self.device}")
|
| 38 |
|
| 39 |
-
if self.device == "cuda":
|
| 40 |
logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 41 |
logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
|
| 42 |
|
|
@@ -55,14 +55,14 @@ class ModelManager:
|
|
| 55 |
|
| 56 |
self.model = AutoModelForCausalLM.from_pretrained(
|
| 57 |
base_model_name,
|
| 58 |
-
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
|
| 59 |
-
device_map="auto" if self.device == "cuda" else None,
|
| 60 |
trust_remote_code=True,
|
| 61 |
low_cpu_mem_usage=True,
|
| 62 |
token=hf_token
|
| 63 |
)
|
| 64 |
|
| 65 |
-
if self.device == "cuda":
|
| 66 |
self.model = self.model.to(self.device)
|
| 67 |
|
| 68 |
self.model_loaded = True
|
|
@@ -100,7 +100,7 @@ Questions:"""
|
|
| 100 |
|
| 101 |
# Generate questions
|
| 102 |
inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
| 103 |
-
if self.device == "cuda":
|
| 104 |
inputs = inputs.to(self.device)
|
| 105 |
# Ensure all model parameters are on the same device
|
| 106 |
if hasattr(self.model, 'device'):
|
|
|
|
| 33 |
logger.info("Starting model loading...")
|
| 34 |
|
| 35 |
# Check if CUDA is available
|
| 36 |
+
self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 37 |
logger.info(f"Using device: {self.device}")
|
| 38 |
|
| 39 |
+
if self.device == "cuda:0":
|
| 40 |
logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 41 |
logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
|
| 42 |
|
|
|
|
| 55 |
|
| 56 |
self.model = AutoModelForCausalLM.from_pretrained(
|
| 57 |
base_model_name,
|
| 58 |
+
torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
|
| 59 |
+
device_map={"": 0} if self.device == "cuda:0" else None,  # force all parameters to GPU 0 (prevents multi-GPU distribution)
|
| 60 |
trust_remote_code=True,
|
| 61 |
low_cpu_mem_usage=True,
|
| 62 |
token=hf_token
|
| 63 |
)
|
| 64 |
|
| 65 |
+
if self.device == "cuda:0":
|
| 66 |
self.model = self.model.to(self.device)
|
| 67 |
|
| 68 |
self.model_loaded = True
|
|
|
|
| 100 |
|
| 101 |
# Generate questions
|
| 102 |
inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
| 103 |
+
if self.device == "cuda:0":
|
| 104 |
inputs = inputs.to(self.device)
|
| 105 |
# Ensure all model parameters are on the same device
|
| 106 |
if hasattr(self.model, 'device'):
|