david167 committed on
Commit
01a04bc
·
1 Parent(s): 0331461

Force all CUDA operations to cuda:0 and use device_map to prevent multi-GPU distribution

Browse files
Files changed (2) hide show
  1. app.py +12 -8
  2. gradio_app.py +6 -6
app.py CHANGED
@@ -59,8 +59,8 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
59
  if "flan-t5" in model_name.lower() or "t5" in model_name.lower():
60
  model = AutoModelForCausalLM.from_pretrained(
61
  model_name,
62
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
63
- device_map=None # Don't use auto device mapping to avoid multi-GPU issues,
64
  trust_remote_code=True,
65
  low_cpu_mem_usage=True,
66
  token=hf_token
@@ -68,8 +68,8 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
68
  else:
69
  model = AutoModelForCausalLM.from_pretrained(
70
  model_name,
71
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
72
- device_map=None # Don't use auto device mapping to avoid multi-GPU issues,
73
  trust_remote_code=True,
74
  low_cpu_mem_usage=True,
75
  use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
@@ -95,10 +95,14 @@ async def load_model():
95
  logger.info("Starting model loading...")
96
 
97
  # Check if CUDA is available
98
- device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
99
  logger.info(f"Using device: {device}")
100
 
101
- if device == "cuda":
102
  logger.info(f"GPU: {torch.cuda.get_device_name()}")
103
  logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
104
 
@@ -117,7 +121,7 @@ async def load_model():
117
 
118
  tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
119
 
120
- if device == "cuda":
121
  model = model.to(device)
122
 
123
  logger.info("Model loaded successfully with transformers!")
@@ -274,7 +278,7 @@ async def generate_questions(request: QuestionGenerationRequest):
274
 
275
  # Generate response using transformers
276
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
277
- if device == "cuda":
278
  inputs = inputs.to(device)
279
  # Ensure all model parameters are on the same device
280
  if model is not None:
 
59
  if "flan-t5" in model_name.lower() or "t5" in model_name.lower():
60
  model = AutoModelForCausalLM.from_pretrained(
61
  model_name,
62
+ torch_dtype=torch.float16 if device == "cuda:0" else torch.float32,
63
+ device_map={"": 0},  # Force all parameters to GPU 0
64
  trust_remote_code=True,
65
  low_cpu_mem_usage=True,
66
  token=hf_token
 
68
  else:
69
  model = AutoModelForCausalLM.from_pretrained(
70
  model_name,
71
+ torch_dtype=torch.float16 if device == "cuda:0" else torch.float32,
72
+ device_map={"": 0},  # Force all parameters to GPU 0
73
  trust_remote_code=True,
74
  low_cpu_mem_usage=True,
75
  use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
 
95
  logger.info("Starting model loading...")
96
 
97
  # Check if CUDA is available
98
+ if torch.cuda.is_available():
99
+ torch.cuda.set_device(0)
100
+ device = "cuda:0"
101
+ else:
102
+ device = "cpu"
103
  logger.info(f"Using device: {device}")
104
 
105
+ if device == "cuda:0":
106
  logger.info(f"GPU: {torch.cuda.get_device_name()}")
107
  logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
108
 
 
121
 
122
  tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
123
 
124
+ if device == "cuda:0":
125
  model = model.to(device)
126
 
127
  logger.info("Model loaded successfully with transformers!")
 
278
 
279
  # Generate response using transformers
280
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
281
+ if device == "cuda:0":
282
  inputs = inputs.to(device)
283
  # Ensure all model parameters are on the same device
284
  if model is not None:
gradio_app.py CHANGED
@@ -33,10 +33,10 @@ class ModelManager:
33
  logger.info("Starting model loading...")
34
 
35
  # Check if CUDA is available
36
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
37
  logger.info(f"Using device: {self.device}")
38
 
39
- if self.device == "cuda":
40
  logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
41
  logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
42
 
@@ -55,14 +55,14 @@ class ModelManager:
55
 
56
  self.model = AutoModelForCausalLM.from_pretrained(
57
  base_model_name,
58
- torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
59
- device_map="auto" if self.device == "cuda" else None,
60
  trust_remote_code=True,
61
  low_cpu_mem_usage=True,
62
  token=hf_token
63
  )
64
 
65
- if self.device == "cuda":
66
  self.model = self.model.to(self.device)
67
 
68
  self.model_loaded = True
@@ -100,7 +100,7 @@ Questions:"""
100
 
101
  # Generate questions
102
  inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
103
- if self.device == "cuda":
104
  inputs = inputs.to(self.device)
105
  # Ensure all model parameters are on the same device
106
  if hasattr(self.model, 'device'):
 
33
  logger.info("Starting model loading...")
34
 
35
  # Check if CUDA is available
36
+ self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
37
  logger.info(f"Using device: {self.device}")
38
 
39
+ if self.device == "cuda:0":
40
  logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
41
  logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
42
 
 
55
 
56
  self.model = AutoModelForCausalLM.from_pretrained(
57
  base_model_name,
58
+ torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
59
+ device_map="auto" if self.device == "cuda:0" else None,
60
  trust_remote_code=True,
61
  low_cpu_mem_usage=True,
62
  token=hf_token
63
  )
64
 
65
+ if self.device == "cuda:0":
66
  self.model = self.model.to(self.device)
67
 
68
  self.model_loaded = True
 
100
 
101
  # Generate questions
102
  inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
103
+ if self.device == "cuda:0":
104
  inputs = inputs.to(self.device)
105
  # Ensure all model parameters are on the same device
106
  if hasattr(self.model, 'device'):