Kaushik Rajan committed on
Commit
6be63cd
·
1 Parent(s): 78282ae

Fix(app): resolve Hugging Face Spaces GPU startup error

Files changed (1)
  1. app.py +67 -36
app.py CHANGED
@@ -13,6 +13,7 @@ import traceback
 import yaml
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
+import spaces
 
 # Add src to path for imports
 current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -84,44 +85,17 @@ if GAMES_AVAILABLE:
 else:
     print("❌ All import methods failed - using fallback interface")
 
-
-with open('config.yaml', 'r') as f:
-    config = yaml.safe_load(f)
-
-model_name = config['model']['name']
-quantization_params = config['model'].get('quantization', {})
-
-# Create BitsAndBytesConfig if quantization is enabled
-if quantization_params and quantization_params.get('load_in_4bit'):
-    compute_dtype_str = quantization_params.get("bnb_4bit_compute_dtype", "float16")
-
-    if compute_dtype_str == "bfloat16":
-        compute_dtype = torch.bfloat16
-    else:
-        compute_dtype = torch.float16  # Default to float16
-
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type=quantization_params.get("bnb_4bit_quant_type", "nf4"),
-        bnb_4bit_compute_dtype=compute_dtype,
-        bnb_4bit_use_double_quant=quantization_params.get("bnb_4bit_use_double_quant", True),
-    )
-    # Using device_map="auto" is recommended for multi-GPU setups and large models
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        device_map="auto"
-    )
-else:
-    # Fallback for no quantization
-    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
-
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-
+# Initialize model and tokenizer as global variables
+model = None
+tokenizer = None
 
 def generate_reasoning(prompt):
     """Generate reasoning trace using Qwen model."""
-    inputs = tokenizer(prompt, return_tensors="pt")
+    global model, tokenizer
+    if model is None or tokenizer is None:
+        return "Error: Model not loaded. Please wait for the GPU to be ready."
+
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     outputs = model.generate(**inputs, max_length=150, do_sample=True, temperature=0.7)
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -465,7 +439,64 @@ def create_interface():
 
     return demo
 
+@spaces.GPU(duration=300)
+def main():
+    """
+    Main function to load model, create interface, and launch the Gradio app.
+    Wrapped with @spaces.GPU to allocate a GPU for this Space.
+    """
+    global model, tokenizer
 
-if __name__ == "__main__":
+    print("🚀 Starting main application...")
+    print("Loading configuration...")
+    with open('config.yaml', 'r') as f:
+        config = yaml.safe_load(f)
+
+    model_name = config['model']['name']
+    quantization_params = config['model'].get('quantization', {})
+
+    print(f"📦 Model Name: {model_name}")
+    print(f"⚙️ Quantization Params: {quantization_params}")
+
+
+    # Create BitsAndBytesConfig if quantization is enabled
+    if quantization_params and quantization_params.get('load_in_4bit'):
+        print("💡 4-bit quantization enabled. Creating BitsAndBytesConfig...")
+        compute_dtype_str = quantization_params.get("bnb_4bit_compute_dtype", "float16")
+
+        if compute_dtype_str == "bfloat16":
+            compute_dtype = torch.bfloat16
+        else:
+            compute_dtype = torch.float16  # Default to float16
+
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type=quantization_params.get("bnb_4bit_quant_type", "nf4"),
+            bnb_4bit_compute_dtype=compute_dtype,
+            bnb_4bit_use_double_quant=quantization_params.get("bnb_4bit_use_double_quant", True),
+        )
+        # Using device_map="auto" is recommended for multi-GPU setups and large models
+        print("🧠 Loading 4-bit quantized model...")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=bnb_config,
+            device_map="auto"
+        )
+    else:
+        print("🧠 Loading model without quantization...")
+        # Fallback for no quantization
+        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
+
+    print("✒️ Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    print("✅ Model and tokenizer loaded successfully.")
+
+    print("🎨 Creating Gradio interface...")
     demo = create_interface()
+
+    print("🚀 Launching Gradio app...")
    demo.launch()
+
+if __name__ == "__main__":
+    main()
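
For context on the fix itself: the failing version loaded the model at import time, before the Space had a GPU attached. The commit moves all CUDA-touching work behind a @spaces.GPU-decorated entry point and sends tokenized inputs to model.device so they land on the same device as the model weights. Below is a minimal sketch of that pattern, assuming the spaces package that Hugging Face provides on ZeroGPU hardware; the model id and generation settings are illustrative placeholders, not taken from this repo:

import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder id; the real name comes from config.yaml

model = None
tokenizer = None

@spaces.GPU(duration=120)  # a GPU is attached only while this function runs
def generate(prompt: str) -> str:
    """Lazy-load the model on the first GPU-backed call, then generate."""
    global model, tokenizer
    if model is None or tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        # device_map="auto" places the weights on the GPU that spaces just attached
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
    # Inputs must live on the same device as the model, hence .to(model.device)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)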
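
The loader in main() reads only a few keys from config.yaml. For reference, yaml.safe_load(f) is expected to produce a dict shaped like the sketch below; every value shown is an illustrative assumption inferred from the .get(...) defaults in the diff, not copied from the repository's actual config:

# Hypothetical shape of the parsed config; only these keys are consumed above.
config = {
    "model": {
        "name": "Qwen/Qwen2.5-0.5B-Instruct",  # required: config['model']['name']
        "quantization": {                       # optional: absent or falsy skips 4-bit loading
            "load_in_4bit": True,
            "bnb_4bit_quant_type": "nf4",           # default if omitted
            "bnb_4bit_compute_dtype": "bfloat16",   # "float16" is the fallback
            "bnb_4bit_use_double_quant": True,      # default if omitted
        },
    },
}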