Spaces: Build error

Kaushik Rajan committed · 6be63cd
Parent(s): 78282ae

Fix(app): resolve Hugging Face Spaces GPU startup error

app.py CHANGED
@@ -13,6 +13,7 @@ import traceback
 import yaml
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
+import spaces
 
 # Add src to path for imports
 current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -84,44 +85,17 @@ if GAMES_AVAILABLE:
 else:
     print("❌ All import methods failed - using fallback interface")
 
-
-
-
-
-model_name = config['model']['name']
-quantization_params = config['model'].get('quantization', {})
-
-# Create BitsAndBytesConfig if quantization is enabled
-if quantization_params and quantization_params.get('load_in_4bit'):
-    compute_dtype_str = quantization_params.get("bnb_4bit_compute_dtype", "float16")
-
-    if compute_dtype_str == "bfloat16":
-        compute_dtype = torch.bfloat16
-    else:
-        compute_dtype = torch.float16  # Default to float16
-
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type=quantization_params.get("bnb_4bit_quant_type", "nf4"),
-        bnb_4bit_compute_dtype=compute_dtype,
-        bnb_4bit_use_double_quant=quantization_params.get("bnb_4bit_use_double_quant", True),
-    )
-    # Using device_map="auto" is recommended for multi-GPU setups and large models
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        device_map="auto"
-    )
-else:
-    # Fallback for no quantization
-    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
-
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-
+# Initialize model and tokenizer as global variables
+model = None
+tokenizer = None
 
 def generate_reasoning(prompt):
     """Generate reasoning trace using Qwen model."""
-
+    global model, tokenizer
+    if model is None or tokenizer is None:
+        return "Error: Model not loaded. Please wait for the GPU to be ready."
+
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     outputs = model.generate(**inputs, max_length=150, do_sample=True, temperature=0.7)
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
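Note: both the old module-level code and the new main() read model settings from config.yaml via yaml.safe_load. The config file itself is not part of this commit; a minimal sketch of the dict shape the code expects, with hypothetical values since the real ones are not shown in this diff:

# Hypothetical result of yaml.safe_load on config.yaml; only the keys
# app.py actually reads are shown, and the values are placeholders.
config = {
    "model": {
        "name": "Qwen/Qwen2.5-7B-Instruct",  # placeholder model id
        "quantization": {
            "load_in_4bit": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_compute_dtype": "bfloat16",
            "bnb_4bit_use_double_quant": True,
        },
    },
}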
@@ -465,7 +439,64 @@ def create_interface():
 
     return demo
 
+@spaces.GPU(duration=300)
+def main():
+    """
+    Main function to load model, create interface, and launch the Gradio app.
+    Wrapped with @spaces.GPU to allocate a GPU for this Space.
+    """
+    global model, tokenizer
 
-
+    print("🚀 Starting main application...")
+    print("Loading configuration...")
+    with open('config.yaml', 'r') as f:
+        config = yaml.safe_load(f)
+
+    model_name = config['model']['name']
+    quantization_params = config['model'].get('quantization', {})
+
+    print(f"📦 Model Name: {model_name}")
+    print(f"⚙️ Quantization Params: {quantization_params}")
+
+
+    # Create BitsAndBytesConfig if quantization is enabled
+    if quantization_params and quantization_params.get('load_in_4bit'):
+        print("💡 4-bit quantization enabled. Creating BitsAndBytesConfig...")
+        compute_dtype_str = quantization_params.get("bnb_4bit_compute_dtype", "float16")
+
+        if compute_dtype_str == "bfloat16":
+            compute_dtype = torch.bfloat16
+        else:
+            compute_dtype = torch.float16  # Default to float16
+
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type=quantization_params.get("bnb_4bit_quant_type", "nf4"),
+            bnb_4bit_compute_dtype=compute_dtype,
+            bnb_4bit_use_double_quant=quantization_params.get("bnb_4bit_use_double_quant", True),
+        )
+        # Using device_map="auto" is recommended for multi-GPU setups and large models
+        print("🔧 Loading 4-bit quantized model...")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=bnb_config,
+            device_map="auto"
+        )
+    else:
+        print("🔧 Loading model without quantization...")
+        # Fallback for no quantization
+        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
+
+    print("✍️ Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    print("✅ Model and tokenizer loaded successfully.")
+
+    print("🎨 Creating Gradio interface...")
     demo = create_interface()
+
+    print("🚀 Launching Gradio app...")
     demo.launch()
+
+if __name__ == "__main__":
+    main()
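For context on the fix: on ZeroGPU Spaces, the `spaces` package attaches a GPU only while a function decorated with `@spaces.GPU` is executing, and `duration` requests the allocation window in seconds. A minimal sketch of the more common pattern, assuming a hypothetical model id and decorating the inference function itself rather than a main() wrapper:

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"  # hypothetical; any causal LM works

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16)
model.to("cuda")  # under ZeroGPU, actual GPU placement is deferred to decorated calls

@spaces.GPU(duration=120)  # GPU is held only while this function runs
def generate(prompt: str) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)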
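With this change the Space can build without touching a GPU: importing app.py no longer loads the model, and the weights are only fetched once main() runs under the @spaces.GPU allocation. Until then, the guard in generate_reasoning degrades gracefully instead of crashing at import time:

# Called before main() has populated the globals (hypothetical prompt):
print(generate_reasoning("What is 2 + 2?"))
# -> "Error: Model not loaded. Please wait for the GPU to be ready."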