AI-Talent-Force Claude Sonnet 4.5 committed on
Commit
6fdb30f
·
1 Parent(s): 77419e1

Load model once at startup instead of per query

Browse files

- Removed @spaces.GPU decorator from load_model function
- Model now loads at module level (startup) instead of per request
- This should drastically reduce response time after initial load
- Queries should be instant instead of taking 2+ minutes each

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +25 -32
app.py CHANGED
@@ -8,39 +8,32 @@ import spaces
8
  BASE_MODEL = "unsloth/qwen3-30b-a3b"
9
  LORA_ADAPTER_PATH = "AI-Talent-Force/ceo-voice-lora-qwen3-30b"
10
 
11
- # Load model and tokenizer
12
- @spaces.GPU
13
- def load_model():
14
- """Load the base model and apply LoRA adapter"""
15
- print("Loading tokenizer...")
16
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
17
-
18
- print("Loading base model...")
19
- # Use 4-bit quantization to fit in GPU memory
20
- quantization_config = BitsAndBytesConfig(
21
- load_in_4bit=True,
22
- bnb_4bit_compute_dtype=torch.bfloat16,
23
- bnb_4bit_use_double_quant=True,
24
- bnb_4bit_quant_type="nf4"
25
- )
26
-
27
- model = AutoModelForCausalLM.from_pretrained(
28
- BASE_MODEL,
29
- quantization_config=quantization_config,
30
- device_map="auto",
31
- trust_remote_code=True
32
- )
33
-
34
- print("Loading LoRA adapter...")
35
- model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
36
- model.eval()
37
-
38
- print("Model loaded successfully!")
39
- return model, tokenizer
40
-
41
- # Initialize model and tokenizer
42
  print("Initializing CEO AI Executive...")
43
- model, tokenizer = load_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  @spaces.GPU
46
  def chat_with_ceo(message, history):
 
8
  BASE_MODEL = "unsloth/qwen3-30b-a3b"
9
  LORA_ADAPTER_PATH = "AI-Talent-Force/ceo-voice-lora-qwen3-30b"
10
 
11
+ # Load model and tokenizer at startup (once)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  print("Initializing CEO AI Executive...")
13
+ print("Loading tokenizer...")
14
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
15
+
16
+ print("Loading base model...")
17
+ # Use 4-bit quantization to fit in GPU memory
18
+ quantization_config = BitsAndBytesConfig(
19
+ load_in_4bit=True,
20
+ bnb_4bit_compute_dtype=torch.bfloat16,
21
+ bnb_4bit_use_double_quant=True,
22
+ bnb_4bit_quant_type="nf4"
23
+ )
24
+
25
+ model = AutoModelForCausalLM.from_pretrained(
26
+ BASE_MODEL,
27
+ quantization_config=quantization_config,
28
+ device_map="auto",
29
+ trust_remote_code=True
30
+ )
31
+
32
+ print("Loading LoRA adapter...")
33
+ model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
34
+ model.eval()
35
+
36
+ print("Model loaded successfully!")
37
 
38
  @spaces.GPU
39
  def chat_with_ceo(message, history):