AI-Talent-Force Claude Sonnet 4.5 committed on
Commit
6fdb30f
·
1 Parent(s): 77419e1

Load model once at startup instead of per query

Browse files

- Removed @spaces.GPU decorator from load_model function
- Model now loads at module level (startup) instead of per request
- This should drastically reduce response time after initial load
- Queries should be instant instead of taking 2+ minutes each

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +25 -32
app.py CHANGED
@@ -8,39 +8,32 @@ import spaces
8
  BASE_MODEL = "unsloth/qwen3-30b-a3b"
9
  LORA_ADAPTER_PATH = "AI-Talent-Force/ceo-voice-lora-qwen3-30b"
10
 
11
- # Load model and tokenizer
12
- @spaces.GPU
13
- def load_model():
14
- """Load the base model and apply LoRA adapter"""
15
- print("Loading tokenizer...")
16
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
17
-
18
- print("Loading base model...")
19
- # Use 4-bit quantization to fit in GPU memory
20
- quantization_config = BitsAndBytesConfig(
21
- load_in_4bit=True,
22
- bnb_4bit_compute_dtype=torch.bfloat16,
23
- bnb_4bit_use_double_quant=True,
24
- bnb_4bit_quant_type="nf4"
25
- )
26
-
27
- model = AutoModelForCausalLM.from_pretrained(
28
- BASE_MODEL,
29
- quantization_config=quantization_config,
30
- device_map="auto",
31
- trust_remote_code=True
32
- )
33
-
34
- print("Loading LoRA adapter...")
35
- model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
36
- model.eval()
37
-
38
- print("Model loaded successfully!")
39
- return model, tokenizer
40
-
41
- # Initialize model and tokenizer
42
  print("Initializing CEO AI Executive...")
43
- model, tokenizer = load_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  @spaces.GPU
46
  def chat_with_ceo(message, history):
 
8
  BASE_MODEL = "unsloth/qwen3-30b-a3b"
9
  LORA_ADAPTER_PATH = "AI-Talent-Force/ceo-voice-lora-qwen3-30b"
10
 
11
+ # Load model and tokenizer at startup (once)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  print("Initializing CEO AI Executive...")
13
+ print("Loading tokenizer...")
14
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
15
+
16
+ print("Loading base model...")
17
+ # Use 4-bit quantization to fit in GPU memory
18
+ quantization_config = BitsAndBytesConfig(
19
+ load_in_4bit=True,
20
+ bnb_4bit_compute_dtype=torch.bfloat16,
21
+ bnb_4bit_use_double_quant=True,
22
+ bnb_4bit_quant_type="nf4"
23
+ )
24
+
25
+ model = AutoModelForCausalLM.from_pretrained(
26
+ BASE_MODEL,
27
+ quantization_config=quantization_config,
28
+ device_map="auto",
29
+ trust_remote_code=True
30
+ )
31
+
32
+ print("Loading LoRA adapter...")
33
+ model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
34
+ model.eval()
35
+
36
+ print("Model loaded successfully!")
37
 
38
  @spaces.GPU
39
  def chat_with_ceo(message, history):