anaspro committed
Commit · 3e07df2 · 1 Parent(s): 177c43d
Fix ZeroGPU compatibility - load model inside GPU context
- Move pipeline creation inside @spaces.GPU decorator (see the sketch after this list)
- Model loads on first request (lazy loading)
- Prevents loading on CPU before GPU is available
- Compatible with ZeroGPU free tier
- Model persists across requests within GPU duration
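For context, a minimal sketch of the lazy-loading pattern this commit adopts. The @spaces.GPU decorator itself sits outside the hunks shown below, and the function name and signature here are illustrative, not the app's actual code:

import spaces
from transformers import pipeline

model_id = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
pipe = None  # created lazily, inside the GPU context

@spaces.GPU  # ZeroGPU attaches a GPU only while this function runs
def generate(prompt: str) -> str:
    global pipe
    if pipe is None:
        # First request pays the load cost; later requests reuse the
        # pipeline because the warm process keeps the global alive.
        pipe = pipeline(
            "text-generation",
            model=model_id,
            torch_dtype="auto",
            device_map="auto",
        )
    return pipe(prompt, max_new_tokens=64)[0]["generated_text"]

Because ZeroGPU keeps the process warm between requests, the module-level pipe survives across calls, so the None check loads the 20B model at most once per process rather than once per request.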
app.py
CHANGED
@@ -61,23 +61,15 @@ def parse_reasoning_and_instructions(system_prompt: str):
     return effort, cleaned_instructions
 
 # ======================================================
-#
+# Model Configuration
 # ======================================================
-logger.info("🚀 Loading GPT-OSS-20B model...")
-
 model_id = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
 
-pipe = pipeline(
-    "text-generation",
-    model=model_id,
-    torch_dtype="auto",
-    device_map="auto",
-    trust_remote_code=True,
-)
-
+# Load harmony encoding (lightweight, can load outside GPU)
 enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
 
-
+# Pipeline will be created inside @spaces.GPU function
+pipe = None
 
 # ======================================================
 # Format Conversation History
@@ -100,6 +92,20 @@ def format_conversation_history(chat_history):
 def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
     """Generate response using GPT-OSS with Harmony format"""
 
+    global pipe
+
+    # Load pipeline inside GPU context (for ZeroGPU)
+    if pipe is None:
+        logger.info("🚀 Loading GPT-OSS-20B model on GPU...")
+        pipe = pipeline(
+            "text-generation",
+            model=model_id,
+            torch_dtype="auto",
+            device_map="auto",
+            trust_remote_code=True,
+        )
+        logger.info("✅ Model loaded successfully!")
+
     # Create new user message
     new_message = {"role": "user", "content": input_data}
     processed_history = format_conversation_history(chat_history)
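The diff assumes generate_response is wired into a Gradio chat UI elsewhere in app.py. A hypothetical wiring consistent with its signature; component choices and default values here are illustrative, not taken from this commit:

import gradio as gr

# gr.ChatInterface calls fn as (message, history, *additional_inputs),
# matching generate_response's parameter order.
demo = gr.ChatInterface(
    fn=generate_response,
    additional_inputs=[
        gr.Slider(64, 4096, value=1024, label="max_new_tokens"),
        gr.Textbox(label="system_prompt"),
        gr.Slider(0.0, 2.0, value=0.7, label="temperature"),
        gr.Slider(0.0, 1.0, value=0.9, label="top_p"),
        gr.Slider(1, 100, value=50, label="top_k"),
        gr.Slider(1.0, 2.0, value=1.1, label="repetition_penalty"),
    ],
)
demo.launch()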