anaspro committed
Commit: a645494
Parent(s): 3e07df2

Add model caching with lru_cache for ZeroGPU

- Use @lru_cache to cache the loaded model
- Prevents reloading the model on every request
- First load: ~18 seconds
- Subsequent loads: instant (cached)
- Works better with ZeroGPU ephemeral processes
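For reference, the behavior this commit relies on is standard functools.lru_cache semantics: a zero-argument function decorated with @lru_cache(maxsize=1) executes its body on the first call and returns the same cached object on every later call in the same process. A minimal sketch (load_expensive_resource and the 2-second sleep are illustrative stand-ins for the ~18-second pipeline load, not code from this commit):

    from functools import lru_cache
    import time

    @lru_cache(maxsize=1)
    def load_expensive_resource():
        """The body runs only on the first call; later calls hit the cache."""
        time.sleep(2)  # stand-in for the slow model load
        return object()

    first = load_expensive_resource()   # slow: body executes
    second = load_expensive_resource()  # instant: cached result
    assert first is second              # same object, no reload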
app.py CHANGED

@@ -3,6 +3,7 @@ import gradio as gr
 import spaces
 import re
 from threading import Thread
+from functools import lru_cache
 from transformers import pipeline, TextIteratorStreamer
 from huggingface_hub import login
 import logging

@@ -68,8 +69,22 @@ model_id = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
 # Load harmony encoding (lightweight, can load outside GPU)
 enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
 
-#
-
+# ======================================================
+# Cached Model Loader (for ZeroGPU)
+# ======================================================
+@lru_cache(maxsize=1)
+def load_model():
+    """Load model with caching to avoid reloading"""
+    logger.info("🚀 Loading GPT-OSS-20B model on GPU...")
+    model_pipe = pipeline(
+        "text-generation",
+        model=model_id,
+        torch_dtype="auto",
+        device_map="auto",
+        trust_remote_code=True,
+    )
+    logger.info("✅ Model loaded successfully!")
+    return model_pipe
 
 # ======================================================
 # Format Conversation History

@@ -92,19 +107,8 @@ def format_conversation_history(chat_history):
 def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
     """Generate response using GPT-OSS with Harmony format"""
 
-
-
-    # Load pipeline inside GPU context (for ZeroGPU)
-    if pipe is None:
-        logger.info("🚀 Loading GPT-OSS-20B model on GPU...")
-        pipe = pipeline(
-            "text-generation",
-            model=model_id,
-            torch_dtype="auto",
-            device_map="auto",
-            trust_remote_code=True,
-        )
-        logger.info("✅ Model loaded successfully!")
+    # Get cached model (loads only once)
+    pipe = load_model()
 
     # Create new user message
     new_message = {"role": "user", "content": input_data}
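For context, on ZeroGPU Spaces the GPU-bound work runs inside a function decorated with @spaces.GPU, and an lru_cache only persists while the same Python process keeps handling requests. A minimal sketch of how the cached loader pairs with that decorator (the generate handler and its reduced argument list are illustrative, not the full signature from app.py):

    import spaces
    from functools import lru_cache
    from transformers import pipeline

    @lru_cache(maxsize=1)
    def load_model():
        # Body runs once per process; later calls return the cached pipeline.
        return pipeline(
            "text-generation",
            model="unsloth/gpt-oss-20b-unsloth-bnb-4bit",
            torch_dtype="auto",
            device_map="auto",
            trust_remote_code=True,
        )

    @spaces.GPU
    def generate(prompt, max_new_tokens=128):
        pipe = load_model()  # first call loads (~18 s); later calls are instant
        return pipe(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]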