anaspro committed
Commit · 3e07df2 · 1 Parent(s): 177c43d
Fix ZeroGPU compatibility - load model inside GPU context
- Move pipeline creation inside @spaces.GPU decorator (see the sketch after this list)
- Model loads on first request (lazy loading)
- Prevents loading on CPU before GPU is available
- Compatible with ZeroGPU free tier
- Model persists across requests within GPU duration
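For context, a minimal sketch of the lazy-loading pattern this commit adopts. The @spaces.GPU decorator itself sits outside the hunks shown below, and the function name and signature here are illustrative, not the app's actual code:

import spaces
from transformers import pipeline

model_id = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
pipe = None  # created lazily, inside the GPU context

@spaces.GPU  # ZeroGPU attaches a GPU only while this function runs
def generate(prompt: str) -> str:
    global pipe
    if pipe is None:
        # First request pays the load cost; later requests reuse the
        # pipeline because the warm process keeps the global alive.
        pipe = pipeline(
            "text-generation",
            model=model_id,
            torch_dtype="auto",
            device_map="auto",
        )
    return pipe(prompt, max_new_tokens=64)[0]["generated_text"]

Because ZeroGPU keeps the process warm between requests, the module-level pipe survives across calls, so the None check loads the 20B model at most once per process rather than once per request.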
app.py
CHANGED
@@ -61,23 +61,15 @@ def parse_reasoning_and_instructions(system_prompt: str):
     return effort, cleaned_instructions
 
 # ======================================================
-#
+# Model Configuration
 # ======================================================
-logger.info("🚀 Loading GPT-OSS-20B model...")
-
 model_id = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
 
-pipe = pipeline(
-    "text-generation",
-    model=model_id,
-    torch_dtype="auto",
-    device_map="auto",
-    trust_remote_code=True,
-)
-
+# Load harmony encoding (lightweight, can load outside GPU)
 enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
 
-
+# Pipeline will be created inside @spaces.GPU function
+pipe = None
 
 # ======================================================
 # Format Conversation History
@@ -100,6 +92,20 @@ def format_conversation_history(chat_history):
 def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
     """Generate response using GPT-OSS with Harmony format"""
 
+    global pipe
+
+    # Load pipeline inside GPU context (for ZeroGPU)
+    if pipe is None:
+        logger.info("🚀 Loading GPT-OSS-20B model on GPU...")
+        pipe = pipeline(
+            "text-generation",
+            model=model_id,
+            torch_dtype="auto",
+            device_map="auto",
+            trust_remote_code=True,
+        )
+        logger.info("✅ Model loaded successfully!")
+
     # Create new user message
     new_message = {"role": "user", "content": input_data}
     processed_history = format_conversation_history(chat_history)
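The diff assumes generate_response is wired into a Gradio chat UI elsewhere in app.py. A hypothetical wiring consistent with its signature; component choices and default values here are illustrative, not taken from this commit:

import gradio as gr

# gr.ChatInterface calls fn as (message, history, *additional_inputs),
# matching generate_response's parameter order.
demo = gr.ChatInterface(
    fn=generate_response,
    additional_inputs=[
        gr.Slider(64, 4096, value=1024, label="max_new_tokens"),
        gr.Textbox(label="system_prompt"),
        gr.Slider(0.0, 2.0, value=0.7, label="temperature"),
        gr.Slider(0.0, 1.0, value=0.9, label="top_p"),
        gr.Slider(1, 100, value=50, label="top_k"),
        gr.Slider(1.0, 2.0, value=1.1, label="repetition_penalty"),
    ],
)
demo.launch()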