anaspro committed
Commit 3e07df2 · 1 Parent(s): 177c43d

Fix ZeroGPU compatibility - load model inside GPU context

- Move pipeline creation inside the @spaces.GPU-decorated function
- Model loads on the first request (lazy loading)
- Prevents loading on CPU before a GPU is available
- Compatible with the ZeroGPU free tier
- Model persists across requests within the GPU allocation duration
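
For context, a minimal sketch of the pattern this commit applies, assuming the Hugging Face `spaces` package and a `transformers` pipeline. The signature is simplified, and the `@spaces.GPU` decorator placement is inferred from the commit description rather than shown in the diff below:

```python
import spaces
from transformers import pipeline

model_id = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
pipe = None  # created lazily, only once a GPU has been attached


@spaces.GPU  # ZeroGPU attaches a GPU only for the duration of this call
def generate_response(prompt: str) -> str:
    """Simplified stand-in for app.py's generate_response."""
    global pipe
    if pipe is None:
        # First request: build the pipeline while the GPU context is active,
        # then reuse it for later requests in the same worker process.
        pipe = pipeline(
            "text-generation",
            model=model_id,
            torch_dtype="auto",
            device_map="auto",
            trust_remote_code=True,
        )
    return pipe(prompt, max_new_tokens=128)[0]["generated_text"]
```

Loading inside the decorated function rather than at import time is what keeps the Space bootable on ZeroGPU: no GPU exists at startup, so model initialization at module scope would either fail or fall back to CPU.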

Files changed (1)
  1. app.py +18 -12
app.py CHANGED
@@ -61,23 +61,15 @@ def parse_reasoning_and_instructions(system_prompt: str):
     return effort, cleaned_instructions
 
 # ======================================================
-# Load Model and Harmony Encoding
+# Model Configuration
 # ======================================================
-logger.info("🚀 Loading GPT-OSS-20B model...")
-
 model_id = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
 
-pipe = pipeline(
-    "text-generation",
-    model=model_id,
-    torch_dtype="auto",
-    device_map="auto",
-    trust_remote_code=True,
-)
-
+# Load harmony encoding (lightweight, can load outside GPU)
 enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
 
-logger.info("✅ Model and harmony encoding loaded successfully!")
+# Pipeline will be created inside @spaces.GPU function
+pipe = None
 
 # ======================================================
 # Format Conversation History
@@ -100,6 +92,20 @@ def format_conversation_history(chat_history):
 def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
     """Generate response using GPT-OSS with Harmony format"""
 
+    global pipe
+
+    # Load pipeline inside GPU context (for ZeroGPU)
+    if pipe is None:
+        logger.info("🚀 Loading GPT-OSS-20B model on GPU...")
+        pipe = pipeline(
+            "text-generation",
+            model=model_id,
+            torch_dtype="auto",
+            device_map="auto",
+            trust_remote_code=True,
+        )
+        logger.info("✅ Model loaded successfully!")
+
     # Create new user message
     new_message = {"role": "user", "content": input_data}
     processed_history = format_conversation_history(chat_history)