AI-Talent-Force Claude Sonnet 4.5 committed on
Commit
eaa113d
·
1 Parent(s): c8d6960

Optimize inference speed and fix Spaces compatibility

Browse files

- Updated spaces to >=0.43.0 (fixes hot-reload error)
- Fixed Gradio 6.0 theme deprecation warning
- Added GPU duration=60s to keep GPU allocated between requests
- Reduced max_new_tokens from 512 to 256 for faster responses
- Limited conversation history to last 5 exchanges for speed
- Reduced tokenization max_length from 4096 to 2048
- Added use_cache=True for faster generation
- Disabled SSR mode in launch

These changes should significantly reduce response time.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +11 -9
  2. requirements.txt +1 -1
app.py CHANGED
@@ -47,7 +47,7 @@ print("🎯 CEO AI EXECUTIVE IS READY!")
47
  print("=" * 60)
48
  print("Model is loaded in memory and ready for fast inference.\n")
49
 
50
- @spaces.GPU
51
  def chat_with_ceo(message, history):
52
  """
53
  Chat function that responds like the CEO
@@ -55,9 +55,10 @@ def chat_with_ceo(message, history):
55
  message: User's current message
56
  history: List of previous messages [[user_msg, bot_msg], ...]
57
  """
58
- # Build conversation context
59
  conversation = []
60
- for user_msg, bot_msg in history:
 
61
  conversation.append({"role": "user", "content": user_msg})
62
  conversation.append({"role": "assistant", "content": bot_msg})
63
 
@@ -71,20 +72,21 @@ def chat_with_ceo(message, history):
71
  )
72
 
73
  # Tokenize
74
- inputs = tokenizer(prompt, return_tensors="pt", truncate=True, max_length=4096)
75
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
76
 
77
- # Generate response
78
  with torch.no_grad():
79
  outputs = model.generate(
80
  **inputs,
81
- max_new_tokens=512,
82
  temperature=0.7,
83
  top_p=0.9,
84
  do_sample=True,
85
  repetition_penalty=1.1,
86
  pad_token_id=tokenizer.pad_token_id,
87
- eos_token_id=tokenizer.eos_token_id
 
88
  )
89
 
90
  # Decode response
@@ -92,7 +94,7 @@ def chat_with_ceo(message, history):
92
  return response
93
 
94
  # Create Gradio interface
95
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
96
  gr.Markdown(
97
  """
98
  # 🎯 CEO AI Executive
@@ -159,4 +161,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
159
 
160
  if __name__ == "__main__":
161
  demo.queue()
162
- demo.launch()
 
47
  print("=" * 60)
48
  print("Model is loaded in memory and ready for fast inference.\n")
49
 
50
+ @spaces.GPU(duration=60)
51
  def chat_with_ceo(message, history):
52
  """
53
  Chat function that responds like the CEO
 
55
  message: User's current message
56
  history: List of previous messages [[user_msg, bot_msg], ...]
57
  """
58
+ # Build conversation context (limit history to last 5 exchanges for speed)
59
  conversation = []
60
+ recent_history = history[-5:] if len(history) > 5 else history
61
+ for user_msg, bot_msg in recent_history:
62
  conversation.append({"role": "user", "content": user_msg})
63
  conversation.append({"role": "assistant", "content": bot_msg})
64
 
 
72
  )
73
 
74
  # Tokenize
75
+ inputs = tokenizer(prompt, return_tensors="pt", truncate=True, max_length=2048)
76
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
77
 
78
+ # Generate response with optimized parameters for speed
79
  with torch.no_grad():
80
  outputs = model.generate(
81
  **inputs,
82
+ max_new_tokens=256,
83
  temperature=0.7,
84
  top_p=0.9,
85
  do_sample=True,
86
  repetition_penalty=1.1,
87
  pad_token_id=tokenizer.pad_token_id,
88
+ eos_token_id=tokenizer.eos_token_id,
89
+ use_cache=True
90
  )
91
 
92
  # Decode response
 
94
  return response
95
 
96
  # Create Gradio interface
97
+ with gr.Blocks() as demo:
98
  gr.Markdown(
99
  """
100
  # 🎯 CEO AI Executive
 
161
 
162
  if __name__ == "__main__":
163
  demo.queue()
164
+ demo.launch(share=False, ssr_mode=False)
requirements.txt CHANGED
@@ -4,6 +4,6 @@ torch==2.5.1
4
  peft==0.18.1
5
  accelerate==1.2.1
6
  safetensors==0.4.5
7
- spaces==0.30.3
8
  bitsandbytes>=0.46.1
9
  audioop-lts
 
4
  peft==0.18.1
5
  accelerate==1.2.1
6
  safetensors==0.4.5
7
+ spaces>=0.43.0
8
  bitsandbytes>=0.46.1
9
  audioop-lts