AI-Talent-Force Claude Sonnet 4.5 committed on
Commit
eaa113d
·
1 Parent(s): c8d6960

Optimize inference speed and fix Spaces compatibility

Browse files

- Updated spaces to >=0.43.0 (fixes hot-reload error)
- Fixed Gradio 6.0 theme deprecation warning
- Added GPU duration=60s to keep GPU allocated between requests
- Reduced max_new_tokens from 512 to 256 for faster responses
- Limited conversation history to last 5 exchanges for speed
- Reduced tokenization max_length from 4096 to 2048
- Added use_cache=True for faster generation
- Disabled SSR mode in launch

These changes should significantly reduce response time.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +11 -9
  2. requirements.txt +1 -1
app.py CHANGED
@@ -47,7 +47,7 @@ print("🎯 CEO AI EXECUTIVE IS READY!")
47
  print("=" * 60)
48
  print("Model is loaded in memory and ready for fast inference.\n")
49
 
50
- @spaces.GPU
51
  def chat_with_ceo(message, history):
52
  """
53
  Chat function that responds like the CEO
@@ -55,9 +55,10 @@ def chat_with_ceo(message, history):
55
  message: User's current message
56
  history: List of previous messages [[user_msg, bot_msg], ...]
57
  """
58
- # Build conversation context
59
  conversation = []
60
- for user_msg, bot_msg in history:
 
61
  conversation.append({"role": "user", "content": user_msg})
62
  conversation.append({"role": "assistant", "content": bot_msg})
63
 
@@ -71,20 +72,21 @@ def chat_with_ceo(message, history):
71
  )
72
 
73
  # Tokenize
74
- inputs = tokenizer(prompt, return_tensors="pt", truncate=True, max_length=4096)
75
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
76
 
77
- # Generate response
78
  with torch.no_grad():
79
  outputs = model.generate(
80
  **inputs,
81
- max_new_tokens=512,
82
  temperature=0.7,
83
  top_p=0.9,
84
  do_sample=True,
85
  repetition_penalty=1.1,
86
  pad_token_id=tokenizer.pad_token_id,
87
- eos_token_id=tokenizer.eos_token_id
 
88
  )
89
 
90
  # Decode response
@@ -92,7 +94,7 @@ def chat_with_ceo(message, history):
92
  return response
93
 
94
  # Create Gradio interface
95
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
96
  gr.Markdown(
97
  """
98
  # 🎯 CEO AI Executive
@@ -159,4 +161,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
159
 
160
  if __name__ == "__main__":
161
  demo.queue()
162
- demo.launch()
 
47
  print("=" * 60)
48
  print("Model is loaded in memory and ready for fast inference.\n")
49
 
50
+ @spaces.GPU(duration=60)
51
  def chat_with_ceo(message, history):
52
  """
53
  Chat function that responds like the CEO
 
55
  message: User's current message
56
  history: List of previous messages [[user_msg, bot_msg], ...]
57
  """
58
+ # Build conversation context (limit history to last 5 exchanges for speed)
59
  conversation = []
60
+ recent_history = history[-5:] if len(history) > 5 else history
61
+ for user_msg, bot_msg in recent_history:
62
  conversation.append({"role": "user", "content": user_msg})
63
  conversation.append({"role": "assistant", "content": bot_msg})
64
 
 
72
  )
73
 
74
  # Tokenize
75
+ inputs = tokenizer(prompt, return_tensors="pt", truncate=True, max_length=2048)
76
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
77
 
78
+ # Generate response with optimized parameters for speed
79
  with torch.no_grad():
80
  outputs = model.generate(
81
  **inputs,
82
+ max_new_tokens=256,
83
  temperature=0.7,
84
  top_p=0.9,
85
  do_sample=True,
86
  repetition_penalty=1.1,
87
  pad_token_id=tokenizer.pad_token_id,
88
+ eos_token_id=tokenizer.eos_token_id,
89
+ use_cache=True
90
  )
91
 
92
  # Decode response
 
94
  return response
95
 
96
  # Create Gradio interface
97
+ with gr.Blocks() as demo:
98
  gr.Markdown(
99
  """
100
  # 🎯 CEO AI Executive
 
161
 
162
  if __name__ == "__main__":
163
  demo.queue()
164
+ demo.launch(share=False, ssr_mode=False)
requirements.txt CHANGED
@@ -4,6 +4,6 @@ torch==2.5.1
4
  peft==0.18.1
5
  accelerate==1.2.1
6
  safetensors==0.4.5
7
- spaces==0.30.3
8
  bitsandbytes>=0.46.1
9
  audioop-lts
 
4
  peft==0.18.1
5
  accelerate==1.2.1
6
  safetensors==0.4.5
7
+ spaces>=0.43.0
8
  bitsandbytes>=0.46.1
9
  audioop-lts