Spaces:

Trinoid
/

Data_Management_Mistral

Sleeping

App Files Community

Frankie-walsh4 committed on Apr 3, 2025

Commit

387c509

1 Parent(s): 8c02af0

fixes

Browse files

Files changed (1) hide show

app.py +76 -9

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import os
 import time
 import json
 import requests
 from huggingface_hub.errors import HfHubHTTPError
 """
@@ -25,6 +26,55 @@ else:
 API_URL = "https://api-inference.huggingface.co/models/Trinoid/Data_Management_Mistral"
 headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
 def respond(
     message,
@@ -34,6 +84,12 @@ def respond(
     temperature,
     top_p,
 ):
     messages = [{"role": "system", "content": system_message}]
     for val in history:
@@ -50,7 +106,7 @@ def respond(
     print(f"Sending messages: {json.dumps(messages, indent=2)}")
     # Try to initialize the model with retries
-    max_retries = 3
     retry_count = 0
     # Try both methods: InferenceClient and direct API call
@@ -68,12 +124,15 @@ def respond(
                     stream=True,
                     temperature=temperature,
                     top_p=top_p,
                 ):
                     token = message.choices[0].delta.content
                     if token:
                         response += token
                         yield response
                 # If we got here, we were successful
                 break
             else:
                 # Method 2: Direct API call
@@ -88,7 +147,7 @@ def respond(
                 }
                 print(f"Making direct API call to {API_URL}")
-                api_response = requests.post(API_URL, headers=headers, json=payload)
                 print(f"API response status: {api_response.status_code}")
                 if api_response.status_code == 200:
@@ -97,6 +156,7 @@ def respond(
                     if isinstance(result, list) and len(result) > 0 and "generated_text" in result[0]:
                         response = result[0]["generated_text"]
                         yield response
                         break
                     else:
                         print(f"Unexpected API response format: {result}")
@@ -105,8 +165,9 @@ def respond(
                     print(f"API error: {api_response.text}")
                     if api_response.status_code == 504 and retry_count < max_retries - 1:
                         retry_count += 1
                         yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
-                        time.sleep(10)
                     else:
                         yield f"❌ API error: {api_response.status_code} - {api_response.text}"
                         break
@@ -118,15 +179,16 @@ def respond(
             if "504 Server Error: Gateway Timeout" in error_message:
                 if retry_count < max_retries - 1:
-                    wait_time = 10  # seconds
                     print(f"Model timed out. Waiting {wait_time} seconds before retry {retry_count}/{max_retries}...")
                     yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
                     time.sleep(wait_time)
-                    # Try direct API on next attempt
-                    use_direct_api = True
                 else:
                     print("All retries failed.")
-                    yield "❌ The model timed out after multiple attempts. Try again in a few minutes."
                     break
             else:
                 print(f"Non-timeout error: {error_message}")
@@ -146,7 +208,7 @@ For information on how to customize the ChatInterface, peruse the gradio docs: h
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
@@ -157,9 +219,14 @@ demo = gr.ChatInterface(
             label="Top-p (nucleus sampling)",
         ),
     ],
-    description="This interface uses your fine-tuned Mistral model for Microsoft 365 data management. The first request may take some time as the model loads."
 )
 if __name__ == "__main__":
     demo.launch()

 import time
 import json
 import requests
+import threading
 from huggingface_hub.errors import HfHubHTTPError
 """
 API_URL = "https://api-inference.huggingface.co/models/Trinoid/Data_Management_Mistral"
 headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
+# Global variable to track if model is warmed up
+model_warmed_up = False
+warming_up = False
+def warm_up_model():
+    """Send a warmup request to get the model loaded before user interaction"""
+    global warming_up, model_warmed_up
+    if warming_up:
+        return  # Already warming up
+    warming_up = True
+    print("Starting model warm-up...")
+    # Simple warmup message
+    warmup_messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hello"}
+    ]
+    # Try direct API approach first
+    try:
+        payload = {
+            "inputs": warmup_messages,
+            "parameters": {
+                "max_new_tokens": 5,  # Just need a short response
+                "temperature": 0.1,
+                "top_p": 0.95,
+            },
+            "stream": False,
+        }
+        print("Sending warmup request...")
+        response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
+        if response.status_code == 200:
+            print("Warmup successful!")
+            model_warmed_up = True
+        else:
+            print(f"Warmup API call failed with status {response.status_code}")
+            print(f"Response: {response.text}")
+    except Exception as e:
+        print(f"Warmup exception: {str(e)}")
+    # Even if it failed, mark as no longer warming up
+    warming_up = False
+# Start warmup in background thread
+threading.Thread(target=warm_up_model, daemon=True).start()
 def respond(
     message,
     temperature,
     top_p,
 ):
+    global model_warmed_up
+    # If model isn't warmed up yet, give a message
+    if not model_warmed_up:
+        yield "⌛ Model is being loaded for the first time, this may take up to a minute. Please be patient..."
     messages = [{"role": "system", "content": system_message}]
     for val in history:
     print(f"Sending messages: {json.dumps(messages, indent=2)}")
     # Try to initialize the model with retries
+    max_retries = 5  # Increased from 3 to 5
     retry_count = 0
     # Try both methods: InferenceClient and direct API call
                     stream=True,
                     temperature=temperature,
                     top_p=top_p,
+                    timeout=30,  # Increased timeout
                 ):
                     token = message.choices[0].delta.content
                     if token:
                         response += token
                         yield response
                 # If we got here, we were successful
+                model_warmed_up = True
                 break
             else:
                 # Method 2: Direct API call
                 }
                 print(f"Making direct API call to {API_URL}")
+                api_response = requests.post(API_URL, headers=headers, json=payload, timeout=60)  # Increased timeout
                 print(f"API response status: {api_response.status_code}")
                 if api_response.status_code == 200:
                     if isinstance(result, list) and len(result) > 0 and "generated_text" in result[0]:
                         response = result[0]["generated_text"]
                         yield response
+                        model_warmed_up = True
                         break
                     else:
                         print(f"Unexpected API response format: {result}")
                     print(f"API error: {api_response.text}")
                     if api_response.status_code == 504 and retry_count < max_retries - 1:
                         retry_count += 1
+                        wait_time = 15  # Increased wait time
                         yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
+                        time.sleep(wait_time)
                     else:
                         yield f"❌ API error: {api_response.status_code} - {api_response.text}"
                         break
             if "504 Server Error: Gateway Timeout" in error_message:
                 if retry_count < max_retries - 1:
+                    wait_time = 15  # Increased wait time
                     print(f"Model timed out. Waiting {wait_time} seconds before retry {retry_count}/{max_retries}...")
                     yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
                     time.sleep(wait_time)
+                    # Try direct API on next attempt if we've tried InferenceClient twice
+                    if retry_count >= 2:
+                        use_direct_api = True
                 else:
                     print("All retries failed.")
+                    yield "❌ The model timed out after multiple attempts. Your model is probably too large for the free tier. Try again in a few minutes or consider using a smaller model."
                     break
             else:
                 print(f"Non-timeout error: {error_message}")
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
+        gr.Textbox(value="You are a data management expert specializing in Microsoft 365 services.", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             label="Top-p (nucleus sampling)",
         ),
     ],
+    description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
+    ⚠️ Note: This model needs time to load when first used. You may experience a delay of up to 60 seconds on your first message."""
 )
 if __name__ == "__main__":
+    # Start model warmup
+    warm_up_model()
+    # Launch the app
     demo.launch()