Commit b07886a · Parent(s): a2cebb0 · fixes

app.py CHANGED
@@ -25,6 +25,7 @@ try:
     print(f"Status: {response.status_code}")
     if response.status_code == 200:
         print("Model exists and is accessible")
+        print(f"Response: {response.text[:200]}...")
     else:
         print(f"Response: {response.text}")
 except Exception as e:
@@ -32,22 +33,46 @@ except Exception as e:
 
 # Global variable to track model status
 model_loaded = False
-model_loading = False
 estimated_time = None
+use_simple_format = True  # Toggle to use simpler format instead of chat format
 
-def query_model(messages, parameters=None):
-    """
+def format_prompt(messages):
+    """Format chat messages into a text prompt that Mistral models can understand"""
+    if use_simple_format:
+        # Simple format - just extract the message content
+        system = next((m["content"] for m in messages if m["role"] == "system"), "")
+        last_user_msg = next((m["content"] for m in reversed(messages) if m["role"] == "user"), "")
+
+        if system:
+            return f"{system}\n\nQuestion: {last_user_msg}\n\nAnswer:"
+        else:
+            return f"Question: {last_user_msg}\n\nAnswer:"
+    else:
+        # Chat format for Mistral models
+        formatted = ""
+        for msg in messages:
+            if msg["role"] == "system":
+                formatted += f"<s>[INST] {msg['content']} [/INST]</s>\n"
+            elif msg["role"] == "user":
+                formatted += f"<s>[INST] {msg['content']} [/INST]"
+            elif msg["role"] == "assistant":
+                formatted += f" {msg['content']} </s>\n"
+        return formatted
+
+def query_model_text_generation(prompt, parameters=None):
+    """Query the model using the text generation API endpoint"""
     payload = {
-        "inputs": messages,
+        "inputs": prompt,
     }
 
     if parameters:
         payload["parameters"] = parameters
 
-    print(f"Sending query to API...")
+    print(f"Sending text generation query to API...")
+    print(f"Prompt: {prompt[:100]}...")
 
     try:
-        #
+        # Try with longer timeout
         response = requests.post(
             API_URL,
             headers=headers,
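For reference, here is what the new format_prompt helper produces for a short conversation. The sample messages are made up for illustration; both outputs follow directly from the function as committed:

messages = [
    {"role": "system", "content": "You manage Microsoft 365 data."},
    {"role": "user", "content": "How do I export a mailbox?"},
]

# With use_simple_format = True:
#   You manage Microsoft 365 data.
#
#   Question: How do I export a mailbox?
#
#   Answer:
#
# With use_simple_format = False (Mistral [INST] chat format):
#   <s>[INST] You manage Microsoft 365 data. [/INST]</s>
#   <s>[INST] How do I export a mailbox? [/INST]
print(format_prompt(messages))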
@@ -59,6 +84,7 @@ def query_model(messages, parameters=None):
 
         # If successful, return the response
         if response.status_code == 200:
+            print(f"Success! Response: {str(response.text)[:200]}...")
             return response.json()
 
         # If model is loading, handle it
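The loading logic here leans on a behavior of the Hugging Face Inference API: while a model is cold, the endpoint answers 503 with a JSON body that includes an estimated_time field in seconds. A minimal standalone sketch of that probe (the URL and token are placeholders, not values from this Space):

import requests

API_URL = "https://api-inference.huggingface.co/models/<model-id>"  # placeholder
headers = {"Authorization": "Bearer hf_..."}                        # placeholder token

def model_ready_in():
    """Return 0.0 if the model is up, else the API's estimated load time in seconds."""
    r = requests.post(API_URL, headers=headers, json={"inputs": "ping"}, timeout=30)
    if r.status_code == 503:
        return float(r.json().get("estimated_time", 60.0))  # fall back to a guess
    r.raise_for_status()
    return 0.0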
@@ -88,7 +114,7 @@ def respond(
 ):
     """Respond to user messages"""
 
-    # Create the messages list
+    # Create the messages list
     messages = [{"role": "system", "content": system_message}]
 
     for val in history:
@@ -99,12 +125,16 @@ def respond(
 
     messages.append({"role": "user", "content": message})
 
-    #
+    # Format the prompt
+    prompt = format_prompt(messages)
+
+    # Set up the generation parameters
     parameters = {
         "max_new_tokens": max_tokens,
         "temperature": temperature,
         "top_p": top_p,
-        "do_sample": True
+        "do_sample": True,
+        "return_full_text": False  # Only return the generated text, not the prompt
     }
 
     # Initial message about model status
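All five keys in the parameters dict are standard text-generation options on the Inference API. A representative payload, with illustrative values rather than the app's defaults:

prompt = "Question: How do I export a mailbox?\n\nAnswer:"  # e.g. from format_prompt()
payload = {
    "inputs": prompt,
    "parameters": {
        "max_new_tokens": 256,      # hard cap on generated tokens
        "temperature": 0.7,         # only matters when do_sample=True
        "top_p": 0.95,              # nucleus-sampling cutoff
        "do_sample": True,          # sample instead of greedy decoding
        "return_full_text": False,  # strip the echoed prompt from the output
    },
}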
@@ -126,43 +156,38 @@ def respond(
             time.sleep(wait_time)
 
         try:
-            # Query the model
-            result = query_model(messages, parameters)
+            # Query the model using text generation
+            result = query_model_text_generation(prompt, parameters)
 
             if result:
                 # Handle different response formats
-
-                # List format with generated_text
                 if isinstance(result, list) and len(result) > 0:
-                    if "generated_text" in result[0]:
+                    if isinstance(result[0], dict) and "generated_text" in result[0]:
                         yield result[0]["generated_text"]
                         return
-
-                # Direct message format
+
                 if isinstance(result, dict) and "generated_text" in result:
                     yield result["generated_text"]
                     return
-
-                # String format
-                if isinstance(result, str):
-                    yield result
-                    return
-
-                # Raw format as fallback
+
+                # String or other format
                 yield str(result)
                 return
 
             # If model is still loading, get the latest estimate
             if estimated_time and attempt < max_retries - 1:
-
-
-
-
+                try:
+                    response = requests.get(API_URL, headers=headers)
+                    if response.status_code == 503 and "estimated_time" in response.json():
+                        estimated_time = response.json()["estimated_time"]
+                        print(f"Updated loading time: {estimated_time:.0f} seconds")
+                except:
+                    pass
 
         except Exception as e:
             print(f"Error in attempt {attempt+1}: {str(e)}")
             if attempt == max_retries - 1:
-                yield f"""❌ Sorry, I couldn't generate a response after
+                yield f"""❌ Sorry, I couldn't generate a response after multiple attempts.
 
 Error details: {str(e)}
 
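The response-shape branching above can be read as one small normalizer. This is a hypothetical refactor sketch that mirrors the committed logic; it is not code from the commit:

def extract_text(result):
    """Collapse the API's possible response shapes into one string."""
    # List of dicts: [{"generated_text": "..."}]
    if isinstance(result, list) and result and isinstance(result[0], dict):
        return result[0].get("generated_text", str(result))
    # Single dict: {"generated_text": "..."}
    if isinstance(result, dict) and "generated_text" in result:
        return result["generated_text"]
    # Anything else (e.g. a bare string): stringify as a fallback
    return str(result)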
@@ -176,7 +201,10 @@ This could be due to:
 2. The model being too large for the current hardware
 3. Temporary service issues
 
-Please try again later.
+Please try again later. For best results with large models like Mistral-7B, consider:
+- Using a smaller model
+- Creating a 4-bit quantized version
+- Using Hugging Face Inference Endpoints instead of Spaces"""
 
 
 """
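On the "4-bit quantized version" suggestion in the new error text: with transformers plus bitsandbytes that is roughly the following. A hedged sketch only; the model id is a placeholder and GPU hardware is assumed:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
model = AutoModelForCausalLM.from_pretrained(
    "your-org/your-mistral-finetune",  # placeholder model id
    quantization_config=bnb,
    device_map="auto",  # requires accelerate; places weights on available devices
)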
@@ -197,11 +225,9 @@ demo = gr.ChatInterface(
         ),
     ],
     description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
-    The model is accessed via the Hugging Face Inference API.
     First requests may take 2-3 minutes as the model loads."""
 )
 
 
 if __name__ == "__main__":
-    # Launch the app
     demo.launch()
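For context, the interface being edited boils down to the following shape. This is a sketch assuming the respond generator above; the slider ranges and defaults are illustrative, not the Space's actual values:

import gradio as gr

demo = gr.ChatInterface(
    fn=respond,  # the streaming generator patched in this commit
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
    description="This interface uses a fine-tuned Mistral model for Microsoft 365 data management.",
)

if __name__ == "__main__":
    demo.launch()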