Frankie-walsh4 committed on
Commit
a03b5fc
·
1 Parent(s): 3908e5f

Change response handling to filter out AI "thinking out loud" output

Browse files
Files changed (1) hide show
  1. app.py +91 -60
app.py CHANGED
@@ -2,12 +2,39 @@ import gradio as gr
2
  from huggingface_hub import InferenceClient
3
  import time
4
  import html
 
5
 
6
  """
7
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
8
  """
9
  client = InferenceClient("Trinoid/Data_Management")
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def respond(
13
  message,
@@ -17,17 +44,19 @@ def respond(
17
  temperature,
18
  top_p,
19
  ):
20
- # Add a special instruction to the system message to prevent thinking out loud and repetition
21
- enhanced_system_message = system_message + """
22
-
23
- IMPORTANT INSTRUCTION: You must provide direct, authoritative answers based on your knowledge.
24
- DO NOT reveal your internal thinking process, planning, or self-questioning.
25
- DO NOT say phrases like "I need to figure out" or "I'll start by researching".
26
- DO NOT describe your approach to answering the question.
27
- DO NOT repeat yourself or get stuck in loops of similar content.
28
- Keep your response focused, structured, and concise.
29
- INSTEAD, provide concise, structured, and factual information directly.
30
- Answer as an authoritative expert with deep knowledge of Microsoft 365 services."""
 
 
31
 
32
  messages = [{"role": "system", "content": enhanced_system_message}]
33
 
@@ -39,63 +68,65 @@ Answer as an authoritative expert with deep knowledge of Microsoft 365 services.
39
 
40
  messages.append({"role": "user", "content": message})
41
 
 
42
  thinking_steps = []
43
  full_response = ""
44
  start_time = time.time()
45
- repetition_count = 0
46
- last_segment = ""
47
 
48
- # Use chat completion instead of text generation
49
- for message in client.chat_completion(
50
- messages,
51
- max_tokens=max_tokens,
52
- stream=True,
53
- temperature=temperature,
54
- top_p=top_p,
55
- ):
56
- token = message.choices[0].delta.content
57
- if not token:
58
- continue
59
-
60
- # Check for repetition by comparing with previous chunk
61
- if len(full_response) > 100:
62
- last_100_chars = full_response[-100:]
63
- # If we find the same chunk repeating
64
- if last_100_chars in full_response[:-100] and last_100_chars.strip():
65
- repetition_count += 1
66
- # If we detect significant repetition, abort this generation
67
- if repetition_count > 2:
68
- # Trim off the repetitive part
69
- repetition_index = full_response.rfind(last_100_chars, 0, -100)
70
- if repetition_index > 0:
71
- full_response = full_response[:repetition_index] + "\n\n[Response trimmed to avoid repetition]"
72
  break
73
-
74
- full_response += token
75
-
76
- # Save thinking steps at intervals
77
- current_time = time.time()
78
- if current_time - start_time > 2 or len(full_response) % 150 == 0:
79
- start_time = current_time
80
- thinking_steps.append(full_response)
81
-
82
- # Store last segment for repetition detection
83
- if len(full_response) % 50 == 0:
84
- last_segment = full_response[-50:]
85
 
86
- # Format with thinking history as HTML
87
- if thinking_steps and len(thinking_steps) > 1: # Only show if we have multiple steps
88
- thinking_html = '<div class="thinking-wrapper"><details><summary>Show thinking process</summary><div class="thinking-steps">'
89
- for i, step in enumerate(thinking_steps[:-1]): # Exclude the current step
90
- # Escape HTML to prevent rendering issues
91
- safe_step = html.escape(step)
92
- thinking_html += f'<div class="thinking-step">Step {i+1}: {safe_step}</div>'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  thinking_html += '</div></details></div>'
94
 
95
- # Yield both thinking and current response
96
- yield f"{thinking_html}{full_response}"
97
- else:
98
- yield full_response
 
 
99
 
100
 
101
  # Custom CSS for Plant Wisdom.AI styling
 
2
  from huggingface_hub import InferenceClient
3
  import time
4
  import html
5
+ import re
6
 
7
  """
8
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
9
  """
10
  client = InferenceClient("Trinoid/Data_Management")
11
 
12
def clean_response(text):
    """Clean up a model response by removing thinking artifacts and repeats.

    Args:
        text: Raw response text produced by the model.

    Returns:
        The cleaned text: known "thinking out loud" phrases are deleted
        (case-insensitively, surrounding text kept), then paragraphs are
        deduplicated in order, dropping empty, repeated, or very short
        (<= 20 chars after stripping) fragments.
    """
    # Meta-commentary phrases the model sometimes leaks despite the
    # system prompt instructions; each match is removed verbatim.
    thinking_patterns = [
        r"I need to figure out",
        r"I'll start by",
        r"Let me try to",
        r"I'm trying to understand",
        r"First, I know that",
        r"I'll need to look into",
        r"I'm not entirely sure",
        r"I believe this is",
        r"I imagine it involves",
    ]

    for pattern in thinking_patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    # Deduplicate paragraphs while preserving their original order.
    # A companion set makes the membership test O(1) per paragraph
    # instead of re-scanning the accumulated list (O(n^2) overall).
    paragraphs = text.split('\n\n')
    unique_paragraphs = []
    seen = set()
    for p in paragraphs:
        # Skip empties, exact repeats, and fragments too short to be a
        # real paragraph. NOTE(review): this also drops short headings
        # or one-liners — confirm that is intended.
        if p and p not in seen and len(p.strip()) > 20:
            unique_paragraphs.append(p)
            seen.add(p)

    return '\n\n'.join(unique_paragraphs)
38
 
39
  def respond(
40
  message,
 
44
  temperature,
45
  top_p,
46
  ):
47
+ # Create a more structured system prompt
48
+ enhanced_system_message = f"""
49
+ {system_message}
50
+
51
+ IMPORTANT INSTRUCTIONS FOR YOUR RESPONSES:
52
+ 1. PROVIDE DIRECT, AUTHORITATIVE, AND COMPLETE ANSWERS ABOUT MICROSOFT 365 AND DATA MANAGEMENT.
53
+ 2. DO NOT USE PHRASES LIKE "I think", "I believe", "I'm not sure", "I'll try to", "First, I need to".
54
+ 3. DO NOT INCLUDE YOUR THINKING PROCESS IN RESPONSES.
55
+ 4. USE CLEAR STRUCTURE WITH HEADINGS AND BULLET POINTS WHERE APPROPRIATE.
56
+ 5. BE CONCISE AND FOCUSED - AVOID UNNECESSARY REPETITION.
57
+ 6. WHEN ANSWERING QUESTIONS ABOUT DOCUMENT MANAGEMENT, PROVIDE SPECIFIC DETAILS ABOUT THE ACTUAL TOOLS AND FEATURES.
58
+ 7. ANSWER AS A MICROSOFT 365 EXPERT WITH AUTHORITATIVE KNOWLEDGE.
59
+ """
60
 
61
  messages = [{"role": "system", "content": enhanced_system_message}]
62
 
 
68
 
69
  messages.append({"role": "user", "content": message})
70
 
71
+ # Track generation state
72
  thinking_steps = []
73
  full_response = ""
74
  start_time = time.time()
75
+ last_token_time = time.time()
 
76
 
77
+ try:
78
+ # Use chat completion
79
+ for message in client.chat_completion(
80
+ messages,
81
+ max_tokens=max_tokens,
82
+ stream=True,
83
+ temperature=temperature,
84
+ top_p=top_p,
85
+ ):
86
+ token = message.choices[0].delta.content
87
+ if not token:
88
+ # Check for long pause between tokens (potential stall)
89
+ current_time = time.time()
90
+ if current_time - last_token_time > 5: # 5 second timeout
91
+ if full_response:
 
 
 
 
 
 
 
 
 
92
  break
93
+ continue
94
+
95
+ last_token_time = time.time()
96
+ full_response += token
 
 
 
 
 
 
 
 
97
 
98
+ # Save thinking steps for display only
99
+ current_time = time.time()
100
+ if current_time - start_time > 2 or len(full_response) % 200 == 0:
101
+ start_time = current_time
102
+ thinking_steps.append(full_response)
103
+
104
+ # Format with thinking history as HTML
105
+ if thinking_steps and len(thinking_steps) > 1:
106
+ thinking_html = '<div class="thinking-wrapper"><details><summary>Show thinking process</summary><div class="thinking-steps">'
107
+ for i, step in enumerate(thinking_steps[:-1]):
108
+ safe_step = html.escape(step)
109
+ thinking_html += f'<div class="thinking-step">Step {i+1}: {safe_step}</div>'
110
+ thinking_html += '</div></details></div>'
111
+
112
+ # Always yield the full current response (no cleaning during generation)
113
+ yield f"{thinking_html}{full_response}"
114
+ else:
115
+ yield full_response
116
+
117
+ # Clean up the final response to remove thinking artifacts
118
+ if "I'm trying to understand" in full_response or "I need to figure out" in full_response:
119
+ cleaned_response = clean_response(full_response)
120
+ thinking_html = '<div class="thinking-wrapper"><details><summary>Show original response</summary><div class="thinking-steps">'
121
+ thinking_html += f'<div class="thinking-step">{html.escape(full_response)}</div>'
122
  thinking_html += '</div></details></div>'
123
 
124
+ yield f"{thinking_html}{cleaned_response}"
125
+
126
+ except Exception as e:
127
+ # Handle exceptions gracefully
128
+ error_message = f"I apologize, but I encountered an error while generating a response. Please try rephrasing your question or asking something else."
129
+ yield error_message
130
 
131
 
132
  # Custom CSS for Plant Wisdom.AI styling