Update app.py
app.py CHANGED
@@ -68,12 +68,11 @@ S13) Elections: Content containing factually incorrect information about elector
 
 S14) Code Interpreter Abuse: Content that seeks to abuse code interpreters, including those that enable denial of service attacks, container escapes, or privilege escalation exploits.
 
-For each piece of content, provide
+For each piece of content, provide:
 1. A clear SAFE or UNSAFE determination
 2. If UNSAFE, list ALL applicable category codes (S1-S14)
-3. A
+3. A brief explanation of why the content violates each flagged category
 4. Severity level: LOW, MEDIUM, HIGH, or CRITICAL
-5. Specific reasoning for your determination
 
 Be thorough, objective, and explain your reasoning clearly."""
 
@@ -96,27 +95,25 @@ def moderate_content(api_key, user_message, chat_history):
         # Call the moderation model with system prompt
         chat_completion = client.chat.completions.create(
             messages=[
+                {
+                    "role": "system",
+                    "content": SYSTEM_PROMPT
+                },
                 {
                     "role": "user",
-                    "content": user_message
+                    "content": f"Analyze the following content for policy violations:\n\n{user_message}"
                 }
             ],
             model="openai/gpt-oss-safeguard-20b",
             temperature=0.3,
-            max_tokens=
+            max_tokens=1024,
         )
 
         # Get the response
         moderation_result = chat_completion.choices[0].message.content
 
-        #
-
-
-        # Format the response
-        if moderation_result and moderation_result.strip():
-            formatted_response = f"### 📊 Detailed Analysis:\n\n{moderation_result}"
-        else:
-            formatted_response = "### 📊 Detailed Analysis:\n\n⚠️ The model returned an empty response. Please try again."
+        # Parse and format the response
+        formatted_response = format_moderation_response(moderation_result, user_message)
 
         # Update chat history with proper message format
         user_msg = {"role": "user", "content": user_message}
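For reference, a minimal standalone sketch of the call pattern this hunk produces. Hedged assumptions: the full `SYSTEM_PROMPT` is defined earlier in `app.py` (a stub stands in here), and the API key is read from a `GROQ_API_KEY` environment variable instead of the Gradio textbox.

```python
# Minimal sketch of the updated request, assuming the `groq` client
# package; SYSTEM_PROMPT is a stub for the full taxonomy prompt in app.py.
import os

from groq import Groq

SYSTEM_PROMPT = "You are a content moderation assistant. ..."  # stub

client = Groq(api_key=os.environ["GROQ_API_KEY"])
user_message = "I want to test this moderation pipeline."

chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"Analyze the following content for policy violations:\n\n{user_message}",
        },
    ],
    model="openai/gpt-oss-safeguard-20b",
    temperature=0.3,
    max_tokens=1024,  # mirrors the cap added in the diff
)

print(chat_completion.choices[0].message.content)
```

The system message is what actually puts the taxonomy in front of the model on every request; `max_tokens=1024` bounds the length of the analysis.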
@@ -126,12 +123,57 @@ def moderate_content(api_key, user_message, chat_history):
         return new_history, new_history
 
     except Exception as e:
-        error_message = f"❌ **Error:** {str(e)}\n\
+        error_message = f"❌ **Error:** {str(e)}\n\nPlease check your API key and try again."
         user_msg = {"role": "user", "content": user_message}
         assistant_msg = {"role": "assistant", "content": error_message}
         new_history = chat_history + [user_msg, assistant_msg]
         return new_history, new_history
 
+def format_moderation_response(result, original_content):
+    """
+    Format the moderation result into a readable response
+    """
+    try:
+        result_lower = result.lower()
+
+        # Build formatted response
+        response = "## 🛡️ Content Moderation Result\n\n"
+        response += f"**Analyzed Content:** _{original_content[:100]}{'...' if len(original_content) > 100 else ''}_\n\n"
+
+        # Check if content appears safe
+        if "safe" in result_lower and ("unsafe" not in result_lower or result_lower.index("safe") < result_lower.index("unsafe")):
+            response += "### ✅ Status: SAFE\n\n"
+            response += "The content appears to be appropriate and does not violate any harm policies.\n\n"
+        else:
+            response += "### ⚠️ Status: FLAGGED\n\n"
+
+        # Check for severity
+        if "critical" in result_lower:
+            response += "**Severity:** 🔴 CRITICAL\n\n"
+        elif "high" in result_lower:
+            response += "**Severity:** 🟠 HIGH\n\n"
+        elif "medium" in result_lower:
+            response += "**Severity:** 🟡 MEDIUM\n\n"
+        elif "low" in result_lower:
+            response += "**Severity:** 🟢 LOW\n\n"
+
+        # Check for specific categories
+        flagged_categories = []
+        for code, category in HARM_CATEGORIES.items():
+            if code.lower() in result_lower or code in result:
+                flagged_categories.append(f"- **{code}: {category}** - {HARM_DESCRIPTIONS[code]}")
+
+        if flagged_categories:
+            response += "**Flagged Categories:**\n" + "\n".join(flagged_categories) + "\n\n"
+
+        # Add detailed analysis
+        response += "---\n\n### 📊 Detailed Analysis:\n\n" + result
+
+        return response
+
+    except Exception as e:
+        return f"**Moderation Analysis:**\n\n{result}"
+
 def clear_chat():
     """Clear the chat history"""
     return [], []
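A quick usage sketch of the new formatter. `HARM_CATEGORIES` and `HARM_DESCRIPTIONS` are module-level dicts defined earlier in `app.py` and not shown in this diff, so hypothetical single-entry stubs stand in here (the call assumes it runs in the same module as the formatter):

```python
# Hypothetical stubs for the globals the formatter reads; the real
# dicts in app.py cover all fourteen S1-S14 categories.
HARM_CATEGORIES = {"S1": "Violent Crimes"}
HARM_DESCRIPTIONS = {"S1": "Content enabling or encouraging violent crimes"}

sample_result = (
    "UNSAFE\n"
    "Categories: S1\n"
    "Severity: HIGH\n"
    "Reasoning: the text describes planning a violent act."
)

print(format_moderation_response(sample_result, "some user-submitted text"))
# Expected shape: a FLAGGED status header, an orange HIGH severity line,
# the S1 category bullet, then the raw analysis under "Detailed Analysis".
```

The matching is keyword-based and heuristic: "safe" is a substring of "unsafe", which is why the SAFE branch also compares the two substrings' positions before declaring content safe.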
@@ -149,7 +191,7 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
     gr.Markdown("""
     # 🛡️ Advanced Content Moderation Chatbot
 
-    This chatbot uses Groq's GPT-OSS-Safeguard-20B model to analyze content against a comprehensive harm taxonomy.
+    This chatbot uses Groq's GPT-OSS-Safeguard-20B model with an enhanced system prompt to analyze content against a comprehensive harm taxonomy.
     Enter your Groq API key and test content to see detailed moderation analysis.
     """)
 
@@ -172,7 +214,7 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
         chatbot = gr.Chatbot(
             label="Moderation Results",
             height=450,
-            show_label=True
+            show_label=True,
         )
 
         with gr.Row():
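For orientation, a stripped-down sketch of how this component plugs into the event wiring. Hedged: the diff only shows the `Chatbot` constructor; the widget names, the `type="messages"` flag (needed in recent Gradio for the `{"role", "content"}` dicts built above), and the button layout are assumptions, and the real app has more controls.

```python
import gradio as gr

with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as app:
    api_key = gr.Textbox(label="Groq API Key", type="password")
    chatbot = gr.Chatbot(
        label="Moderation Results",
        height=450,
        show_label=True,
        type="messages",  # accepts role/content dicts (Gradio 4.44+)
    )
    state = gr.State([])
    message = gr.Textbox(label="Content to Analyze")

    # moderate_content returns the updated history twice: once for the
    # visible Chatbot and once for the gr.State backing store.
    gr.Button("Analyze").click(
        fn=moderate_content,
        inputs=[api_key, message, state],
        outputs=[chatbot, state],
    )
    gr.Button("Clear").click(fn=clear_chat, outputs=[chatbot, state])

app.launch()
```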
@@ -271,7 +313,7 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
 
     ### ℹ️ About This Application
 
-    This application demonstrates advanced content moderation using AI. The model analyzes text against **14 harm categories**:
+    This application demonstrates advanced content moderation using AI with system prompts. The model analyzes text against **14 harm categories**:
 
     | Category | Description |
     |----------|-------------|
@@ -292,11 +334,11 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
 
     ### 🎯 Key Features:
 
-    - ✅ **
-    - ✅ **
-    - ✅ **
+    - ✅ **Enhanced System Prompt**: Detailed instructions for comprehensive analysis
+    - ✅ **Severity Levels**: LOW, MEDIUM, HIGH, or CRITICAL risk assessment
+    - ✅ **Category Detection**: Identifies all applicable harm categories
+    - ✅ **Detailed Explanations**: Clear reasoning for each flag
     - ✅ **15 Example Queries**: One safe example + one for each harm category
-    - ✅ **Clean Output**: Direct model response without extra formatting
 
     ### 🔒 Privacy & Security:
 
@@ -304,14 +346,6 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
     - All processing happens via Groq's secure API
     - No content is logged or retained
 
-    ### 🐛 Troubleshooting:
-
-    If you see empty responses:
-    1. Verify your API key is correct
-    2. Check your Groq account has available credits
-    3. Ensure the model `openai/gpt-oss-safeguard-20b` is accessible
-    4. Check the console/terminal for debug output
-
     **Note:** This is a demonstration tool. Always implement appropriate safeguards and human review in production systems.
 
     ---