shukdevdattaEX committed on
Commit
f3d060a
·
verified ·
1 Parent(s): 0fa42d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -29
app.py CHANGED
@@ -68,12 +68,11 @@ S13) Elections: Content containing factually incorrect information about elector
68
 
69
  S14) Code Interpreter Abuse: Content that seeks to abuse code interpreters, including those that enable denial of service attacks, container escapes, or privilege escalation exploits.
70
 
71
- For each piece of content, provide a detailed analysis with:
72
  1. A clear SAFE or UNSAFE determination
73
  2. If UNSAFE, list ALL applicable category codes (S1-S14)
74
- 3. A comprehensive explanation of why the content violates each flagged category
75
  4. Severity level: LOW, MEDIUM, HIGH, or CRITICAL
76
- 5. Specific reasoning for your determination
77
 
78
  Be thorough, objective, and explain your reasoning clearly."""
79
 
@@ -96,27 +95,25 @@ def moderate_content(api_key, user_message, chat_history):
96
  # Call the moderation model with system prompt
97
  chat_completion = client.chat.completions.create(
98
  messages=[
 
 
 
 
99
  {
100
  "role": "user",
101
- "content": user_message
102
  }
103
  ],
104
  model="openai/gpt-oss-safeguard-20b",
105
  temperature=0.3,
106
- max_tokens=2048,
107
  )
108
 
109
  # Get the response
110
  moderation_result = chat_completion.choices[0].message.content
111
 
112
- # Debug: Print the raw response
113
- print(f"Raw API Response: {moderation_result}")
114
-
115
- # Format the response
116
- if moderation_result and moderation_result.strip():
117
- formatted_response = f"### 📊 Detailed Analysis:\n\n{moderation_result}"
118
- else:
119
- formatted_response = "### 📊 Detailed Analysis:\n\n⚠️ The model returned an empty response. Please try again."
120
 
121
  # Update chat history with proper message format
122
  user_msg = {"role": "user", "content": user_message}
@@ -126,12 +123,57 @@ def moderate_content(api_key, user_message, chat_history):
126
  return new_history, new_history
127
 
128
  except Exception as e:
129
- error_message = f"❌ **Error:** {str(e)}\n\n**Details:**\n- Check if your API key is valid\n- Ensure you have credits in your Groq account\n- Try a different message\n\n**Error Type:** {type(e).__name__}"
130
  user_msg = {"role": "user", "content": user_message}
131
  assistant_msg = {"role": "assistant", "content": error_message}
132
  new_history = chat_history + [user_msg, assistant_msg]
133
  return new_history, new_history
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  def clear_chat():
136
  """Clear the chat history"""
137
  return [], []
@@ -149,7 +191,7 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
149
  gr.Markdown("""
150
  # 🛡️ Advanced Content Moderation Chatbot
151
 
152
- This chatbot uses Groq's GPT-OSS-Safeguard-20B model to analyze content against a comprehensive harm taxonomy.
153
  Enter your Groq API key and test content to see detailed moderation analysis.
154
  """)
155
 
@@ -172,7 +214,7 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
172
  chatbot = gr.Chatbot(
173
  label="Moderation Results",
174
  height=450,
175
- show_label=True
176
  )
177
 
178
  with gr.Row():
@@ -271,7 +313,7 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
271
 
272
  ### ℹ️ About This Application
273
 
274
- This application demonstrates advanced content moderation using AI. The model analyzes text against **14 harm categories**:
275
 
276
  | Category | Description |
277
  |----------|-------------|
@@ -292,11 +334,11 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
292
 
293
  ### 🎯 Key Features:
294
 
295
- - ✅ **Direct Model Output**: Shows raw analysis from GPT-OSS-Safeguard-20B
296
- - ✅ **Comprehensive Detection**: Identifies all applicable harm categories
297
- - ✅ **Detailed Explanations**: Clear reasoning for each determination
 
298
  - ✅ **15 Example Queries**: One safe example + one for each harm category
299
- - ✅ **Clean Output**: Direct model response without extra formatting
300
 
301
  ### 🔒 Privacy & Security:
302
 
@@ -304,14 +346,6 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
304
  - All processing happens via Groq's secure API
305
  - No content is logged or retained
306
 
307
- ### 🐛 Troubleshooting:
308
-
309
- If you see empty responses:
310
- 1. Verify your API key is correct
311
- 2. Check your Groq account has available credits
312
- 3. Ensure the model `openai/gpt-oss-safeguard-20b` is accessible
313
- 4. Check the console/terminal for debug output
314
-
315
  **Note:** This is a demonstration tool. Always implement appropriate safeguards and human review in production systems.
316
 
317
  ---
 
68
 
69
  S14) Code Interpreter Abuse: Content that seeks to abuse code interpreters, including those that enable denial of service attacks, container escapes, or privilege escalation exploits.
70
 
71
+ For each piece of content, provide:
72
  1. A clear SAFE or UNSAFE determination
73
  2. If UNSAFE, list ALL applicable category codes (S1-S14)
74
+ 3. A brief explanation of why the content violates each flagged category
75
  4. Severity level: LOW, MEDIUM, HIGH, or CRITICAL
 
76
 
77
  Be thorough, objective, and explain your reasoning clearly."""
78
 
 
95
  # Call the moderation model with system prompt
96
  chat_completion = client.chat.completions.create(
97
  messages=[
98
+ {
99
+ "role": "system",
100
+ "content": SYSTEM_PROMPT
101
+ },
102
  {
103
  "role": "user",
104
+ "content": f"Analyze the following content for policy violations:\n\n{user_message}"
105
  }
106
  ],
107
  model="openai/gpt-oss-safeguard-20b",
108
  temperature=0.3,
109
+ max_tokens=1024,
110
  )
111
 
112
  # Get the response
113
  moderation_result = chat_completion.choices[0].message.content
114
 
115
+ # Parse and format the response
116
+ formatted_response = format_moderation_response(moderation_result, user_message)
 
 
 
 
 
 
117
 
118
  # Update chat history with proper message format
119
  user_msg = {"role": "user", "content": user_message}
 
123
  return new_history, new_history
124
 
125
  except Exception as e:
126
+ error_message = f"❌ **Error:** {str(e)}\n\nPlease check your API key and try again."
127
  user_msg = {"role": "user", "content": user_message}
128
  assistant_msg = {"role": "assistant", "content": error_message}
129
  new_history = chat_history + [user_msg, assistant_msg]
130
  return new_history, new_history
131
 
132
def format_moderation_response(result, original_content, categories=None, descriptions=None):
    """
    Format the raw moderation model output into a readable Markdown report.

    The report contains a preview of the analyzed content (first 100
    characters), a SAFE / FLAGGED status and, for flagged content, the
    detected severity and the harm categories the model mentioned. The
    model's unmodified text is appended under "Detailed Analysis".

    Args:
        result: Text returned by the moderation model. ``None`` or blank
            text yields an "empty response" notice instead of a verdict.
        original_content: The user-submitted text that was analyzed.
        categories: Optional mapping of category code (e.g. ``"S1"``) to its
            display name. Defaults to the module-level ``HARM_CATEGORIES``.
        descriptions: Optional mapping of category code to a description.
            Defaults to the module-level ``HARM_DESCRIPTIONS``.

    Returns:
        A Markdown-formatted string.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import re

    # A missing/blank completion carries no verdict; report it as such rather
    # than letting it fall through to the FLAGGED branch.
    if result is None or not str(result).strip():
        return "### 📊 Detailed Analysis:\n\n⚠️ The model returned an empty response. Please try again."

    # Insertion order doubles as priority order for the fallback search below.
    severity_badges = {
        "critical": "🔴 CRITICAL",
        "high": "🟠 HIGH",
        "medium": "🟡 MEDIUM",
        "low": "🟢 LOW",
    }

    try:
        response = "## 🛡️ Content Moderation Result\n\n"
        response += f"**Analyzed Content:** _{original_content[:100]}{'...' if len(original_content) > 100 else ''}_\n\n"

        # The first standalone SAFE/UNSAFE word decides the status. Whole-word
        # matching keeps "safety" or "safeguard" from counting as a SAFE
        # verdict. Negations such as "not safe" are not interpreted.
        verdict = re.search(r"\b(unsafe|safe)\b", result, re.IGNORECASE)
        if verdict is not None and verdict.group(1).lower() == "safe":
            response += "### ✅ Status: SAFE\n\n"
            response += "The content appears to be appropriate and does not violate any harm policies.\n\n"
        else:
            response += "### ⚠️ Status: FLAGGED\n\n"

            # Prefer an explicit "Severity[ level]: X" statement; otherwise use
            # the most severe level that appears as a whole word, so "allow"
            # or "highly" are not mistaken for LOW / HIGH.
            labeled = re.search(
                r"\bseverity(?:\s+level)?\W{0,10}(critical|high|medium|low)\b",
                result,
                re.IGNORECASE,
            )
            if labeled is not None:
                level = labeled.group(1).lower()
            else:
                level = next(
                    (
                        name
                        for name in severity_badges
                        if re.search(rf"\b{name}\b", result, re.IGNORECASE)
                    ),
                    None,
                )
            if level is not None:
                response += f"**Severity:** {severity_badges[level]}\n\n"

            # Resolve the lookup tables only when they are actually needed.
            if categories is None:
                categories = HARM_CATEGORIES
            if descriptions is None:
                descriptions = HARM_DESCRIPTIONS

            # Collect whole category codes so that "S14" does not also flag "S1".
            mentioned = {
                code.upper()
                for code in re.findall(r"\bS\d{1,2}\b", result, re.IGNORECASE)
            }
            flagged_categories = []
            for code, category in categories.items():
                if code.upper() not in mentioned:
                    continue
                line = f"- **{code}: {category}**"
                description = descriptions.get(code)
                if description:
                    line += f" - {description}"
                flagged_categories.append(line)

            if flagged_categories:
                response += "**Flagged Categories:**\n" + "\n".join(flagged_categories) + "\n\n"

        # Always include the model's own text so nothing is hidden by parsing.
        response += "---\n\n### 📊 Detailed Analysis:\n\n" + result

        return response

    except (AttributeError, KeyError, TypeError):
        # Best effort: if the inputs or lookup tables have an unexpected
        # shape, still show the raw model output instead of failing.
        return f"**Moderation Analysis:**\n\n{result}"
176
+
177
  def clear_chat():
178
  """Clear the chat history"""
179
  return [], []
 
191
  gr.Markdown("""
192
  # 🛡️ Advanced Content Moderation Chatbot
193
 
194
+ This chatbot uses Groq's GPT-OSS-Safeguard-20B model with an enhanced system prompt to analyze content against a comprehensive harm taxonomy.
195
  Enter your Groq API key and test content to see detailed moderation analysis.
196
  """)
197
 
 
214
  chatbot = gr.Chatbot(
215
  label="Moderation Results",
216
  height=450,
217
+ show_label=True,
218
  )
219
 
220
  with gr.Row():
 
313
 
314
  ### ℹ️ About This Application
315
 
316
+ This application demonstrates advanced content moderation using AI with system prompts. The model analyzes text against **14 harm categories**:
317
 
318
  | Category | Description |
319
  |----------|-------------|
 
334
 
335
  ### 🎯 Key Features:
336
 
337
+ - ✅ **Enhanced System Prompt**: Detailed instructions for comprehensive analysis
338
+ - ✅ **Severity Levels**: LOW, MEDIUM, HIGH, or CRITICAL risk assessment
339
+ - ✅ **Category Detection**: Identifies all applicable harm categories
340
+ - ✅ **Detailed Explanations**: Clear reasoning for each flag
341
  - ✅ **15 Example Queries**: One safe example + one for each harm category
 
342
 
343
  ### 🔒 Privacy & Security:
344
 
 
346
  - All processing happens via Groq's secure API
347
  - No content is logged or retained
348
 
 
 
 
 
 
 
 
 
349
  **Note:** This is a demonstration tool. Always implement appropriate safeguards and human review in production systems.
350
 
351
  ---