shukdevdattaEX committed on
Commit
ac2086a
·
verified ·
1 Parent(s): 99fe1fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -31
app.py CHANGED
@@ -39,25 +39,42 @@ HARM_DESCRIPTIONS = {
39
 
40
  # System prompt for enhanced moderation
41
  SYSTEM_PROMPT = """You are an expert content moderation AI assistant. Your role is to analyze text content and identify potential policy violations according to the following Harm Taxonomy:
 
42
  S1) Violent Crimes: Content that enables, encourages, or endorses unlawful violence toward people (terrorism, genocide, murder, hate-motivated violent crimes, child abuse, assault, battery, kidnapping) or animals (animal abuse).
 
43
  S2) Non-Violent Crimes: Content that enables, encourages, or endorses personal crimes (labor trafficking, threats, intimidation), financial crimes (fraud, scams, money laundering), property crimes (theft, arson, vandalism), drug crimes (creating or using regulated substances), weapons crimes (producing unlicensed firearms), or cyber crimes (hacking).
 
44
  S3) Sex-Related Crimes: Content that enables, encourages, or endorses sex trafficking, sexual assault (rape), sexual harassment (physical, verbal, or visual), or prostitution.
 
45
  S4) Child Sexual Exploitation: Content that contains, describes, enables, encourages, or endorses the sexual abuse of children.
 
46
  S5) Defamation: Content that is both verifiably false and likely to injure a living person's reputation.
 
47
  S6) Specialized Advice: Content containing specialized financial, medical, or legal advice, or that indicates dangerous activities or objects are safe.
 
48
  S7) Privacy: Content containing sensitive, nonpublic personal information that could undermine someone's physical, digital, or financial security.
 
49
  S8) Intellectual Property: Content that may violate the intellectual property rights of any third party.
 
50
  S9) Indiscriminate Weapons: Content that enables, encourages, or endorses the creation of chemical weapons (nerve gas), biological weapons (anthrax), radiological weapons (cobalt bombs), nuclear weapons (fission bombs), or high-yield explosive weapons (cluster munitions).
 
51
  S10) Hate: Content that demeans or dehumanizes people on the basis of race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity, or serious disease.
 
52
  S11) Suicide & Self-Harm: Content that enables, encourages, or endorses acts of intentional self-harm including suicide, self-injury (cutting), or disordered eating.
 
53
  S12) Sexual Content: Content that contains erotica.
 
54
  S13) Elections: Content containing factually incorrect information about electoral systems and processes, including the time, place, or manner of voting in civic elections.
 
55
  S14) Code Interpreter Abuse: Content that seeks to abuse code interpreters, including those that enable denial of service attacks, container escapes, or privilege escalation exploits.
56
- For each piece of content, provide:
 
57
  1. A clear SAFE or UNSAFE determination
58
  2. If UNSAFE, list ALL applicable category codes (S1-S14)
59
- 3. A brief explanation of why the content violates each flagged category
60
  4. Severity level: LOW, MEDIUM, HIGH, or CRITICAL
 
 
61
  Be thorough, objective, and explain your reasoning clearly."""
62
 
63
  def moderate_content(api_key, user_message, chat_history):
@@ -79,25 +96,27 @@ def moderate_content(api_key, user_message, chat_history):
79
  # Call the moderation model with system prompt
80
  chat_completion = client.chat.completions.create(
81
  messages=[
82
- {
83
- "role": "system",
84
- "content": SYSTEM_PROMPT
85
- },
86
  {
87
  "role": "user",
88
- "content": f"Analyze the following content for policy violations:\n\n{user_message}"
89
  }
90
  ],
91
  model="openai/gpt-oss-safeguard-20b",
92
  temperature=0.3,
93
- max_tokens=1024,
94
  )
95
 
96
  # Get the response
97
  moderation_result = chat_completion.choices[0].message.content
98
 
99
- # Parse and format the response (only detailed analysis)
100
- formatted_response = format_moderation_response(moderation_result, user_message)
 
 
 
 
 
 
101
 
102
  # Update chat history with proper message format
103
  user_msg = {"role": "user", "content": user_message}
@@ -107,24 +126,12 @@ def moderate_content(api_key, user_message, chat_history):
107
  return new_history, new_history
108
 
109
  except Exception as e:
110
- error_message = f"❌ **Error:** {str(e)}\n\nPlease check your API key and try again."
111
  user_msg = {"role": "user", "content": user_message}
112
  assistant_msg = {"role": "assistant", "content": error_message}
113
  new_history = chat_history + [user_msg, assistant_msg]
114
  return new_history, new_history
115
 
116
- def format_moderation_response(result, original_content):
117
- """
118
- Format the moderation result - only show detailed analysis
119
- """
120
- try:
121
- # Simply return the detailed analysis with a header
122
- response = "### 📊 Detailed Analysis:\n\n" + result
123
- return response
124
-
125
- except Exception as e:
126
- return f"**Moderation Analysis:**\n\n{result}"
127
-
128
  def clear_chat():
129
  """Clear the chat history"""
130
  return [], []
@@ -142,7 +149,7 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
142
  gr.Markdown("""
143
  # 🛡️ Advanced Content Moderation Chatbot
144
 
145
- This chatbot uses Groq's GPT-OSS-Safeguard-20B model with an enhanced system prompt to analyze content against a comprehensive harm taxonomy.
146
  Enter your Groq API key and test content to see detailed moderation analysis.
147
  """)
148
 
@@ -165,7 +172,8 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
165
  chatbot = gr.Chatbot(
166
  label="Moderation Results",
167
  height=450,
168
- show_label=True
 
169
  )
170
 
171
  with gr.Row():
@@ -264,7 +272,7 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
264
 
265
  ### ℹ️ About This Application
266
 
267
- This application demonstrates advanced content moderation using AI with system prompts. The model analyzes text against **14 harm categories**:
268
 
269
  | Category | Description |
270
  |----------|-------------|
@@ -285,12 +293,11 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
285
 
286
  ### 🎯 Key Features:
287
 
288
- - ✅ **Enhanced System Prompt**: Detailed instructions for comprehensive analysis
289
- - ✅ **Severity Levels**: LOW, MEDIUM, HIGH, or CRITICAL risk assessment
290
- - ✅ **Category Detection**: Identifies all applicable harm categories
291
- - ✅ **Detailed Explanations**: Clear reasoning for each flag
292
  - ✅ **15 Example Queries**: One safe example + one for each harm category
293
- - ✅ **Clean Output**: Shows only the detailed analysis from the model
294
 
295
  ### 🔒 Privacy & Security:
296
 
@@ -298,6 +305,14 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
298
  - All processing happens via Groq's secure API
299
  - No content is logged or retained
300
 
 
 
 
 
 
 
 
 
301
  **Note:** This is a demonstration tool. Always implement appropriate safeguards and human review in production systems.
302
 
303
  ---
 
39
 
40
  # System prompt for enhanced moderation
41
  SYSTEM_PROMPT = """You are an expert content moderation AI assistant. Your role is to analyze text content and identify potential policy violations according to the following Harm Taxonomy:
42
+
43
  S1) Violent Crimes: Content that enables, encourages, or endorses unlawful violence toward people (terrorism, genocide, murder, hate-motivated violent crimes, child abuse, assault, battery, kidnapping) or animals (animal abuse).
44
+
45
  S2) Non-Violent Crimes: Content that enables, encourages, or endorses personal crimes (labor trafficking, threats, intimidation), financial crimes (fraud, scams, money laundering), property crimes (theft, arson, vandalism), drug crimes (creating or using regulated substances), weapons crimes (producing unlicensed firearms), or cyber crimes (hacking).
46
+
47
  S3) Sex-Related Crimes: Content that enables, encourages, or endorses sex trafficking, sexual assault (rape), sexual harassment (physical, verbal, or visual), or prostitution.
48
+
49
  S4) Child Sexual Exploitation: Content that contains, describes, enables, encourages, or endorses the sexual abuse of children.
50
+
51
  S5) Defamation: Content that is both verifiably false and likely to injure a living person's reputation.
52
+
53
  S6) Specialized Advice: Content containing specialized financial, medical, or legal advice, or that indicates dangerous activities or objects are safe.
54
+
55
  S7) Privacy: Content containing sensitive, nonpublic personal information that could undermine someone's physical, digital, or financial security.
56
+
57
  S8) Intellectual Property: Content that may violate the intellectual property rights of any third party.
58
+
59
  S9) Indiscriminate Weapons: Content that enables, encourages, or endorses the creation of chemical weapons (nerve gas), biological weapons (anthrax), radiological weapons (cobalt bombs), nuclear weapons (fission bombs), or high-yield explosive weapons (cluster munitions).
60
+
61
  S10) Hate: Content that demeans or dehumanizes people on the basis of race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity, or serious disease.
62
+
63
  S11) Suicide & Self-Harm: Content that enables, encourages, or endorses acts of intentional self-harm including suicide, self-injury (cutting), or disordered eating.
64
+
65
  S12) Sexual Content: Content that contains erotica.
66
+
67
  S13) Elections: Content containing factually incorrect information about electoral systems and processes, including the time, place, or manner of voting in civic elections.
68
+
69
  S14) Code Interpreter Abuse: Content that seeks to abuse code interpreters, including those that enable denial of service attacks, container escapes, or privilege escalation exploits.
70
+
71
+ For each piece of content, provide a detailed analysis with:
72
  1. A clear SAFE or UNSAFE determination
73
  2. If UNSAFE, list ALL applicable category codes (S1-S14)
74
+ 3. A comprehensive explanation of why the content violates each flagged category
75
  4. Severity level: LOW, MEDIUM, HIGH, or CRITICAL
76
+ 5. Specific reasoning for your determination
77
+
78
  Be thorough, objective, and explain your reasoning clearly."""
79
 
80
  def moderate_content(api_key, user_message, chat_history):
 
96
  # Call the moderation model with the user message only (no system prompt is sent)
97
  chat_completion = client.chat.completions.create(
98
  messages=[
 
 
 
 
99
  {
100
  "role": "user",
101
+ "content": user_message
102
  }
103
  ],
104
  model="openai/gpt-oss-safeguard-20b",
105
  temperature=0.3,
106
+ max_tokens=2048,
107
  )
108
 
109
  # Get the response
110
  moderation_result = chat_completion.choices[0].message.content
111
 
112
+ # Debug: Print the raw response
113
+ print(f"Raw API Response: {moderation_result}")
114
+
115
+ # Format the response
116
+ if moderation_result and moderation_result.strip():
117
+ formatted_response = f"### 📊 Detailed Analysis:\n\n{moderation_result}"
118
+ else:
119
+ formatted_response = "### 📊 Detailed Analysis:\n\n⚠️ The model returned an empty response. Please try again."
120
 
121
  # Update chat history with proper message format
122
  user_msg = {"role": "user", "content": user_message}
 
126
  return new_history, new_history
127
 
128
  except Exception as e:
129
+ error_message = f"❌ **Error:** {str(e)}\n\n**Details:**\n- Check if your API key is valid\n- Ensure you have credits in your Groq account\n- Try a different message\n\n**Error Type:** {type(e).__name__}"
130
  user_msg = {"role": "user", "content": user_message}
131
  assistant_msg = {"role": "assistant", "content": error_message}
132
  new_history = chat_history + [user_msg, assistant_msg]
133
  return new_history, new_history
134
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  def clear_chat():
136
  """Clear the chat history"""
137
  return [], []
 
149
  gr.Markdown("""
150
  # 🛡️ Advanced Content Moderation Chatbot
151
 
152
+ This chatbot uses Groq's GPT-OSS-Safeguard-20B model to analyze content against a comprehensive harm taxonomy.
153
  Enter your Groq API key and test content to see detailed moderation analysis.
154
  """)
155
 
 
172
  chatbot = gr.Chatbot(
173
  label="Moderation Results",
174
  height=450,
175
+ show_label=True,
176
+ type="messages"
177
  )
178
 
179
  with gr.Row():
 
272
 
273
  ### ℹ️ About This Application
274
 
275
+ This application demonstrates advanced content moderation using AI. The model analyzes text against **14 harm categories**:
276
 
277
  | Category | Description |
278
  |----------|-------------|
 
293
 
294
  ### 🎯 Key Features:
295
 
296
+ - ✅ **Direct Model Output**: Shows raw analysis from GPT-OSS-Safeguard-20B
297
+ - ✅ **Comprehensive Detection**: Identifies all applicable harm categories
298
+ - ✅ **Detailed Explanations**: Clear reasoning for each determination
 
299
  - ✅ **15 Example Queries**: One safe example + one for each harm category
300
+ - ✅ **Clean Output**: Model response shown under a single "Detailed Analysis" header
301
 
302
  ### 🔒 Privacy & Security:
303
 
 
305
  - All processing happens via Groq's secure API
306
  - No content is logged or retained, apart from raw model responses printed to the console for debugging
307
 
308
+ ### 🐛 Troubleshooting:
309
+
310
+ If you see empty responses:
311
+ 1. Verify your API key is correct
312
+ 2. Check your Groq account has available credits
313
+ 3. Ensure the model `openai/gpt-oss-safeguard-20b` is accessible
314
+ 4. Check the console/terminal for debug output
315
+
316
  **Note:** This is a demonstration tool. Always implement appropriate safeguards and human review in production systems.
317
 
318
  ---