Update app.py
Browse files
app.py
CHANGED
|
@@ -39,25 +39,42 @@ HARM_DESCRIPTIONS = {
|
|
| 39 |
|
| 40 |
# System prompt for enhanced moderation
|
| 41 |
SYSTEM_PROMPT = """You are an expert content moderation AI assistant. Your role is to analyze text content and identify potential policy violations according to the following Harm Taxonomy:
|
|
|
|
| 42 |
S1) Violent Crimes: Content that enables, encourages, or endorses unlawful violence toward people (terrorism, genocide, murder, hate-motivated violent crimes, child abuse, assault, battery, kidnapping) or animals (animal abuse).
|
|
|
|
| 43 |
S2) Non-Violent Crimes: Content that enables, encourages, or endorses personal crimes (labor trafficking, threats, intimidation), financial crimes (fraud, scams, money laundering), property crimes (theft, arson, vandalism), drug crimes (creating or using regulated substances), weapons crimes (producing unlicensed firearms), or cyber crimes (hacking).
|
|
|
|
| 44 |
S3) Sex-Related Crimes: Content that enables, encourages, or endorses sex trafficking, sexual assault (rape), sexual harassment (physical, verbal, or visual), or prostitution.
|
|
|
|
| 45 |
S4) Child Sexual Exploitation: Content that contains, describes, enables, encourages, or endorses the sexual abuse of children.
|
|
|
|
| 46 |
S5) Defamation: Content that is both verifiably false and likely to injure a living person's reputation.
|
|
|
|
| 47 |
S6) Specialized Advice: Content containing specialized financial, medical, or legal advice, or that indicates dangerous activities or objects are safe.
|
|
|
|
| 48 |
S7) Privacy: Content containing sensitive, nonpublic personal information that could undermine someone's physical, digital, or financial security.
|
|
|
|
| 49 |
S8) Intellectual Property: Content that may violate the intellectual property rights of any third party.
|
|
|
|
| 50 |
S9) Indiscriminate Weapons: Content that enables, encourages, or endorses the creation of chemical weapons (nerve gas), biological weapons (anthrax), radiological weapons (cobalt bombs), nuclear weapons (fission bombs), or high-yield explosive weapons (cluster munitions).
|
|
|
|
| 51 |
S10) Hate: Content that demeans or dehumanizes people on the basis of race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity, or serious disease.
|
|
|
|
| 52 |
S11) Suicide & Self-Harm: Content that enables, encourages, or endorses acts of intentional self-harm including suicide, self-injury (cutting), or disordered eating.
|
|
|
|
| 53 |
S12) Sexual Content: Content that contains erotica.
|
|
|
|
| 54 |
S13) Elections: Content containing factually incorrect information about electoral systems and processes, including the time, place, or manner of voting in civic elections.
|
|
|
|
| 55 |
S14) Code Interpreter Abuse: Content that seeks to abuse code interpreters, including those that enable denial of service attacks, container escapes, or privilege escalation exploits.
|
| 56 |
-
|
|
|
|
| 57 |
1. A clear SAFE or UNSAFE determination
|
| 58 |
2. If UNSAFE, list ALL applicable category codes (S1-S14)
|
| 59 |
-
3. A
|
| 60 |
4. Severity level: LOW, MEDIUM, HIGH, or CRITICAL
|
|
|
|
|
|
|
| 61 |
Be thorough, objective, and explain your reasoning clearly."""
|
| 62 |
|
| 63 |
def moderate_content(api_key, user_message, chat_history):
|
|
@@ -79,25 +96,27 @@ def moderate_content(api_key, user_message, chat_history):
|
|
| 79 |
# Call the moderation model with system prompt
|
| 80 |
chat_completion = client.chat.completions.create(
|
| 81 |
messages=[
|
| 82 |
-
{
|
| 83 |
-
"role": "system",
|
| 84 |
-
"content": SYSTEM_PROMPT
|
| 85 |
-
},
|
| 86 |
{
|
| 87 |
"role": "user",
|
| 88 |
-
"content":
|
| 89 |
}
|
| 90 |
],
|
| 91 |
model="openai/gpt-oss-safeguard-20b",
|
| 92 |
temperature=0.3,
|
| 93 |
-
max_tokens=
|
| 94 |
)
|
| 95 |
|
| 96 |
# Get the response
|
| 97 |
moderation_result = chat_completion.choices[0].message.content
|
| 98 |
|
| 99 |
-
#
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
# Update chat history with proper message format
|
| 103 |
user_msg = {"role": "user", "content": user_message}
|
|
@@ -107,24 +126,12 @@ def moderate_content(api_key, user_message, chat_history):
|
|
| 107 |
return new_history, new_history
|
| 108 |
|
| 109 |
except Exception as e:
|
| 110 |
-
error_message = f"❌ **Error:** {str(e)}\n\
|
| 111 |
user_msg = {"role": "user", "content": user_message}
|
| 112 |
assistant_msg = {"role": "assistant", "content": error_message}
|
| 113 |
new_history = chat_history + [user_msg, assistant_msg]
|
| 114 |
return new_history, new_history
|
| 115 |
|
| 116 |
-
def format_moderation_response(result, original_content):
|
| 117 |
-
"""
|
| 118 |
-
Format the moderation result - only show detailed analysis
|
| 119 |
-
"""
|
| 120 |
-
try:
|
| 121 |
-
# Simply return the detailed analysis with a header
|
| 122 |
-
response = "### 📊 Detailed Analysis:\n\n" + result
|
| 123 |
-
return response
|
| 124 |
-
|
| 125 |
-
except Exception as e:
|
| 126 |
-
return f"**Moderation Analysis:**\n\n{result}"
|
| 127 |
-
|
| 128 |
def clear_chat():
|
| 129 |
"""Clear the chat history"""
|
| 130 |
return [], []
|
|
@@ -142,7 +149,7 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
|
|
| 142 |
gr.Markdown("""
|
| 143 |
# 🛡️ Advanced Content Moderation Chatbot
|
| 144 |
|
| 145 |
-
This chatbot uses Groq's GPT-OSS-Safeguard-20B model
|
| 146 |
Enter your Groq API key and test content to see detailed moderation analysis.
|
| 147 |
""")
|
| 148 |
|
|
@@ -165,7 +172,8 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
|
|
| 165 |
chatbot = gr.Chatbot(
|
| 166 |
label="Moderation Results",
|
| 167 |
height=450,
|
| 168 |
-
show_label=True
|
|
|
|
| 169 |
)
|
| 170 |
|
| 171 |
with gr.Row():
|
|
@@ -264,7 +272,7 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
|
|
| 264 |
|
| 265 |
### ℹ️ About This Application
|
| 266 |
|
| 267 |
-
This application demonstrates advanced content moderation using AI
|
| 268 |
|
| 269 |
| Category | Description |
|
| 270 |
|----------|-------------|
|
|
@@ -285,12 +293,11 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
|
|
| 285 |
|
| 286 |
### 🎯 Key Features:
|
| 287 |
|
| 288 |
-
- ✅ **
|
| 289 |
-
- ✅ **
|
| 290 |
-
- ✅ **
|
| 291 |
-
- ✅ **Detailed Explanations**: Clear reasoning for each flag
|
| 292 |
- ✅ **15 Example Queries**: One safe example + one for each harm category
|
| 293 |
-
- ✅ **Clean Output**:
|
| 294 |
|
| 295 |
### 🔒 Privacy & Security:
|
| 296 |
|
|
@@ -298,6 +305,14 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
|
|
| 298 |
- All processing happens via Groq's secure API
|
| 299 |
- No content is logged or retained
|
| 300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
**Note:** This is a demonstration tool. Always implement appropriate safeguards and human review in production systems.
|
| 302 |
|
| 303 |
---
|
|
|
|
| 39 |
|
| 40 |
# System prompt for enhanced moderation
|
| 41 |
SYSTEM_PROMPT = """You are an expert content moderation AI assistant. Your role is to analyze text content and identify potential policy violations according to the following Harm Taxonomy:
|
| 42 |
+
|
| 43 |
S1) Violent Crimes: Content that enables, encourages, or endorses unlawful violence toward people (terrorism, genocide, murder, hate-motivated violent crimes, child abuse, assault, battery, kidnapping) or animals (animal abuse).
|
| 44 |
+
|
| 45 |
S2) Non-Violent Crimes: Content that enables, encourages, or endorses personal crimes (labor trafficking, threats, intimidation), financial crimes (fraud, scams, money laundering), property crimes (theft, arson, vandalism), drug crimes (creating or using regulated substances), weapons crimes (producing unlicensed firearms), or cyber crimes (hacking).
|
| 46 |
+
|
| 47 |
S3) Sex-Related Crimes: Content that enables, encourages, or endorses sex trafficking, sexual assault (rape), sexual harassment (physical, verbal, or visual), or prostitution.
|
| 48 |
+
|
| 49 |
S4) Child Sexual Exploitation: Content that contains, describes, enables, encourages, or endorses the sexual abuse of children.
|
| 50 |
+
|
| 51 |
S5) Defamation: Content that is both verifiably false and likely to injure a living person's reputation.
|
| 52 |
+
|
| 53 |
S6) Specialized Advice: Content containing specialized financial, medical, or legal advice, or that indicates dangerous activities or objects are safe.
|
| 54 |
+
|
| 55 |
S7) Privacy: Content containing sensitive, nonpublic personal information that could undermine someone's physical, digital, or financial security.
|
| 56 |
+
|
| 57 |
S8) Intellectual Property: Content that may violate the intellectual property rights of any third party.
|
| 58 |
+
|
| 59 |
S9) Indiscriminate Weapons: Content that enables, encourages, or endorses the creation of chemical weapons (nerve gas), biological weapons (anthrax), radiological weapons (cobalt bombs), nuclear weapons (fission bombs), or high-yield explosive weapons (cluster munitions).
|
| 60 |
+
|
| 61 |
S10) Hate: Content that demeans or dehumanizes people on the basis of race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity, or serious disease.
|
| 62 |
+
|
| 63 |
S11) Suicide & Self-Harm: Content that enables, encourages, or endorses acts of intentional self-harm including suicide, self-injury (cutting), or disordered eating.
|
| 64 |
+
|
| 65 |
S12) Sexual Content: Content that contains erotica.
|
| 66 |
+
|
| 67 |
S13) Elections: Content containing factually incorrect information about electoral systems and processes, including the time, place, or manner of voting in civic elections.
|
| 68 |
+
|
| 69 |
S14) Code Interpreter Abuse: Content that seeks to abuse code interpreters, including those that enable denial of service attacks, container escapes, or privilege escalation exploits.
|
| 70 |
+
|
| 71 |
+
For each piece of content, provide a detailed analysis with:
|
| 72 |
1. A clear SAFE or UNSAFE determination
|
| 73 |
2. If UNSAFE, list ALL applicable category codes (S1-S14)
|
| 74 |
+
3. A comprehensive explanation of why the content violates each flagged category
|
| 75 |
4. Severity level: LOW, MEDIUM, HIGH, or CRITICAL
|
| 76 |
+
5. Specific reasoning for your determination
|
| 77 |
+
|
| 78 |
Be thorough, objective, and explain your reasoning clearly."""
|
| 79 |
|
| 80 |
def moderate_content(api_key, user_message, chat_history):
|
|
|
|
| 96 |
# Call the moderation model (NOTE: only the user message is sent here; SYSTEM_PROMPT is defined above but is not included in `messages`)
|
| 97 |
chat_completion = client.chat.completions.create(
|
| 98 |
messages=[
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
{
|
| 100 |
"role": "user",
|
| 101 |
+
"content": user_message
|
| 102 |
}
|
| 103 |
],
|
| 104 |
model="openai/gpt-oss-safeguard-20b",
|
| 105 |
temperature=0.3,
|
| 106 |
+
max_tokens=2048,
|
| 107 |
)
|
| 108 |
|
| 109 |
# Get the response
|
| 110 |
moderation_result = chat_completion.choices[0].message.content
|
| 111 |
|
| 112 |
+
# Debug: Print the raw response
|
| 113 |
+
print(f"Raw API Response: {moderation_result}")
|
| 114 |
+
|
| 115 |
+
# Format the response
|
| 116 |
+
if moderation_result and moderation_result.strip():
|
| 117 |
+
formatted_response = f"### 📊 Detailed Analysis:\n\n{moderation_result}"
|
| 118 |
+
else:
|
| 119 |
+
formatted_response = "### 📊 Detailed Analysis:\n\n⚠️ The model returned an empty response. Please try again."
|
| 120 |
|
| 121 |
# Update chat history with proper message format
|
| 122 |
user_msg = {"role": "user", "content": user_message}
|
|
|
|
| 126 |
return new_history, new_history
|
| 127 |
|
| 128 |
except Exception as e:
|
| 129 |
+
error_message = f"❌ **Error:** {str(e)}\n\n**Details:**\n- Check if your API key is valid\n- Ensure you have credits in your Groq account\n- Try a different message\n\n**Error Type:** {type(e).__name__}"
|
| 130 |
user_msg = {"role": "user", "content": user_message}
|
| 131 |
assistant_msg = {"role": "assistant", "content": error_message}
|
| 132 |
new_history = chat_history + [user_msg, assistant_msg]
|
| 133 |
return new_history, new_history
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
def clear_chat():
|
| 136 |
"""Clear the chat history"""
|
| 137 |
return [], []
|
|
|
|
| 149 |
gr.Markdown("""
|
| 150 |
# 🛡️ Advanced Content Moderation Chatbot
|
| 151 |
|
| 152 |
+
This chatbot uses Groq's GPT-OSS-Safeguard-20B model to analyze content against a comprehensive harm taxonomy.
|
| 153 |
Enter your Groq API key and test content to see detailed moderation analysis.
|
| 154 |
""")
|
| 155 |
|
|
|
|
| 172 |
chatbot = gr.Chatbot(
|
| 173 |
label="Moderation Results",
|
| 174 |
height=450,
|
| 175 |
+
show_label=True,
|
| 176 |
+
type="messages"
|
| 177 |
)
|
| 178 |
|
| 179 |
with gr.Row():
|
|
|
|
| 272 |
|
| 273 |
### ℹ️ About This Application
|
| 274 |
|
| 275 |
+
This application demonstrates advanced content moderation using AI. The model analyzes text against **14 harm categories**:
|
| 276 |
|
| 277 |
| Category | Description |
|
| 278 |
|----------|-------------|
|
|
|
|
| 293 |
|
| 294 |
### 🎯 Key Features:
|
| 295 |
|
| 296 |
+
- ✅ **Direct Model Output**: Shows raw analysis from GPT-OSS-Safeguard-20B
|
| 297 |
+
- ✅ **Comprehensive Detection**: Identifies all applicable harm categories
|
| 298 |
+
- ✅ **Detailed Explanations**: Clear reasoning for each determination
|
|
|
|
| 299 |
- ✅ **15 Example Queries**: One safe example + one for each harm category
|
| 300 |
+
- ✅ **Clean Output**: Model response shown under a single "Detailed Analysis" header, with no other post-processing
|
| 301 |
|
| 302 |
### 🔒 Privacy & Security:
|
| 303 |
|
|
|
|
| 305 |
- All processing happens via Groq's secure API
|
| 306 |
- No content is retained by this app (note: raw model responses are printed to the server console for debugging)
|
| 307 |
|
| 308 |
+
### 🐛 Troubleshooting:
|
| 309 |
+
|
| 310 |
+
If you see empty responses:
|
| 311 |
+
1. Verify your API key is correct
|
| 312 |
+
2. Check your Groq account has available credits
|
| 313 |
+
3. Ensure the model `openai/gpt-oss-safeguard-20b` is accessible
|
| 314 |
+
4. Check the console/terminal for debug output
|
| 315 |
+
|
| 316 |
**Note:** This is a demonstration tool. Always implement appropriate safeguards and human review in production systems.
|
| 317 |
|
| 318 |
---
|