shukdevdattaEX committed on
Commit
77e7c24
·
verified ·
1 Parent(s): c41a16d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -55
app.py CHANGED
@@ -39,41 +39,25 @@ HARM_DESCRIPTIONS = {
39
 
40
  # System prompt for enhanced moderation
41
  SYSTEM_PROMPT = """You are an expert content moderation AI assistant. Your role is to analyze text content and identify potential policy violations according to the following Harm Taxonomy:
42
-
43
  S1) Violent Crimes: Content that enables, encourages, or endorses unlawful violence toward people (terrorism, genocide, murder, hate-motivated violent crimes, child abuse, assault, battery, kidnapping) or animals (animal abuse).
44
-
45
  S2) Non-Violent Crimes: Content that enables, encourages, or endorses personal crimes (labor trafficking, threats, intimidation), financial crimes (fraud, scams, money laundering), property crimes (theft, arson, vandalism), drug crimes (creating or using regulated substances), weapons crimes (producing unlicensed firearms), or cyber crimes (hacking).
46
-
47
  S3) Sex-Related Crimes: Content that enables, encourages, or endorses sex trafficking, sexual assault (rape), sexual harassment (physical, verbal, or visual), or prostitution.
48
-
49
  S4) Child Sexual Exploitation: Content that contains, describes, enables, encourages, or endorses the sexual abuse of children.
50
-
51
  S5) Defamation: Content that is both verifiably false and likely to injure a living person's reputation.
52
-
53
  S6) Specialized Advice: Content containing specialized financial, medical, or legal advice, or that indicates dangerous activities or objects are safe.
54
-
55
  S7) Privacy: Content containing sensitive, nonpublic personal information that could undermine someone's physical, digital, or financial security.
56
-
57
  S8) Intellectual Property: Content that may violate the intellectual property rights of any third party.
58
-
59
  S9) Indiscriminate Weapons: Content that enables, encourages, or endorses the creation of chemical weapons (nerve gas), biological weapons (anthrax), radiological weapons (cobalt bombs), nuclear weapons (fission bombs), or high-yield explosive weapons (cluster munitions).
60
-
61
  S10) Hate: Content that demeans or dehumanizes people on the basis of race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity, or serious disease.
62
-
63
  S11) Suicide & Self-Harm: Content that enables, encourages, or endorses acts of intentional self-harm including suicide, self-injury (cutting), or disordered eating.
64
-
65
  S12) Sexual Content: Content that contains erotica.
66
-
67
  S13) Elections: Content containing factually incorrect information about electoral systems and processes, including the time, place, or manner of voting in civic elections.
68
-
69
  S14) Code Interpreter Abuse: Content that seeks to abuse code interpreters, including those that enable denial of service attacks, container escapes, or privilege escalation exploits.
70
-
71
  For each piece of content, provide:
72
  1. A clear SAFE or UNSAFE determination
73
  2. If UNSAFE, list ALL applicable category codes (S1-S14)
74
  3. A brief explanation of why the content violates each flagged category
75
  4. Severity level: LOW, MEDIUM, HIGH, or CRITICAL
76
-
77
  Be thorough, objective, and explain your reasoning clearly."""
78
 
79
  def moderate_content(api_key, user_message, chat_history):
@@ -112,7 +96,7 @@ def moderate_content(api_key, user_message, chat_history):
112
  # Get the response
113
  moderation_result = chat_completion.choices[0].message.content
114
 
115
- # Parse and format the response
116
  formatted_response = format_moderation_response(moderation_result, user_message)
117
 
118
  # Update chat history with proper message format
@@ -131,44 +115,11 @@ def moderate_content(api_key, user_message, chat_history):
131
 
132
  def format_moderation_response(result, original_content):
133
  """
134
- Format the moderation result into a readable response
135
  """
136
  try:
137
- result_lower = result.lower()
138
-
139
- # Build formatted response
140
- response = "## 🛡️ Content Moderation Result\n\n"
141
- response += f"**Analyzed Content:** _{original_content[:100]}{'...' if len(original_content) > 100 else ''}_\n\n"
142
-
143
- # Check if content appears safe
144
- if "safe" in result_lower and ("unsafe" not in result_lower or result_lower.index("safe") < result_lower.index("unsafe")):
145
- response += "### ✅ Status: SAFE\n\n"
146
- response += "The content appears to be appropriate and does not violate any harm policies.\n\n"
147
- else:
148
- response += "### ⚠️ Status: FLAGGED\n\n"
149
-
150
- # Check for severity
151
- if "critical" in result_lower:
152
- response += "**Severity:** 🔴 CRITICAL\n\n"
153
- elif "high" in result_lower:
154
- response += "**Severity:** 🟠 HIGH\n\n"
155
- elif "medium" in result_lower:
156
- response += "**Severity:** 🟡 MEDIUM\n\n"
157
- elif "low" in result_lower:
158
- response += "**Severity:** 🟢 LOW\n\n"
159
-
160
- # Check for specific categories
161
- flagged_categories = []
162
- for code, category in HARM_CATEGORIES.items():
163
- if code.lower() in result_lower or code in result:
164
- flagged_categories.append(f"- **{code}: {category}** - {HARM_DESCRIPTIONS[code]}")
165
-
166
- if flagged_categories:
167
- response += "**Flagged Categories:**\n" + "\n".join(flagged_categories) + "\n\n"
168
-
169
- # Add detailed analysis
170
- response += "---\n\n### 📊 Detailed Analysis:\n\n" + result
171
-
172
  return response
173
 
174
  except Exception as e:
@@ -214,7 +165,8 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
214
  chatbot = gr.Chatbot(
215
  label="Moderation Results",
216
  height=450,
217
- show_label=True
 
218
  )
219
 
220
  with gr.Row():
@@ -339,6 +291,7 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
339
  - ✅ **Category Detection**: Identifies all applicable harm categories
340
  - ✅ **Detailed Explanations**: Clear reasoning for each flag
341
  - ✅ **15 Example Queries**: One safe example + one for each harm category
 
342
 
343
  ### 🔒 Privacy & Security:
344
 
@@ -355,4 +308,4 @@ with gr.Blocks(title="Content Moderation Chatbot", theme=gr.themes.Soft()) as ap
355
 
356
  # Launch the app
357
  if __name__ == "__main__":
358
- app.launch(share=False, server_name="0.0.0.0", server_port=7860)
 
39
 
40
  # System prompt for enhanced moderation
41
  SYSTEM_PROMPT = """You are an expert content moderation AI assistant. Your role is to analyze text content and identify potential policy violations according to the following Harm Taxonomy:
 
42
  S1) Violent Crimes: Content that enables, encourages, or endorses unlawful violence toward people (terrorism, genocide, murder, hate-motivated violent crimes, child abuse, assault, battery, kidnapping) or animals (animal abuse).
 
43
  S2) Non-Violent Crimes: Content that enables, encourages, or endorses personal crimes (labor trafficking, threats, intimidation), financial crimes (fraud, scams, money laundering), property crimes (theft, arson, vandalism), drug crimes (creating or using regulated substances), weapons crimes (producing unlicensed firearms), or cyber crimes (hacking).
 
44
  S3) Sex-Related Crimes: Content that enables, encourages, or endorses sex trafficking, sexual assault (rape), sexual harassment (physical, verbal, or visual), or prostitution.
 
45
  S4) Child Sexual Exploitation: Content that contains, describes, enables, encourages, or endorses the sexual abuse of children.
 
46
  S5) Defamation: Content that is both verifiably false and likely to injure a living person's reputation.
 
47
  S6) Specialized Advice: Content containing specialized financial, medical, or legal advice, or that indicates dangerous activities or objects are safe.
 
48
  S7) Privacy: Content containing sensitive, nonpublic personal information that could undermine someone's physical, digital, or financial security.
 
49
  S8) Intellectual Property: Content that may violate the intellectual property rights of any third party.
 
50
  S9) Indiscriminate Weapons: Content that enables, encourages, or endorses the creation of chemical weapons (nerve gas), biological weapons (anthrax), radiological weapons (cobalt bombs), nuclear weapons (fission bombs), or high-yield explosive weapons (cluster munitions).
 
51
  S10) Hate: Content that demeans or dehumanizes people on the basis of race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity, or serious disease.
 
52
  S11) Suicide & Self-Harm: Content that enables, encourages, or endorses acts of intentional self-harm including suicide, self-injury (cutting), or disordered eating.
 
53
  S12) Sexual Content: Content that contains erotica.
 
54
  S13) Elections: Content containing factually incorrect information about electoral systems and processes, including the time, place, or manner of voting in civic elections.
 
55
  S14) Code Interpreter Abuse: Content that seeks to abuse code interpreters, including those that enable denial of service attacks, container escapes, or privilege escalation exploits.
 
56
  For each piece of content, provide:
57
  1. A clear SAFE or UNSAFE determination
58
  2. If UNSAFE, list ALL applicable category codes (S1-S14)
59
  3. A brief explanation of why the content violates each flagged category
60
  4. Severity level: LOW, MEDIUM, HIGH, or CRITICAL
 
61
  Be thorough, objective, and explain your reasoning clearly."""
62
 
63
  def moderate_content(api_key, user_message, chat_history):
 
96
  # Get the response
97
  moderation_result = chat_completion.choices[0].message.content
98
 
99
+ # Parse and format the response (only detailed analysis)
100
  formatted_response = format_moderation_response(moderation_result, user_message)
101
 
102
  # Update chat history with proper message format
 
115
 
116
  def format_moderation_response(result, original_content):
117
  """
118
+ Format the moderation result - only show detailed analysis
119
  """
120
  try:
121
+ # Simply return the detailed analysis with a header
122
+ response = "### 📊 Detailed Analysis:\n\n" + result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  return response
124
 
125
  except Exception as e:
 
165
  chatbot = gr.Chatbot(
166
  label="Moderation Results",
167
  height=450,
168
+ show_label=True,
169
+ type="messages"
170
  )
171
 
172
  with gr.Row():
 
291
  - ✅ **Category Detection**: Identifies all applicable harm categories
292
  - ✅ **Detailed Explanations**: Clear reasoning for each flag
293
  - ✅ **15 Example Queries**: One safe example + one for each harm category
294
+ - ✅ **Clean Output**: Shows only the detailed analysis from the model
295
 
296
  ### 🔒 Privacy & Security:
297
 
 
308
 
309
  # Launch the app
310
  if __name__ == "__main__":
311
+ app.launch(share=True)