alohaboy committed on
Commit · 2185d4b
Parent(s): 449e8c9
Fix indentation error in guided mitigation methods
app.py CHANGED
@@ -404,15 +404,6 @@ Mitigated sentence:"""
         if hate_tokens:
             hate_tokens_str = "\nExpressions causing issues:\n" + "\n".join([f"• {token} ({bio_label})" for _, token, bio_label in hate_tokens[:5]])
         prompt = f"""The following sentence is classified as {label_desc.get(label, "harmful")} expression. \nPlease remove hate speech or aggressive expressions, while maintaining the original intent (criticism, complaint, opinion, etc.).\n\nOriginal: {text}\nClassification: {label_desc.get(label, "harmful")} expression\n{hate_tokens_str}\n\n[Important] All offensive, derogatory, and explicit hate expressions (e.g., 씨발, 좆, 병신) must be deleted.\n\nMitigated sentence:"""
-        label_desc = {
-            "offensive": "Aggressive",
-            "L1_hate": "Mild Hate",
-            "L2_hate": "Severe Hate"
-        }
-        hate_tokens_str = ""
-        if hate_tokens:
-            hate_tokens_str = "\nExpressions causing issues:\n" + "\n".join([f"• {token} ({bio_label})" for _, token, bio_label in hate_tokens[:5]])
-        prompt = f"""The following sentence is classified as {label_desc.get(label, "harmful")} expression. \nPlease remove hate speech or aggressive expressions, while maintaining the original intent (criticism, complaint, opinion, etc.).\n\nOriginal: {text}\nClassification: {label_desc.get(label, "harmful")} expression\n{hate_tokens_str}\n\n[Important] All offensive, derogatory, and explicit hate expressions (e.g., 씨발, 좆, 병신) must be deleted.\n\nMitigated sentence:"""
         # LLM inference
         inputs = self.llm_tokenizer(prompt, return_tensors="pt").to(self.llm_model.device)
         with torch.no_grad():
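The hunk above is cut off at `with torch.no_grad():`. A minimal sketch of how such a Hugging Face generate call typically continues; only `prompt`, `self.llm_tokenizer`, and `self.llm_model` appear in the diff, so the function name and all generation parameters below are illustrative assumptions, not the app's actual code:

import torch

# Hypothetical standalone version of the inference step shown above.
# Generation parameters are assumptions for illustration.
def run_mitigation(prompt, model, tokenizer):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=128,   # assumed budget for one rewritten sentence
            do_sample=False,      # greedy decoding for reproducible output
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the tokens generated after the prompt.
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()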
@@ -488,15 +479,6 @@ Mitigated sentence:"""
         if hate_tokens:
             hate_tokens_str = "\nExpressions causing issues:\n" + "\n".join([f"• {token} ({bio_label})" for _, token, bio_label in hate_tokens[:5]])
         initial_prompt = f"""The following sentence is classified as {label_desc.get(label, "harmful")} expression. \nExpressions containing offensive words (e.g., 좃, 씨발, 병신) must be deleted.\nOther aggressive or inappropriate expressions should be mitigated by expressing them more politely and inclusively.\n\nOriginal: {text}\nClassification: {label_desc.get(label, "harmful")} expression\n{hate_tokens_str}\n\nMitigated sentence:"""
-        label_desc = {
-            "offensive": "Aggressive",
-            "L1_hate": "Mild Hate",
-            "L2_hate": "Severe Hate"
-        }
-        hate_tokens_str = ""
-        if hate_tokens:
-            hate_tokens_str = "\nExpressions causing issues:\n" + "\n".join([f"• {token} ({bio_label})" for _, token, bio_label in hate_tokens[:5]])
-        initial_prompt = f"""The following sentence is classified as {label_desc.get(label, "harmful")} expression. \nExpressions containing offensive words (e.g., 좃, 씨발, 병신) must be deleted.\nOther aggressive or inappropriate expressions should be mitigated by expressing them more politely and inclusively.\n\nOriginal: {text}\nClassification: {label_desc.get(label, "harmful")} expression\n{hate_tokens_str}\n\nMitigated sentence:"""
         # Iterative mitigation and evaluation
         max_iter = 3  # Reduced from 5 to 3 for Space deployment
         metrics_history = []
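The context lines closing this second hunk hint at a rewrite-evaluate loop. A minimal sketch of that pattern; only `initial_prompt`, `max_iter = 3`, and `metrics_history` are visible in the diff, so `generate_mitigation`, `evaluate`, and the stopping criterion are hypothetical:

# Hypothetical sketch of the loop implied by the
# "# Iterative mitigation and evaluation" context lines.
def iterative_mitigation(initial_prompt, generate_mitigation, evaluate,
                         max_iter=3, toxicity_threshold=0.5):
    metrics_history = []
    candidate = generate_mitigation(initial_prompt)
    for _ in range(max_iter):
        metrics = evaluate(candidate)          # e.g., a residual-toxicity score
        metrics_history.append(metrics)
        if metrics["toxicity"] < toxicity_threshold:  # assumed stop condition
            break
        # Otherwise feed the candidate back for another mitigation pass.
        candidate = generate_mitigation(candidate)
    return candidate, metrics_history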