jesse-tong committed
Commit e2d86a3 · 1 Parent(s): e5c407e

Update datasets

api.py CHANGED
@@ -40,7 +40,7 @@ def load_model_lstm():
     model = model.to(device)
     return model, device
 
-def inference(model, device, comments: str | list, threshold: float = 0.55):
+def inference(model, device, comments: str | list, threshold: float = 0.6):
     if isinstance(comments, str):
         comments = [comments]
     elif not isinstance(comments, list):
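
For reference, a minimal usage sketch of the updated inference() entry point with the new 0.6 default. load_model_lstm and the signature come from this diff; the sample comments and the assumption that inference() returns one prediction per input are illustrative.

from api import load_model_lstm, inference

model, device = load_model_lstm()

# A bare string is wrapped into a single-element list inside inference().
single = inference(model, device, "một bình luận ví dụ")          # new default threshold=0.6
batch = inference(model, device, ["bình luận 1", "bình luận 2"])  # list input passes through as-is

# The previous default can still be requested explicitly:
legacy = inference(model, device, "một bình luận ví dụ", threshold=0.55)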
app.py CHANGED
@@ -1,6 +1,31 @@
 import streamlit as st
 from api import load_model_bert, load_model_lstm, inference
 import pandas as pd
+from huggingface_hub import hf_hub_download
+import os
+
+# Download the model files from Hugging Face Hub: https://huggingface.co/jesse-tong/vietnamese_hate_speech_detection_phobert
+# into the vietnamese_hate_speech_detection_phobert directory
+if not os.path.exists("vietnamese_hate_speech_detection_phobert"):
+    try:
+        os.mkdir("vietnamese_hate_speech_detection_phobert")
+    except FileExistsError:
+        pass
+
+# Download the model files
+hf_hub_download(
+    repo_id="jesse-tong/vietnamese_hate_speech_detection_phobert",
+    filename="vinai_phobert-base-v2_finetuned.pth",
+    repo_type="model",
+    local_dir="vietnamese_hate_speech_detection_phobert"
+)
+hf_hub_download(
+    repo_id="jesse-tong/vietnamese_hate_speech_detection_phobert",
+    filename="distilled_lstm_model.pth",
+    repo_type="model",
+    local_dir="vietnamese_hate_speech_detection_phobert"
+)
+
 
 # Set up the Streamlit app
 def app():
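
A side note on the download block above: hf_hub_download creates local_dir itself, and recent versions of huggingface_hub keep metadata under that directory so files already present and up to date can be skipped; the explicit mkdir guard is therefore mostly defensive. A hedged sketch of an equivalent, more explicit guard (the ensure_model_file helper is illustrative, not part of the repo):

import os
from huggingface_hub import hf_hub_download

REPO_ID = "jesse-tong/vietnamese_hate_speech_detection_phobert"
LOCAL_DIR = "vietnamese_hate_speech_detection_phobert"

def ensure_model_file(filename: str) -> str:
    """Download one checkpoint from the Hub only if it is not already on disk."""
    target = os.path.join(LOCAL_DIR, filename)
    if os.path.exists(target):
        return target
    # hf_hub_download creates LOCAL_DIR (and parents) on demand.
    return hf_hub_download(
        repo_id=REPO_ID,
        filename=filename,
        repo_type="model",
        local_dir=LOCAL_DIR,
    )

for checkpoint in ("vinai_phobert-base-v2_finetuned.pth", "distilled_lstm_model.pth"):
    ensure_model_file(checkpoint)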
datasets_voz_gemini/dev.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ee6611003e74122c4ff00a33949c0be5d64485afe08968b150b55883fd2c9b6
+size 865240
datasets_voz_gemini/test.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d29e8328cdb0f415c32981bfe008fb59fd1ad7a84cf4e1948e1f16855205f35
+size 1814883
datasets_voz_gemini/train.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e30f5595d43adcc84998a31956b47343ccfb75b062fb8965ee2fc503b06111c6
+size 8002834
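
The three added CSVs above are Git LFS pointer files, not the data itself: each records the LFS spec version, the sha256 oid of the real content, and its byte size (so train.csv is roughly 8 MB once fetched). A small illustrative check, useful when a checkout may not have run the LFS smudge filter:

LFS_MAGIC = b"version https://git-lfs.github.com/spec/v1"

def is_lfs_pointer(path: str) -> bool:
    """True if the file is still an un-smudged LFS pointer rather than real data."""
    with open(path, "rb") as f:
        return f.read(len(LFS_MAGIC)) == LFS_MAGIC

for split in ("train", "dev", "test"):
    path = f"datasets_voz_gemini/{split}.csv"
    print(path, "-> pointer" if is_lfs_pointer(path) else "-> materialized data")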
scripts/merge_datasets.py CHANGED
@@ -19,7 +19,7 @@ def merge_csv_files(directories, output_file, target_name):
     merged.to_csv(output_file, index=False)
 
 if __name__ == "__main__":
-    directories = ["../datasets_vithsd", "../datasets_vihsd_gemini"]
+    directories = ["../datasets_vithsd", "../datasets_vihsd_gemini", "../datasets_voz_gemini"]
     merge_csv_files(directories, "../datasets/train.csv", "train.csv")
     merge_csv_files(directories, "../datasets/dev.csv", "dev.csv")
     merge_csv_files(directories, "../datasets/test.csv", "test.csv")
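
Only the tail of merge_csv_files is visible in this hunk. As a reading aid, a minimal reconstruction of what a helper with this signature plausibly does (pandas concat over per-directory copies of target_name) — a sketch under those assumptions, not the repo's actual implementation:

import os
import pandas as pd

def merge_csv_files(directories, output_file, target_name):
    """Concatenate <directory>/<target_name> across all directories into one CSV."""
    frames = []
    for directory in directories:
        path = os.path.join(directory, target_name)
        if os.path.exists(path):  # a split may be absent from some dataset dirs
            frames.append(pd.read_csv(path))
    merged = pd.concat(frames, ignore_index=True)
    merged.to_csv(output_file, index=False)  # matches the visible final line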
utils/convert_vihsd_gemini.py CHANGED
@@ -12,7 +12,7 @@ def setup_genai(api_key):
     """Configure the Google Generative AI client with your API key"""
     return genai.Client(api_key=api_key)
 
-def classify_text(model, text):
+def classify_text(model, text, suggest_label=False):
    """Classify Vietnamese text into hate speech categories using Google's Generative AI"""
     prompt = f"""
     Analyze the following Vietnamese text for hate speech (each sentence is separated by a newline):
@@ -26,6 +26,8 @@ def classify_text(model, text):
     - politics (political hate speech)
     If the text doesn't specify a person or group in a category, return 0 for that category.
     Else, return 1 for CLEAN, 2 for OFFENSIVE, or 3 for HATE.
+
+    {'The number at the end of each sentence (between the <SuggestLabel> and </SuggestLabel> tags) is the suggested label for that sentence (0 is normal/clean, 1 is offensive/hate in at least one category).' if suggest_label else ''}
 
     For each sentence in the text, return only 5 numbers separated by commas (corresponding to the labels for individual, groups, religion/creed, race/ethnicity, politics), with the numbers for each sentence separated by newlines, like (with no other text):
     0,1,0,0,0
@@ -42,7 +44,7 @@ def classify_text(model, text):
         print(f"Error classifying text: {e}")
         return None
 
-def process_file(input_file, output_file, model, rate_limit_pause=4, text_col="free_text"):
+def process_file(input_file, output_file, model, rate_limit_pause=4, text_col="free_text", suggest_column="labels"):
     """Process a single CSV file to match the test.csv format"""
     print(f"Processing {input_file}...")
 
@@ -66,6 +68,8 @@ def process_file(input_file, output_file, model, rate_limit_pause=4, text_col="f
         if col not in df.columns:
             # Add the column with a default of 0 if it doesn't exist
             df[col] = 0
+
+    print("Suggesting labels:", 'True' if suggest_column in df.columns else 'False')
 
     # Process each batch (100 rows at a time)
     batch_size = 100
@@ -78,8 +82,16 @@ def process_file(input_file, output_file, model, rate_limit_pause=4, text_col="f
             continue
 
         # Join the batch rows with newlines, and classify them all at once
-        text_to_classify = "\n".join([str(sentence) for sentence in batch_df['content'].tolist()])
-        classifications = classify_text(model, text_to_classify)
+        batch_strings = [str(sentence) for sentence in batch_df['content'].tolist()]
+        suggest_label = False
+        if suggest_column in df.columns:
+            batch_strings = [str(sentence) + " " + f"<SuggestLabel>{str(label)}</SuggestLabel>" for sentence, label in zip(batch_strings, batch_df[suggest_column].tolist())]
+            suggest_label = True
+
+
+        text_to_classify = "\n".join(batch_strings)
+        classifications = classify_text(model, text_to_classify, suggest_label=suggest_label)
+
 
         # Try 2 more times, else skip
         if classifications is None:
@@ -135,6 +147,7 @@ def main():
     parser.add_argument("--output_dir", required=True, help="Directory to save processed files")
     parser.add_argument("--api_key", required=True, help="Google Generative AI API key")
     parser.add_argument("--pause", type=float, default=4.0, help="Pause between API calls (seconds)")
+    parser.add_argument("--text_col", default="free_text", help="Column name for text content in input CSV files")
 
     args = parser.parse_args()
 
@@ -158,7 +171,7 @@ def main():
         if os.path.exists(output_file):
             print(f"Output file {output_file} already exists. Skipping...")
             continue
-        process_file(input_file, output_file, model, args.pause)
+        process_file(input_file, output_file, model, args.pause, text_col=args.text_col)
 
 if __name__ == "__main__":
     # This script is used to process ViHSD CSV files with Google Generative AI
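
To make the new <SuggestLabel> flow concrete, a hedged sketch of the round trip: tagging a batch the way process_file now does, and parsing the "5 comma-separated numbers per sentence" reply format the prompt asks for. parse_classifications is illustrative; this diff does not show how the repo actually parses model output.

def tag_batch(sentences, suggested_labels=None):
    """Mirror the batching in process_file: optional <SuggestLabel> tag per sentence."""
    if suggested_labels is None:
        return "\n".join(str(s) for s in sentences)
    return "\n".join(
        f"{s} <SuggestLabel>{label}</SuggestLabel>"
        for s, label in zip(sentences, suggested_labels)
    )

def parse_classifications(reply):
    """Hypothetical parser: one [individual, groups, religion/creed, race/ethnicity, politics] row per line."""
    rows = []
    for line in reply.strip().splitlines():
        parts = [p.strip() for p in line.split(",")]
        if len(parts) == 5 and all(p.isdigit() for p in parts):
            rows.append([int(p) for p in parts])
    return rows

# Example: suggested binary labels ride along inside the tags.
print(tag_batch(["câu 1", "câu 2"], suggested_labels=[0, 1]))
print(parse_classifications("0,1,0,0,0\n2,0,0,0,0"))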