Spaces:
Sleeping
Sleeping
File size: 1,616 Bytes
b8300d6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | import os
import base64
import pandas as pd
from pathlib import Path
def _clean_word_lines(lines, skip_comments=False):
    """Return the stripped, non-empty entries from an iterable of text lines.

    When ``skip_comments`` is true, entries whose stripped text starts
    with ``#`` are dropped as well.
    """
    words = []
    for raw in lines:
        word = raw.strip()
        if not word:
            continue
        if skip_comments and word.startswith("#"):
            continue
        words.append(word)
    return words


def export_badwords_to_excel(output_filename="custom_badwords_dataset.xlsx"):
    """Collect the offensive-term lists under ``data/`` and export them to Excel.

    Three optional sources are read, in this order; a missing file is
    silently skipped:

    * ``data/telugu_badwords.txt`` -- one term per line.
    * ``data/secure_words.bin`` -- base64-encoded text, one term per line
      once decoded.
    * ``data/bad_emojis.txt`` -- one term per line; lines starting with
      ``#`` are ignored.

    Duplicates are removed while keeping the first occurrence, so the row
    order of the exported sheet is the same on every run. The sheet has a
    ``text`` column with the terms and a ``label`` column whose value is
    ``'toxic'`` on every row.

    Args:
        output_filename: Name of the workbook, resolved relative to the
            ``data`` directory.

    Returns:
        None. Progress is reported on stdout; when no terms are found
        nothing is written.

    Raises:
        binascii.Error: If ``secure_words.bin`` is not valid base64.
        UnicodeDecodeError: If any source is not valid UTF-8.
    """
    data_dir = Path("data")
    toxic_words = []

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM.
    # str.strip() does not remove U+FEFF, so with plain "utf-8" a BOM would
    # stay glued to the first term of the file.

    # 1. Load regular badwords
    plain_path = data_dir / "telugu_badwords.txt"
    if plain_path.exists():
        with open(plain_path, "r", encoding="utf-8-sig") as f:
            toxic_words.extend(_clean_word_lines(f))

    # 2. Load secure base64 badwords
    secure_path = data_dir / "secure_words.bin"
    if secure_path.exists():
        with open(secure_path, "rb") as f:
            decoded = base64.b64decode(f.read()).decode("utf-8-sig")
        toxic_words.extend(_clean_word_lines(decoded.splitlines()))

    # 3. Load bad emojis (this file may carry '#' comment lines)
    emoji_path = data_dir / "bad_emojis.txt"
    if emoji_path.exists():
        with open(emoji_path, "r", encoding="utf-8-sig") as f:
            toxic_words.extend(_clean_word_lines(f, skip_comments=True))

    # Remove duplicates. dict.fromkeys keeps first-seen order, whereas
    # list(set(...)) yields an order that changes from run to run.
    toxic_words = list(dict.fromkeys(toxic_words))
    print(f"Total unique offensive terms gathered: {len(toxic_words)}")

    if not toxic_words:
        print("No words found to export.")
        return

    # Every gathered term is exported as-is with the constant label 'toxic'.
    df = pd.DataFrame({
        'text': toxic_words,
        'label': 'toxic',
    })

    # Save to Excel; create the target folder in case output_filename
    # contains a sub-directory.
    output_path = data_dir / output_filename
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_excel(output_path, index=False)
    print(f"Successfully exported {len(toxic_words)} words to {output_path}")
# Script entry point: when this file is executed directly (not imported),
# run the export with the default output filename.
if __name__ == "__main__":
    export_badwords_to_excel()
|