| import pandas as pd |
| import re |
| from bs4 import BeautifulSoup |
| from tqdm import tqdm |
| import logging |
| from pathlib import Path |
|
|
# Compiled once at import time: clean_text runs once per row under
# DataFrame.apply, so hoisting pattern compilation out of the call is a
# cheap, behavior-neutral win.
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_SPECIAL_RE = re.compile(r'[^\w\s.,!?-]')
_REPEAT_PUNCT_RE = re.compile(r'([.,!?])\1+')
_SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([.,!?])')


def clean_text(text):
    """Clean text by removing URLs, HTML tags, and special characters.

    The input is coerced to ``str`` first, so non-string values (numbers,
    None, NaN) are handled without raising.

    Args:
        text: The raw comment value to clean (any type).

    Returns:
        The cleaned string. On any internal failure the error is logged
        and whatever partially-cleaned text we had so far is returned,
        so a single bad row never aborts a whole cleaning pass.
    """
    try:
        text = str(text)

        # Strip http(s) URLs first so their punctuation never survives
        # into the later passes.
        text = _URL_RE.sub('', text)

        # Remove HTML markup, keeping only the visible text.
        text = BeautifulSoup(text, "html.parser").get_text()

        # Replace anything outside word chars / whitespace / basic
        # punctuation with a space.
        text = _SPECIAL_RE.sub(' ', text)

        # Collapse all whitespace runs (incl. newlines/tabs) to single spaces.
        text = ' '.join(text.split())

        # Squash repeated punctuation: "!!!" -> "!".
        text = _REPEAT_PUNCT_RE.sub(r'\1', text)

        # Remove spaces sitting in front of punctuation: "word ." -> "word.".
        text = _SPACE_BEFORE_PUNCT_RE.sub(r'\1', text)

        return text.strip()
    except Exception as e:
        # Deliberate best-effort fallback: log and return the text as-is
        # (possibly partially cleaned) rather than failing the pipeline.
        logging.error(f"Error cleaning text: {str(e)}")
        return text
|
|
def try_read_csv(file_path):
    """Read a CSV file, trying a sequence of common encodings in order.

    Args:
        file_path: Path to the CSV file (str or Path).

    Returns:
        pandas.DataFrame parsed with the first encoding that works.

    Raises:
        FileNotFoundError: If the file does not exist — retrying other
            encodings cannot help, so this propagates immediately instead
            of being masked by the generic ValueError below.
        ValueError: If no candidate encoding could read the file.
    """
    # Note: 'latin1' can decode any byte sequence, so UnicodeDecodeError
    # effectively only skips the 'utf-8' attempt; the later entries guard
    # against other per-encoding read failures.
    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']

    for encoding in encodings:
        try:
            print(f"Trying {encoding} encoding...")
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            # Decode failure: the next candidate encoding may still work.
            continue
        except FileNotFoundError:
            # A missing file will not be fixed by another encoding; fail
            # fast with the accurate exception rather than a misleading
            # "no encoding worked" error after four pointless retries.
            raise
        except Exception as e:
            print(f"Error with {encoding}: {str(e)}")
            continue

    raise ValueError("Could not read file with any of the attempted encodings")
|
|
def clean_dataset(input_path, output_path=None):
    """Clean comment text in a dataset.

    Reads ``input_path`` as CSV, cleans its ``comment_text`` column with
    ``clean_text`` (falling back to the first column whose name contains
    'text' or 'comment'), drops rows whose cleaned text is empty, and
    writes the result to ``output_path``.

    Args:
        input_path: Path to the input CSV.
        output_path: Destination CSV path; defaults to
            ``<input stem>_cleaned.csv`` next to the input.

    Returns:
        The output path on success, or None when an error occurred
        (errors are printed, not raised, matching the script's
        best-effort style).
    """
    print(f"\nReading input file: {input_path}")

    if output_path is None:
        output_path = str(Path(input_path).with_suffix('').with_name(f"{Path(input_path).stem}_cleaned.csv"))

    try:
        df = try_read_csv(input_path)
        total_rows = len(df)

        print(f"\nDataset Info:")
        print(f"Initial Rows: {total_rows:,}")
        print(f"Columns: {', '.join(df.columns)}")

        # Fall back to the first column that looks like free text when the
        # expected 'comment_text' column is absent.
        if 'comment_text' not in df.columns:
            text_columns = [col for col in df.columns if 'text' in col.lower() or 'comment' in col.lower()]
            if text_columns:
                print(f"\nUsing '{text_columns[0]}' as comment column")
                df['comment_text'] = df[text_columns[0]]
            else:
                raise ValueError("Could not find comment text column")

        # Snapshot the raw text so the before/after sample below can show a
        # genuine "original" — the column is cleaned in place, and the old
        # code zipped the cleaned column with itself, printing identical
        # "Original" and "Cleaned" lines.
        original_text = df['comment_text'].copy()

        print("\nCleaning comments...")
        tqdm.pandas()
        df['comment_text'] = df['comment_text'].progress_apply(clean_text)

        # Drop rows whose text cleaned down to nothing.
        non_empty_mask = df['comment_text'].str.strip().str.len() > 0
        df = df[non_empty_mask]

        print(f"\nSaving to: {output_path}")
        df.to_csv(output_path, index=False, encoding='utf-8')

        print(f"\n✓ Successfully cleaned comments")
        print(f"Initial rows: {total_rows:,}")
        print(f"Final rows: {len(df):,}")
        print(f"Removed empty rows: {total_rows - len(df):,}")
        print(f"Output file: {output_path}")
        print(f"Output file size: {Path(output_path).stat().st_size / (1024*1024):.1f} MB")

        print("\nSample of cleaned comments:")
        # Index into the snapshot by label so the pairs line up even after
        # empty rows were dropped above.
        for i, idx in enumerate(df.index[:3]):
            orig = str(original_text.loc[idx])
            cleaned = str(df['comment_text'].loc[idx])
            print(f"\nExample {i+1}:")
            print(f"Original : {orig[:100]}...")
            print(f"Cleaned : {cleaned[:100]}...")

        return output_path

    except Exception as e:
        print(f"\n❌ Error: {str(e)}")
        return None
|
|
if __name__ == "__main__":
    # Default locations for the raw scrape and its cleaned counterpart.
    source_csv = "dataset/raw/english-trash.csv"
    destination_csv = "dataset/raw/english-comments-cleaned.csv"
    clean_dataset(source_csv, destination_csv)