Spaces:

aneeb15
/

Auto-FineTune-Ops

Configuration error

App Files Files Community

Auto-FineTune-Ops / preprocessing /text_cleaning.py

aneeb15

Initial release of Auto-FineTune-Ops

d4398e6 4 months ago

raw

history blame contribute delete

3.64 kB

	"""
	Text Cleaning Module
	=====================
	Pure functions for text preprocessing toggles.
	Each function operates on a single string; clean_text() composes the
	enabled steps and apply_text_cleaning() runs it over DataFrame columns.
	"""

	import re
	import unicodedata
	from dataclasses import dataclass
	from typing import List
	import pandas as pd


	@dataclass
	class TextCleaningConfig:
	    """Boolean switches selecting which cleaning steps clean_text() runs.

	    Only whitespace normalization and line-break stripping are enabled by
	    default; every other step is opt-in.
	    """

	    # Strip HTML tags.
	    remove_html: bool = False
	    # Delete http/https/ftp/www URLs.
	    remove_urls: bool = False
	    # Delete emoji characters.
	    remove_emojis: bool = False
	    # Collapse runs of spaces/tabs and trim the ends.
	    normalize_whitespace: bool = True
	    # Convert the text to lowercase.
	    lowercase: bool = False
	    # Drop characters outside the allowed letter/digit/punctuation set.
	    remove_special_chars: bool = False
	    # Reduce runs of three or more newlines to two.
	    strip_extra_linebreaks: bool = True


	# ---------------------------------------------------------------------------
	# Individual cleaning functions
	# ---------------------------------------------------------------------------

	def remove_html_tags(text: str) -> str:
	    """Strip HTML/XML markup from text, keeping the enclosed content.

	    Two kinds of construct are removed:

	    * comments (``<!-- ... -->``), matched as a whole even when they
	      contain ``>`` or span several lines;
	    * tags, i.e. ``<`` followed by an optional ``/``, ``!`` or ``?`` and
	      then an ASCII letter, up to the next ``>``.

	    Requiring a letter right after ``<`` keeps comparison operators intact:
	    ``a < b and c > d`` is returned unchanged, whereas a catch-all
	    ``<[^>]+>`` pattern would delete everything between the two operators.

	    This is a regex heuristic, not an HTML parser: character entities are
	    not decoded and the text inside ``<script>``/``<style>`` elements is
	    kept.
	    """
	    return re.sub(
	        r'<!--.*?-->|<[/!?]?[A-Za-z][^>]*>',
	        '',
	        text,
	        flags=re.DOTALL,  # let a comment body span line breaks
	    )


	def remove_urls(text: str) -> str:
	    """Remove URLs (http, https, ftp, www) from text.

	    A URL is taken to be ``http://``, ``https://``, ``ftp://`` or ``www.``
	    followed by every character up to the next whitespace.  The match is
	    replaced with an empty string, so the whitespace around a removed URL
	    is left as it was.

	    The alternatives are joined with an unescaped ``|``; an escaped pipe
	    would be a literal character and the pattern would then only match
	    three URLs glued together by pipes.
	    """
	    return re.sub(
	        r'https?://\S+|ftp://\S+|www\.\S+',
	        '',
	        text,
	    )


	# One or more consecutive emoji code points.
	#
	# The ranges are listed explicitly rather than as one span from U+24C2 to
	# U+1F251: such a span also contains CJK ideographs, kana, Hangul and many
	# other scripts, so ordinary text would be deleted together with the emoji.
	# Every code point below lies inside the ranges this pattern used to cover,
	# so nothing is removed now that was not removed before.
	_EMOJI_PATTERN = re.compile(
	    "["
	    "\U0001F600-\U0001F64F"  # emoticons
	    "\U0001F300-\U0001F5FF"  # symbols & pictographs
	    "\U0001F680-\U0001F6FF"  # transport & map symbols
	    "\U0001F900-\U0001F9FF"  # supplemental symbols
	    "\U0001FA00-\U0001FAFF"  # chess symbols, symbols & pictographs ext-A
	    "\U0001F000-\U0001F251"  # tiles, cards, enclosed chars, flags
	    "\U00002600-\U000026FF"  # miscellaneous symbols
	    "\U00002702-\U000027B0"  # dingbats
	    "\U000024C2"             # circled M
	    "\U000025AA-\U000025AB"  # small squares
	    "\U000025B6"             # play button
	    "\U000025C0"             # reverse button
	    "\U000025FB-\U000025FE"  # medium squares
	    "\U00002934-\U00002935"  # curved arrows
	    "\U00002B05-\U00002B07"  # directional arrows
	    "\U00002B1B-\U00002B1C"  # large squares
	    "\U00002B50"             # star
	    "\U00002B55"             # hollow circle
	    "\U00003030"             # wavy dash
	    "\U0000303D"             # part alternation mark
	    "\U00003297"             # circled ideograph "congratulation"
	    "\U00003299"             # circled ideograph "secret"
	    "\U0000FE0F"             # variation selector-16 (emoji presentation)
	    "]+",
	    flags=re.UNICODE,
	)


	def remove_emojis(text: str) -> str:
	    """Remove emoji characters from text.

	    Every run of code points matched by ``_EMOJI_PATTERN`` is deleted;
	    all other characters, including non-Latin letters, are returned
	    unchanged.  Surrounding whitespace is not adjusted.
	    """
	    return _EMOJI_PATTERN.sub('', text)


	def normalize_whitespace(text: str) -> str:
	    """Squeeze horizontal whitespace and trim both ends of text.

	    Each run of whitespace characters other than the newline becomes a
	    single space, so line breaks inside the text are preserved.  The
	    result is then stripped of leading and trailing whitespace, newlines
	    included.
	    """
	    collapsed = re.sub(r'[^\S\n]+', ' ', text)
	    return collapsed.strip()


	def to_lowercase(text: str) -> str:
	    """Return a lowercased copy of text, as produced by ``str.lower()``."""
	    lowered = text.lower()
	    return lowered


	def remove_special_characters(text: str, *, ascii_only: bool = True) -> str:
	    """Keep only alphanumeric, basic punctuation, and whitespace.

	    The punctuation kept is: period, comma, exclamation mark, question
	    mark, semicolon, colon, single and double quotes, parentheses and
	    hyphen.  Everything else, underscore included, is deleted.

	    Args:
	        text: The string to filter.
	        ascii_only: When True (the default), only the ASCII letters A-Z,
	            a-z and digits 0-9 count as alphanumeric, so accented and
	            non-Latin letters are removed.  When False, any Unicode
	            letter or digit is kept, which makes the filter usable on
	            non-English text.

	    Returns:
	        The text with all disallowed characters removed.
	    """
	    if ascii_only:
	        pattern = r'[^a-zA-Z0-9\s.,!?;:\'"()\-\n]'
	    else:
	        # \w also matches "_", which the ASCII mode removes; the extra
	        # alternative keeps the two modes consistent on that point.
	        pattern = r'[^\w\s.,!?;:\'"()\-]|_'
	    return re.sub(pattern, '', text)


	def strip_extra_linebreaks(text: str) -> str:
	    """Reduce three or more consecutive newlines to two.

	    Lines that contain nothing but spaces, tabs or other non-newline
	    whitespace are treated as empty, so a run of visually blank lines is
	    collapsed even when the lines are not strictly empty.  This matters
	    after normalize_whitespace(), which turns an indented blank line into
	    a line holding a single space rather than removing it.

	    Runs of fewer than three newlines are left untouched.
	    """
	    # A newline followed by at least two more "optional horizontal
	    # whitespace + newline" groups, i.e. three or more line breaks.
	    return re.sub(r'\n(?:[^\S\n]*\n){2,}', '\n\n', text)


	# ---------------------------------------------------------------------------
	# Composed cleaner
	# ---------------------------------------------------------------------------

	def clean_text(text: str, config: TextCleaningConfig) -> str:
	    """Apply all enabled cleaning steps to a single text value.

	    Args:
	        text: The value to clean.  Normally a string, but cells taken from
	            a DataFrame may also be None, NaN, pd.NA or other objects.
	        config: Flags selecting which cleaning steps run.

	    Returns:
	        The cleaned string.  A missing value (None, NaN, pd.NA, NaT)
	        yields ''.  Any other non-string value is returned as str(value)
	        without further cleaning.

	    Steps run in a fixed order: HTML tags, URLs, emojis, special
	    characters, lowercasing, whitespace normalization, line-break
	    stripping.
	    """
	    if not isinstance(text, str):
	        # Detect missing values explicitly instead of relying on
	        # truthiness: NaN is truthy and would become the literal "nan",
	        # 0 and False are falsy and would be dropped, and pd.NA raises
	        # when its truth value is evaluated.
	        if pd.api.types.is_scalar(text) and pd.isna(text):
	            return ''
	        return str(text)

	    if config.remove_html:
	        text = remove_html_tags(text)
	    if config.remove_urls:
	        text = remove_urls(text)
	    if config.remove_emojis:
	        text = remove_emojis(text)
	    if config.remove_special_chars:
	        text = remove_special_characters(text)
	    if config.lowercase:
	        text = to_lowercase(text)
	    # The whitespace steps run after the removal steps, so gaps left by
	    # deleted tags, URLs or emojis are collapsed as well.
	    if config.normalize_whitespace:
	        text = normalize_whitespace(text)
	    if config.strip_extra_linebreaks:
	        text = strip_extra_linebreaks(text)

	    return text


	def apply_text_cleaning(
	    df: pd.DataFrame,
	    columns: List[str],
	    config: TextCleaningConfig,
	) -> pd.DataFrame:
	    """Apply text cleaning to specified columns of a DataFrame.

	    Args:
	        df: Source frame.  It is copied first and never modified.
	        columns: Names of the columns to clean.  A single column name
	            given as a plain string is accepted too.  Names that are not
	            present in the frame are skipped.
	        config: Cleaning options passed to clean_text() for every cell.

	    Returns:
	        A copy of df in which each listed column that exists has been
	        replaced by the result of clean_text() on every value.
	    """
	    if isinstance(columns, str):
	        # A bare string would otherwise be iterated character by
	        # character instead of being treated as one column name.
	        columns = [columns]

	    cleaned = df.copy()
	    for col in columns:
	        if col in cleaned.columns:
	            cleaned[col] = cleaned[col].apply(lambda t: clean_text(t, config))
	    return cleaned