Spaces:
Configuration error
Configuration error
| """ | |
| Quality Filters Module | |
| ======================== | |
| Filter samples by word count, profanity, language, | |
| and low-quality response detection. | |
| """ | |
| from dataclasses import dataclass, field | |
| from typing import List, Optional | |
| import re | |
| import pandas as pd | |
@dataclass
class QualityFilterConfig:
    """Configuration for quality filters.

    Declared as a dataclass so the annotated defaults below become
    constructor keyword arguments and ``field(default_factory=...)`` yields
    a fresh list per instance.

    Attributes:
        min_word_count: Lower bound on words per sample; 0 means no lower bound.
        max_word_count: Upper bound on words per sample; 0 means no upper bound.
        profanity_filter: Whether rows containing profanity are removed.
        language_filter: Whether rows are filtered by detected language.
        allowed_languages: Language codes accepted by the language filter.
        remove_low_quality: Whether low-quality responses are removed.
        min_quality_length: Minimum stripped character length used by the
            low-quality check.
    """

    min_word_count: int = 0
    max_word_count: int = 0  # 0 = no limit
    profanity_filter: bool = False
    language_filter: bool = False
    allowed_languages: List[str] = field(default_factory=lambda: ["en"])
    remove_low_quality: bool = False
    min_quality_length: int = 20
| # --------------------------------------------------------------------------- | |
| # Profanity word list (small built-in set, extend as needed) | |
| # --------------------------------------------------------------------------- | |
| _PROFANITY_WORDS = { | |
| 'fuck', 'shit', 'damn', 'ass', 'bitch', 'bastard', 'crap', | |
| 'dick', 'piss', 'slut', 'whore', 'cock', | |
| } | |
| # Generic filler/placeholder responses that indicate low quality | |
| _GENERIC_RESPONSES = [ | |
| "i don't know", | |
| "i am not sure", | |
| "no comment", | |
| "n/a", | |
| "none", | |
| "null", | |
| "test", | |
| "asdf", | |
| "lorem ipsum", | |
| "placeholder", | |
| "todo", | |
| "tbd", | |
| ] | |
| def _word_count(text: str) -> int: | |
| """Count words in a text string.""" | |
| if not isinstance(text, str): | |
| return 0 | |
| return len(text.split()) | |
def filter_by_word_count(
    df: pd.DataFrame,
    col: str,
    min_words: int = 0,
    max_words: int = 0,
) -> pd.DataFrame:
    """Filter rows by word count in the given column.

    Args:
        df: Input frame; it is not modified.
        col: Name of the text column to measure.
        min_words: Keep rows with at least this many words; values <= 0
            disable the lower bound.
        max_words: Keep rows with at most this many words; values <= 0
            disable the upper bound.

    Returns:
        A new DataFrame containing the surviving rows, with a fresh
        0..n-1 index.
    """
    counts = pd.Series(
        [_word_count(value) for value in df[col]],
        index=df.index,
        dtype="int64",
    )
    keep = pd.Series(True, index=df.index, dtype=bool)
    if min_words > 0:
        keep = keep & (counts >= min_words)
    if max_words > 0:
        keep = keep & (counts <= max_words)
    # Apply one combined mask positionally: label-based re-selection of the
    # counts between the two steps breaks when the index has duplicate labels.
    return df.loc[keep.to_numpy()].reset_index(drop=True)
def contains_profanity(text: str) -> bool:
    """Return True when any whole word of *text* is in the profanity list.

    The text is lowercased and split into ``\\w+`` words, so matching is
    case-insensitive and only whole words count. Non-string input is
    reported as clean (False).
    """
    if isinstance(text, str):
        tokens = re.findall(r'\b\w+\b', text.lower())
        return not _PROFANITY_WORDS.isdisjoint(tokens)
    return False
def filter_profanity(
    df: pd.DataFrame,
    col: str,
) -> pd.DataFrame:
    """Remove rows containing profanity in the given column.

    Args:
        df: Input frame; it is not modified.
        col: Name of the text column to inspect.

    Returns:
        A new DataFrame without the offending rows, with a fresh 0..n-1
        index. An empty input keeps all of its columns.
    """
    # Build an explicitly boolean mask. ``Series.apply`` on an empty column
    # returns an object-dtype Series, which ``df[mask]`` would interpret as
    # an (empty) list of column labels and silently drop every column.
    flagged = pd.Series(
        [contains_profanity(value) for value in df[col]],
        index=df.index,
        dtype=bool,
    )
    return df.loc[~flagged.to_numpy()].reset_index(drop=True)
def detect_language(text: str) -> str:
    """
    Guess the language of *text* using the optional ``langdetect`` package.

    Returns whatever ``langdetect.detect`` reports for the text. The result
    is 'unknown' when the input is not a string, has fewer than 10
    characters after stripping, ``langdetect`` cannot be imported, or
    detection raises for any reason.
    """
    usable = isinstance(text, str) and len(text.strip()) >= 10
    if not usable:
        return 'unknown'
    try:
        # Imported lazily so the module still works without langdetect.
        from langdetect import detect
        return detect(text)
    except Exception:
        # Best effort: a missing dependency (ImportError) and detection
        # failures are both reported as 'unknown'.
        return 'unknown'
def filter_by_language(
    df: pd.DataFrame,
    col: str,
    allowed_langs: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Keep rows whose text is in an allowed language or undetectable.

    Rows for which ``detect_language`` returns 'unknown' are kept as well,
    so text whose language could not be determined is never dropped by
    this filter.

    Args:
        df: Input frame; it is not modified.
        col: Name of the text column to inspect.
        allowed_langs: Language codes to keep; defaults to ``['en']`` when
            None.

    Returns:
        A new DataFrame with the surviving rows and a fresh 0..n-1 index.
    """
    if allowed_langs is None:
        allowed_langs = ['en']
    # 'unknown' is always accepted, matching the detector's fallback value.
    accepted = set(allowed_langs) | {'unknown'}
    keep = pd.Series(
        [detect_language(value) in accepted for value in df[col]],
        index=df.index,
        dtype=bool,
    )
    return df.loc[keep.to_numpy()].reset_index(drop=True)
def is_low_quality(text: str, min_len: int = 20) -> bool:
    """
    Check if a response is low-quality.

    A response is low-quality when it is:
    - not a string,
    - shorter than ``min_len`` characters after stripping whitespace, or
    - equal to, or starting with, a generic/placeholder phrase.

    A leading phrase only counts when it ends at a word boundary, so
    "test - ignore this entry please" is flagged while
    "testing showed three regressions" is not.

    Args:
        text: The response to evaluate.
        min_len: Minimum stripped length for an acceptable response.

    Returns:
        True if the response should be treated as low-quality.
    """
    if not isinstance(text, str):
        return True
    stripped = text.strip()
    if len(stripped) < min_len:
        return True
    lowered = stripped.lower()
    for phrase in _GENERIC_RESPONSES:
        if not lowered.startswith(phrase):
            continue
        remainder = lowered[len(phrase):]
        # Empty remainder = exact match. Otherwise the phrase must not be
        # the prefix of a longer word (e.g. "none" in "nonetheless").
        if not remainder or not remainder[0].isalnum():
            return True
    return False
def filter_low_quality(
    df: pd.DataFrame,
    col: str,
    min_len: int = 20,
) -> pd.DataFrame:
    """Remove low-quality responses.

    Args:
        df: Input frame; it is not modified.
        col: Name of the text column to inspect.
        min_len: Minimum stripped length passed to ``is_low_quality``.

    Returns:
        A new DataFrame without the low-quality rows, with a fresh 0..n-1
        index. An empty input keeps all of its columns.
    """
    # Build an explicitly boolean mask. ``Series.apply`` on an empty column
    # returns an object-dtype Series, which ``df[mask]`` would interpret as
    # an (empty) list of column labels and silently drop every column.
    rejected = pd.Series(
        [is_low_quality(value, min_len) for value in df[col]],
        index=df.index,
        dtype=bool,
    )
    return df.loc[~rejected.to_numpy()].reset_index(drop=True)
def apply_quality_filters(
    df: pd.DataFrame,
    col: str,
    config: QualityFilterConfig,
) -> pd.DataFrame:
    """Run every filter enabled in *config* over column *col* of *df*.

    Filters run in a fixed order: word count, profanity, language, then
    low-quality removal. Each enabled step receives the output of the
    previous one. When nothing is enabled the input frame is returned
    unchanged.
    """
    word_limits_set = config.min_word_count > 0 or config.max_word_count > 0
    # (enabled?, step) pairs, listed in execution order.
    pipeline = (
        (
            word_limits_set,
            lambda frame: filter_by_word_count(
                frame, col, config.min_word_count, config.max_word_count
            ),
        ),
        (
            config.profanity_filter,
            lambda frame: filter_profanity(frame, col),
        ),
        (
            config.language_filter,
            lambda frame: filter_by_language(frame, col, config.allowed_languages),
        ),
        (
            config.remove_low_quality,
            lambda frame: filter_low_quality(frame, col, config.min_quality_length),
        ),
    )
    result = df
    for enabled, step in pipeline:
        if enabled:
            result = step(result)
    return result