Spaces:
Configuration error
Configuration error
File size: 4,734 Bytes
"""
Quality Filters Module
========================
Filter samples by word count, profanity, language,
and low-quality response detection.
"""
from dataclasses import dataclass, field
from typing import List, Optional
import re
import pandas as pd
@dataclass
class QualityFilterConfig:
    """Switches and thresholds controlling which quality filters are applied.

    Every filter is off by default: the word-count bounds are 0 and all
    boolean toggles are False.
    """

    # Word-count bounds; 0 means "no limit" for the respective bound.
    min_word_count: int = 0
    max_word_count: int = 0

    # Toggle for removing rows that contain profanity.
    profanity_filter: bool = False

    # Toggle for language filtering, plus the language codes to keep.
    # Each instance gets its own fresh list (default: ["en"]).
    language_filter: bool = False
    allowed_languages: List[str] = field(default_factory=lambda: ["en"])

    # Toggle for removing low-quality responses, plus the length threshold
    # handed to that filter.
    remove_low_quality: bool = False
    min_quality_length: int = 20
# ---------------------------------------------------------------------------
# Profanity word list (small built-in set, extend as needed)
# ---------------------------------------------------------------------------
_PROFANITY_WORDS = {
'fuck', 'shit', 'damn', 'ass', 'bitch', 'bastard', 'crap',
'dick', 'piss', 'slut', 'whore', 'cock',
}
# Generic filler/placeholder responses that indicate low quality
_GENERIC_RESPONSES = [
"i don't know",
"i am not sure",
"no comment",
"n/a",
"none",
"null",
"test",
"asdf",
"lorem ipsum",
"placeholder",
"todo",
"tbd",
]
def _word_count(text: str) -> int:
"""Count words in a text string."""
if not isinstance(text, str):
return 0
return len(text.split())
def filter_by_word_count(
    df: pd.DataFrame,
    col: str,
    min_words: int = 0,
    max_words: int = 0,
) -> pd.DataFrame:
    """Keep only the rows whose text in ``col`` has an acceptable word count.

    Args:
        df: Input frame; it is not modified.
        col: Name of the column holding the text.
        min_words: Minimum number of words required. Values <= 0 disable
            the lower bound.
        max_words: Maximum number of words allowed. Values <= 0 disable
            the upper bound.

    Returns:
        A new DataFrame containing the surviving rows, with a fresh
        0..n-1 index.
    """
    # Force an integer dtype so the comparisons below are well-defined even
    # when the frame is empty and ``apply`` had nothing to infer a dtype from.
    counts = df[col].apply(_word_count).astype("int64")

    # Build one mask that shares ``df``'s index. Re-selecting ``counts`` by
    # label after the first filter (as a two-step approach would) breaks on
    # frames whose index contains duplicate labels.
    keep = pd.Series(True, index=df.index)
    if min_words > 0:
        keep = keep & (counts >= min_words)
    if max_words > 0:
        keep = keep & (counts <= max_words)

    return df[keep].reset_index(drop=True)
def contains_profanity(text: str) -> bool:
    """Report whether ``text`` contains a word from the profanity list.

    The text is lowercased and split into ``\\w+`` tokens; a hit requires a
    token to equal a list entry exactly. Non-string input is never
    considered profane.
    """
    if isinstance(text, str):
        tokens = re.findall(r'\b\w+\b', text.lower())
        return not _PROFANITY_WORDS.isdisjoint(tokens)
    return False
def filter_profanity(
    df: pd.DataFrame,
    col: str,
) -> pd.DataFrame:
    """Remove rows whose text in ``col`` contains profanity.

    Args:
        df: Input frame; it is not modified.
        col: Name of the column holding the text.

    Returns:
        A new DataFrame without the flagged rows, with a fresh 0..n-1 index.
    """
    # Cast explicitly to bool: on an empty column ``apply`` can hand back a
    # Series with the column's own (often object) dtype, and a non-boolean
    # Series used as ``df[...]`` is interpreted as a column selection rather
    # than a row mask.
    flagged = df[col].apply(contains_profanity).astype(bool)
    return df[~flagged].reset_index(drop=True)
def detect_language(text: str) -> str:
    """
    Guess the language of ``text`` using the optional ``langdetect`` package.

    Returns whatever ``langdetect.detect`` reports (e.g. 'en', 'fr', 'de').
    Returns 'unknown' when the input is not a string, has fewer than 10
    characters after stripping, ``langdetect`` cannot be imported, or
    detection raises.
    """
    # Inputs that are not text or are too short are never passed to the detector.
    if not isinstance(text, str) or len(text.strip()) < 10:
        return 'unknown'

    try:
        from langdetect import detect
        return detect(text)
    except Exception:
        # Best effort: a missing package (ImportError) and detector failures
        # are both reported as 'unknown'.
        return 'unknown'
def filter_by_language(
    df: pd.DataFrame,
    col: str,
    allowed_langs: List[str] = None,
) -> pd.DataFrame:
    """Keep rows whose detected language is allowed or could not be determined.

    ``allowed_langs`` defaults to ``['en']`` when omitted. Rows for which
    detection yields 'unknown' are retained rather than dropped. The
    result has a fresh 0..n-1 index.
    """
    accepted = ['en'] if allowed_langs is None else allowed_langs
    detected = df[col].apply(detect_language)

    is_allowed = detected.isin(accepted)
    is_undetermined = detected.eq('unknown')
    return df[is_allowed | is_undetermined].reset_index(drop=True)
def is_low_quality(text: str, min_len: int = 20) -> bool:
    """
    Decide whether a response is low quality.

    A response is low quality when any of the following holds:
    - it is not a string;
    - it has fewer than ``min_len`` characters after stripping whitespace;
    - it begins (case-insensitively) with one of the generic placeholder
      phrases, and that phrase is not merely the prefix of a longer word.

    Args:
        text: The response to examine.
        min_len: Minimum stripped length, in characters, for a response
            to be acceptable.

    Returns:
        True if the response should be treated as low quality.
    """
    if not isinstance(text, str):
        return True

    stripped = text.strip()
    if len(stripped) < min_len:
        return True

    lowered = stripped.lower()
    for phrase in _GENERIC_RESPONSES:
        if not lowered.startswith(phrase):
            continue
        # Require a word boundary after the phrase so that ordinary text
        # such as "testing ...", "nullable ..." or "nonetheless ..." is not
        # mistaken for the placeholders "test", "null" or "none".
        following = lowered[len(phrase):len(phrase) + 1]
        if not following or not (following.isalnum() or following == '_'):
            return True
    return False
def filter_low_quality(
    df: pd.DataFrame,
    col: str,
    min_len: int = 20,
) -> pd.DataFrame:
    """Remove rows whose text in ``col`` is judged low quality.

    Args:
        df: Input frame; it is not modified.
        col: Name of the column holding the responses.
        min_len: Length threshold forwarded to ``is_low_quality``.

    Returns:
        A new DataFrame without the low-quality rows, with a fresh
        0..n-1 index.
    """
    # Cast explicitly to bool: on an empty column ``apply`` can hand back a
    # Series with the column's own (often object) dtype, and a non-boolean
    # Series used as ``df[...]`` is interpreted as a column selection rather
    # than a row mask.
    rejected = df[col].apply(lambda t: is_low_quality(t, min_len)).astype(bool)
    return df[~rejected].reset_index(drop=True)
def apply_quality_filters(
    df: pd.DataFrame,
    col: str,
    config: QualityFilterConfig,
) -> pd.DataFrame:
    """Run every filter enabled in ``config`` over ``col``.

    Filters run in a fixed order: word count, profanity, language, then
    low quality. When no filter is enabled the input frame is returned
    as-is.
    """
    result = df

    word_limits_set = config.min_word_count > 0 or config.max_word_count > 0
    if word_limits_set:
        result = filter_by_word_count(
            result,
            col,
            min_words=config.min_word_count,
            max_words=config.max_word_count,
        )

    if config.profanity_filter:
        result = filter_profanity(result, col)

    if config.language_filter:
        result = filter_by_language(
            result, col, allowed_langs=config.allowed_languages
        )

    if config.remove_low_quality:
        result = filter_low_quality(
            result, col, min_len=config.min_quality_length
        )

    return result
|