File size: 4,734 Bytes
d4398e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""

Quality Filters Module

========================

Filter samples by word count, profanity, language,

and low-quality response detection.

"""

from dataclasses import dataclass, field
from typing import List, Optional
import re
import pandas as pd


@dataclass
class QualityFilterConfig:
    """Switches and thresholds consumed by ``apply_quality_filters``.

    Attributes:
        min_word_count: Lower bound on words per sample; a value of 0 (or
            any non-positive value) disables the lower bound.
        max_word_count: Upper bound on words per sample; 0 means no limit.
        profanity_filter: When true, drop samples containing a listed
            profanity word.
        language_filter: When true, keep only samples whose detected
            language is in ``allowed_languages`` (undetectable text is kept).
        allowed_languages: Language codes accepted by the language filter.
            Each instance gets its own ``["en"]`` list by default.
        remove_low_quality: When true, drop short or placeholder responses.
        min_quality_length: Minimum stripped character length used by the
            low-quality check.
    """

    # --- word-count bounds -------------------------------------------------
    min_word_count: int = 0
    max_word_count: int = 0  # 0 = no limit

    # --- content filters ---------------------------------------------------
    profanity_filter: bool = False
    language_filter: bool = False
    allowed_languages: List[str] = field(default_factory=lambda: ["en"])

    # --- low-quality detection ---------------------------------------------
    remove_low_quality: bool = False
    min_quality_length: int = 20


# ---------------------------------------------------------------------------
# Built-in profanity vocabulary.
# Deliberately small; add entries here to widen the profanity filter.
# Entries are lowercase because text is lowercased before matching.
# ---------------------------------------------------------------------------
_PROFANITY_WORDS = {
    'ass',
    'bastard',
    'bitch',
    'cock',
    'crap',
    'damn',
    'dick',
    'fuck',
    'piss',
    'shit',
    'slut',
    'whore',
}

# Lowercase filler / placeholder phrases; a response that begins with one of
# these is treated as low quality.
_GENERIC_RESPONSES = [
    # non-answers
    "i don't know",
    "i am not sure",
    "no comment",
    # empty-value markers
    "n/a",
    "none",
    "null",
    # test / dummy input
    "test",
    "asdf",
    "lorem ipsum",
    # unfinished content
    "placeholder",
    "todo",
    "tbd",
]


def _word_count(text: str) -> int:
    """Return the number of whitespace-separated tokens in ``text``.

    Anything that is not a ``str`` (``None``, NaN, numbers, ...) counts as
    zero words.
    """
    return len(text.split()) if isinstance(text, str) else 0


def filter_by_word_count(
    df: pd.DataFrame,
    col: str,
    min_words: int = 0,
    max_words: int = 0,
) -> pd.DataFrame:
    """Keep rows whose text in ``col`` has a word count within the bounds.

    Args:
        df: Input frame; it is not modified.
        col: Name of the text column to measure.
        min_words: Minimum word count (inclusive). Non-positive disables
            the lower bound.
        max_words: Maximum word count (inclusive). Non-positive disables
            the upper bound.

    Returns:
        A new DataFrame containing the surviving rows, with the index
        reset to 0..n-1.
    """

    def _in_bounds(value) -> bool:
        # Non-string cells count as zero words (see ``_word_count``).
        n_words = _word_count(value)
        if min_words > 0 and n_words < min_words:
            return False
        if max_words > 0 and n_words > max_words:
            return False
        return True

    # Both bounds are folded into one positional boolean mask. Re-aligning
    # the counts by index label between the two checks breaks when the
    # frame's index contains duplicate labels.
    keep = pd.Series([_in_bounds(value) for value in df[col]], dtype=bool).to_numpy()
    return df[keep].reset_index(drop=True)


def contains_profanity(text: str) -> bool:
    """Return True when any whole word of ``text`` is a listed profanity.

    The text is lowercased and split into ``\\w+`` tokens, so only complete
    words match (a listed word embedded in a longer word does not).
    Non-string input is never considered profane.
    """
    if not isinstance(text, str):
        return False
    tokens = re.findall(r'\b\w+\b', text.lower())
    # "not disjoint" is the same test as "intersection is non-empty".
    return not _PROFANITY_WORDS.isdisjoint(tokens)


def filter_profanity(
    df: pd.DataFrame,
    col: str,
) -> pd.DataFrame:
    """Drop every row whose text in ``col`` contains a profanity word.

    Args:
        df: Input frame; it is not modified.
        col: Name of the text column to inspect.

    Returns:
        A new DataFrame without the offending rows, index reset to 0..n-1.
        All columns are preserved, including when ``df`` has no rows.
    """
    # Build an explicitly boolean, positional mask. Inverting the result of
    # ``Series.apply`` is fragile on an empty frame: the result keeps the
    # column's object dtype, which pandas does not treat as a boolean mask,
    # so the selection can lose the frame's columns.
    keep = pd.Series(
        [not contains_profanity(value) for value in df[col]], dtype=bool
    ).to_numpy()
    return df[keep].reset_index(drop=True)


def detect_language(text: str) -> str:
    """Best-effort language detection for a single text.

    Returns the language code reported by ``langdetect`` (e.g. 'en', 'fr',
    'de'). Returns 'unknown' when the input is not a string, has fewer than
    10 characters after stripping, ``langdetect`` is not installed, or
    detection raises for any reason.
    """
    try:
        from langdetect import detect

        long_enough = isinstance(text, str) and len(text.strip()) >= 10
        return detect(text) if long_enough else 'unknown'
    except Exception:
        # Covers ImportError (langdetect missing) as well as detection errors.
        return 'unknown'


def filter_by_language(
    df: pd.DataFrame,
    col: str,
    allowed_langs: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Keep only rows whose text in ``col`` is in an allowed language.

    Rows whose language comes back as 'unknown' (too short, not a string,
    or detection unavailable) are kept rather than discarded.

    Args:
        df: Input frame; it is not modified.
        col: Name of the text column to inspect.
        allowed_langs: Accepted language codes. ``None`` means ``['en']``.

    Returns:
        A new DataFrame with the surviving rows, index reset to 0..n-1.
    """
    if allowed_langs is None:
        allowed_langs = ['en']

    # One detected code per row, held positionally so the mask lines up with
    # the frame regardless of what its index labels look like.
    detected = pd.Series([detect_language(value) for value in df[col]], dtype=object)
    keep = (detected.isin(allowed_langs) | (detected == 'unknown')).to_numpy()
    return df[keep].reset_index(drop=True)


def is_low_quality(text: str, min_len: int = 20) -> bool:
    """Decide whether a response looks like a low-quality answer.

    A response is low quality when any of the following holds:

    - it is not a string;
    - after stripping whitespace it is shorter than ``min_len`` characters;
    - lowercased, it begins with one of the generic placeholder phrases
      and the phrase ends on a word boundary (end of text, or followed by
      a character that is neither alphanumeric nor an underscore).

    The boundary requirement keeps ordinary words that merely share a
    prefix with a placeholder from being flagged: "nonetheless, ..." is not
    a match for "none", while "none of the above ..." still is.

    Args:
        text: The response to inspect.
        min_len: Minimum acceptable stripped length, in characters.

    Returns:
        True if the response should be treated as low quality.
    """
    if not isinstance(text, str):
        return True

    stripped = text.strip()
    if len(stripped) < min_len:
        return True

    lowered = stripped.lower()
    for phrase in _GENERIC_RESPONSES:
        if not lowered.startswith(phrase):
            continue
        remainder = lowered[len(phrase):]
        # Exact match, or the phrase is followed by something that cannot
        # continue the same word (space, punctuation, ...).
        if not remainder or not (remainder[0].isalnum() or remainder[0] == "_"):
            return True
    return False


def filter_low_quality(
    df: pd.DataFrame,
    col: str,
    min_len: int = 20,
) -> pd.DataFrame:
    """Drop rows whose text in ``col`` is judged low quality.

    Args:
        df: Input frame; it is not modified.
        col: Name of the text column to inspect.
        min_len: Minimum stripped character length passed to
            ``is_low_quality``.

    Returns:
        A new DataFrame without the low-quality rows, index reset to
        0..n-1. All columns are preserved, including when ``df`` has no
        rows.
    """
    # Build an explicitly boolean, positional mask. Inverting the result of
    # ``Series.apply`` is fragile on an empty frame: the result keeps the
    # column's object dtype, which pandas does not treat as a boolean mask,
    # so the selection can lose the frame's columns.
    keep = pd.Series(
        [not is_low_quality(value, min_len) for value in df[col]], dtype=bool
    ).to_numpy()
    return df[keep].reset_index(drop=True)


def apply_quality_filters(
    df: pd.DataFrame,
    col: str,
    config: QualityFilterConfig,
) -> pd.DataFrame:
    """Run each filter enabled in ``config`` over ``df[col]``.

    Filters run in a fixed order: word count, profanity, language, then
    low-quality detection. Each stage receives the output of the previous
    one. When no filter is enabled the input frame is returned unchanged.

    Args:
        df: Frame holding the samples.
        col: Name of the text column every filter inspects.
        config: Which filters to run and with what thresholds.

    Returns:
        The filtered DataFrame.
    """
    lower, upper = config.min_word_count, config.max_word_count

    # (enabled?, stage) pairs, listed in execution order.
    stages = (
        (
            lower > 0 or upper > 0,
            lambda frame: filter_by_word_count(frame, col, lower, upper),
        ),
        (
            config.profanity_filter,
            lambda frame: filter_profanity(frame, col),
        ),
        (
            config.language_filter,
            lambda frame: filter_by_language(frame, col, config.allowed_languages),
        ),
        (
            config.remove_low_quality,
            lambda frame: filter_low_quality(frame, col, config.min_quality_length),
        ),
    )

    for enabled, stage in stages:
        if enabled:
            df = stage(df)
    return df