Spaces:
Sleeping
Sleeping
Update src/predict.py
Browse files- src/predict.py +65 -10
src/predict.py
CHANGED
|
@@ -123,12 +123,68 @@ def preprocess_text(text: str) -> str:
|
|
| 123 |
def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
|
| 124 |
"""Compute keyword-based score for enhanced accuracy"""
|
| 125 |
text_lower = text.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
high_risk_matches = sum(1 for kw in HIGH_RISK_KEYWORDS if kw in text_lower)
|
| 130 |
|
| 131 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
context_patterns = [
|
| 133 |
r'(?i)(pick.*up|got.*stuff|meet.*behind)',
|
| 134 |
r'(?i)(payment|crypto|cash.*deal)',
|
|
@@ -137,10 +193,8 @@ def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
|
|
| 137 |
r'(?i)(quality.*good|pure.*stuff)',
|
| 138 |
r'(?i)(cops.*around|too.*risky)'
|
| 139 |
]
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
# Enhanced scoring with weights
|
| 144 |
keyword_score = 0.0
|
| 145 |
if high_risk_matches > 0:
|
| 146 |
keyword_score += min(high_risk_matches * 0.3, 0.8)
|
|
@@ -148,15 +202,16 @@ def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
|
|
| 148 |
keyword_score += min(drug_matches * 0.1, 0.3)
|
| 149 |
if context_matches > 0:
|
| 150 |
keyword_score += min(context_matches * 0.15, 0.4)
|
| 151 |
-
|
| 152 |
keyword_score = min(keyword_score, 1.0)
|
| 153 |
-
|
| 154 |
return keyword_score, {
|
| 155 |
'drug_keywords': drug_matches,
|
| 156 |
'high_risk_keywords': high_risk_matches,
|
| 157 |
'context_patterns': context_matches
|
| 158 |
}
|
| 159 |
|
|
|
|
| 160 |
# =======================
|
| 161 |
# Config validation/fix with enhanced error handling
|
| 162 |
# =======================
|
|
|
|
| 123 |
def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
|
| 124 |
"""Compute keyword-based score for enhanced accuracy"""
|
| 125 |
text_lower = text.lower()
|
| 126 |
+
|
| 127 |
+
# =======================
|
| 128 |
+
# Enhanced text preprocessing for better accuracy
|
| 129 |
+
# =======================
|
| 130 |
+
def preprocess_text(text: str) -> str:
    """Normalize raw message text before it is scored or fed to the model.

    Steps, in order: lowercase, collapse every whitespace run to a single
    space, expand a fixed set of chat abbreviations, and squeeze repeated
    ``!``, ``?`` and ``.`` characters.

    Args:
        text: Raw input text. Any falsy value (``""`` or ``None``) yields ``""``.

    Returns:
        The normalized text with leading/trailing whitespace removed.
    """
    if not text:
        return ""

    text = text.lower()

    # Collapse runs of whitespace (including newlines) into one space.
    text = re.sub(r'\s+', ' ', text)

    # Abbreviations made only of word characters can rely on \b boundaries.
    abbreviations = {
        'u': 'you',
        'ur': 'your',
        'n': 'and',
        'thru': 'through',
        'gonna': 'going to',
        'wanna': 'want to',
        'gotta': 'got to',
    }
    for abbrev, full in abbreviations.items():
        text = re.sub(rf'\b{re.escape(abbrev)}\b', full, text)

    # 'w/' ends in a non-word character, so a trailing \b would only match
    # when a word character follows immediately ("w/friends") and would
    # miss the usual "w/ friends". Handle both spellings explicitly and
    # leave "w/o" / "w/out" (i.e. "without") untouched.
    text = re.sub(r'(?<!\w)w/(?!o(?:ut)?\b)(?=\w)', 'with ', text)
    text = re.sub(r'(?<!\w)w/(?!\w)', 'with', text)

    # Squeeze repeated punctuation but keep the sentence boundary itself.
    text = re.sub(r'[!]{2,}', '!', text)
    text = re.sub(r'[?]{2,}', '?', text)
    text = re.sub(r'[.]{3,}', '...', text)

    return text.strip()
|
| 162 |
+
|
| 163 |
+
# =======================
|
| 164 |
+
# Enhanced keyword-based scoring
|
| 165 |
+
# =======================
|
| 166 |
+
def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
|
| 167 |
+
"""Compute keyword-based score for enhanced accuracy"""
|
| 168 |
+
text_lower = text.lower()
|
| 169 |
+
|
| 170 |
+
AMBIGUOUS_TERMS = {"e", "x", "line", "ice", "horse", "420"}

# Verbs that must appear somewhere in the text before an ambiguous term is
# counted as a keyword hit. Compiled once instead of on every call.
_USAGE_CONTEXT_RE = re.compile(
    r"\b(smoke|roll|pop|hit|take|buy|sell|party|snort|inject)\b",
    re.IGNORECASE,
)


def keyword_check_with_context(text: str, kw: str) -> bool:
    """Return True if ``kw`` occurs in ``text`` as a whole token.

    The match is case-insensitive. For keywords listed in
    ``AMBIGUOUS_TERMS`` the text must additionally contain one of the
    usage verbs in ``_USAGE_CONTEXT_RE`` (anywhere in the text, not
    necessarily next to the keyword); otherwise the hit is rejected.

    Args:
        text: Text to search.
        kw: Keyword or phrase to look for.

    Returns:
        True when the keyword counts as a match, False otherwise. An empty
        ``text`` or ``kw`` never matches.
    """
    # An empty keyword would build a pattern that matches at any word
    # boundary and count as a hit on almost any text.
    if not text or not kw:
        return False

    # Lookarounds behave like \b for keywords that start and end with a
    # word character, and still work when a keyword begins or ends with
    # punctuation.
    pattern = rf"(?<!\w){re.escape(kw)}(?!\w)"
    if not re.search(pattern, text, re.IGNORECASE):
        return False

    # Compare in lowercase so a keyword stored as "Ice" is still treated as
    # ambiguous, consistent with the case-insensitive search above.
    if kw.lower() in AMBIGUOUS_TERMS:
        return bool(_USAGE_CONTEXT_RE.search(text))
    return True
|
| 180 |
+
|
| 181 |
+
def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
|
| 182 |
+
"""Compute keyword-based score for enhanced accuracy"""
|
| 183 |
+
text_lower = text.lower()
|
| 184 |
+
|
| 185 |
+
drug_matches = sum(1 for kw in DRUG_KEYWORDS if keyword_check_with_context(text_lower, kw))
|
| 186 |
+
high_risk_matches = sum(1 for kw in HIGH_RISK_KEYWORDS if keyword_check_with_context(text_lower, kw))
|
| 187 |
+
|
| 188 |
context_patterns = [
|
| 189 |
r'(?i)(pick.*up|got.*stuff|meet.*behind)',
|
| 190 |
r'(?i)(payment|crypto|cash.*deal)',
|
|
|
|
| 193 |
r'(?i)(quality.*good|pure.*stuff)',
|
| 194 |
r'(?i)(cops.*around|too.*risky)'
|
| 195 |
]
|
| 196 |
+
context_matches = sum(1 for pattern in context_patterns if re.search(pattern, text_lower))
|
| 197 |
+
|
|
|
|
|
|
|
| 198 |
keyword_score = 0.0
|
| 199 |
if high_risk_matches > 0:
|
| 200 |
keyword_score += min(high_risk_matches * 0.3, 0.8)
|
|
|
|
| 202 |
keyword_score += min(drug_matches * 0.1, 0.3)
|
| 203 |
if context_matches > 0:
|
| 204 |
keyword_score += min(context_matches * 0.15, 0.4)
|
| 205 |
+
|
| 206 |
keyword_score = min(keyword_score, 1.0)
|
| 207 |
+
|
| 208 |
return keyword_score, {
|
| 209 |
'drug_keywords': drug_matches,
|
| 210 |
'high_risk_keywords': high_risk_matches,
|
| 211 |
'context_patterns': context_matches
|
| 212 |
}
|
| 213 |
|
| 214 |
+
|
| 215 |
# =======================
|
| 216 |
# Config validation/fix with enhanced error handling
|
| 217 |
# =======================
|