lawlevisan committed on
Commit
5c7f2f8
·
verified ·
1 Parent(s): 1a1611d

Update src/predict.py

Browse files
Files changed (1) hide show
  1. src/predict.py +65 -10
src/predict.py CHANGED
@@ -123,12 +123,68 @@ def preprocess_text(text: str) -> str:
123
  def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
124
  """Compute keyword-based score for enhanced accuracy"""
125
  text_lower = text.lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
- # Count different types of keywords
128
- drug_matches = sum(1 for kw in DRUG_KEYWORDS if kw in text_lower)
129
- high_risk_matches = sum(1 for kw in HIGH_RISK_KEYWORDS if kw in text_lower)
130
 
131
- # Context patterns for better detection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  context_patterns = [
133
  r'(?i)(pick.*up|got.*stuff|meet.*behind)',
134
  r'(?i)(payment|crypto|cash.*deal)',
@@ -137,10 +193,8 @@ def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
137
  r'(?i)(quality.*good|pure.*stuff)',
138
  r'(?i)(cops.*around|too.*risky)'
139
  ]
140
-
141
- context_matches = sum(1 for pattern in context_patterns if re.search(pattern, text))
142
-
143
- # Enhanced scoring with weights
144
  keyword_score = 0.0
145
  if high_risk_matches > 0:
146
  keyword_score += min(high_risk_matches * 0.3, 0.8)
@@ -148,15 +202,16 @@ def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
148
  keyword_score += min(drug_matches * 0.1, 0.3)
149
  if context_matches > 0:
150
  keyword_score += min(context_matches * 0.15, 0.4)
151
-
152
  keyword_score = min(keyword_score, 1.0)
153
-
154
  return keyword_score, {
155
  'drug_keywords': drug_matches,
156
  'high_risk_keywords': high_risk_matches,
157
  'context_patterns': context_matches
158
  }
159
 
 
160
  # =======================
161
  # Config validation/fix with enhanced error handling
162
  # =======================
 
123
  def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
124
  """Compute keyword-based score for enhanced accuracy"""
125
  text_lower = text.lower()
126
+
127
+ # =======================
128
+ # Enhanced text preprocessing for better accuracy
129
+ # =======================
130
# Whole-word slang/abbreviation expansions applied to lowercased text.
_WORD_ABBREVIATIONS = {
    'u': 'you',
    'ur': 'your',
    'n': 'and',
    'thru': 'through',
    'gonna': 'going to',
    'wanna': 'want to',
    'gotta': 'got to',
}

# One alternation so all word abbreviations are expanded in a single pass.
_WORD_ABBREV_RE = re.compile(
    r'\b(?:' + '|'.join(map(re.escape, _WORD_ABBREVIATIONS)) + r')\b'
)

# "w/" ends in a non-word character, so a trailing \b would only match when a
# word character follows (turning "w/friends" into "withfriends" and never
# matching "w/ friends"). It is therefore handled with explicit lookarounds.
# "w/o" and "w/out" are left untouched so their meaning is not inverted.
_WITH_GLUED_RE = re.compile(r'(?<!\w)w/(?!o(?:ut)?\b)(?=\w)')
_WITH_ALONE_RE = re.compile(r'(?<!\w)w/(?!\w)')


def preprocess_text(text: str) -> str:
    """Normalize raw text before it is scored or passed to the model.

    Steps, in order: lowercase, collapse whitespace runs to one space,
    expand a fixed set of abbreviations/slang, and reduce repeated
    punctuation ("!!" -> "!", "??" -> "?", four or more dots -> "...").

    Args:
        text: Raw input text. Falsy input (``""`` or ``None``) is accepted.

    Returns:
        The normalized text with leading/trailing whitespace removed, or
        ``""`` when the input is falsy.
    """
    if not text:
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove excessive whitespace but preserve sentence structure
    text = re.sub(r'\s+', ' ', text)

    # Expand "w/" first; insert a space when it is glued to the next word.
    text = _WITH_GLUED_RE.sub('with ', text)
    text = _WITH_ALONE_RE.sub('with', text)

    # Expand the remaining whole-word abbreviations in one pass.
    text = _WORD_ABBREV_RE.sub(
        lambda match: _WORD_ABBREVIATIONS[match.group(0)], text
    )

    # Remove excessive punctuation but keep sentence boundaries
    text = re.sub(r'[!]{2,}', '!', text)
    text = re.sub(r'[?]{2,}', '?', text)
    text = re.sub(r'[.]{3,}', '...', text)

    return text.strip()
162
+
163
+ # =======================
164
+ # Enhanced keyword-based scoring
165
+ # =======================
166
+ def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
167
+ """Compute keyword-based score for enhanced accuracy"""
168
+ text_lower = text.lower()
169
+
170
# Keywords that are also everyday words or single characters; they only count
# as a match when a usage/transaction verb appears somewhere in the same text.
AMBIGUOUS_TERMS = {"e", "x", "line", "ice", "horse", "420"}

# Verbs that must accompany an ambiguous term for it to count.
_USAGE_CONTEXT_RE = re.compile(
    r"\b(smoke|roll|pop|hit|take|buy|sell|party|snort|inject)\b",
    re.IGNORECASE,
)


def keyword_check_with_context(text: str, kw: str) -> bool:
    """Return True when ``kw`` occurs in ``text`` as a standalone token.

    Matching is case-insensitive. For keywords listed in
    ``AMBIGUOUS_TERMS`` the text must additionally contain one of the
    usage verbs (smoke, roll, pop, hit, take, buy, sell, party, snort,
    inject) anywhere in it.

    Args:
        text: Text to search.
        kw: Keyword or phrase to look for.

    Returns:
        True if the keyword is present (and, for ambiguous terms, a usage
        verb is present too); False otherwise, including when either
        argument is empty.
    """
    # An empty keyword would produce a pattern that matches any text.
    if not text or not kw:
        return False

    # Lookarounds behave like \b for keywords with word characters at the
    # edges, and still work when a keyword starts or ends with punctuation.
    pattern = rf"(?<!\w){re.escape(kw)}(?!\w)"
    if not re.search(pattern, text, re.IGNORECASE):
        return False

    # Compare case-insensitively so the ambiguity check agrees with the
    # case-insensitive match above.
    if kw.lower() in AMBIGUOUS_TERMS:
        return bool(_USAGE_CONTEXT_RE.search(text))
    return True
180
+
181
+ def compute_keyword_score(text: str) -> Tuple[float, Dict[str, int]]:
182
+ """Compute keyword-based score for enhanced accuracy"""
183
+ text_lower = text.lower()
184
+
185
+ drug_matches = sum(1 for kw in DRUG_KEYWORDS if keyword_check_with_context(text_lower, kw))
186
+ high_risk_matches = sum(1 for kw in HIGH_RISK_KEYWORDS if keyword_check_with_context(text_lower, kw))
187
+
188
  context_patterns = [
189
  r'(?i)(pick.*up|got.*stuff|meet.*behind)',
190
  r'(?i)(payment|crypto|cash.*deal)',
 
193
  r'(?i)(quality.*good|pure.*stuff)',
194
  r'(?i)(cops.*around|too.*risky)'
195
  ]
196
+ context_matches = sum(1 for pattern in context_patterns if re.search(pattern, text_lower))
197
+
 
 
198
  keyword_score = 0.0
199
  if high_risk_matches > 0:
200
  keyword_score += min(high_risk_matches * 0.3, 0.8)
 
202
  keyword_score += min(drug_matches * 0.1, 0.3)
203
  if context_matches > 0:
204
  keyword_score += min(context_matches * 0.15, 0.4)
205
+
206
  keyword_score = min(keyword_score, 1.0)
207
+
208
  return keyword_score, {
209
  'drug_keywords': drug_matches,
210
  'high_risk_keywords': high_risk_matches,
211
  'context_patterns': context_matches
212
  }
213
 
214
+
215
  # =======================
216
  # Config validation/fix with enhanced error handling
217
  # =======================