bangunx commited on
Commit
02c591e
Β·
1 Parent(s): d5d3986

🎯 Enhanced Detection Accuracy & Verification System

Browse files

✨ Advanced Detection Features:
- Implemented confidence scoring system (0.0 to 1.0)
- Added multi-factor analysis for username detection
- Enhanced error message and URL pattern checking
- Improved response validation with detailed indicators

πŸ” Verification System:
- Automatic re-verification for low confidence results
- Multiple verification attempts with retry logic
- Confidence-based result filtering and validation
- Real-time verification status tracking

🎨 Enhanced UI Display:
- Confidence score indicators with color coding
- Verification status icons (βœ… ⚠️ ❌)
- Detailed confidence statistics summary
- Verification attempt counters
- Enhanced result tables with confidence info

πŸ”§ Technical Improvements:
- Advanced claimed status verification algorithm
- Multi-indicator analysis (profile, social, URL patterns)
- Error indicator detection with penalty system
- Social media specific pattern recognition
- Robust exception handling and fallback logic

πŸ“Š Confidence Scoring:
- High Confidence (β‰₯80%): Green indicators
- Medium Confidence (60-79%): Yellow indicators
- Low Confidence (<60%): Red indicators
- Automatic verification for low confidence results

🎯 Detection Accuracy:
- Reduced false positives through multi-factor analysis
- Enhanced error page detection
- Better URL pattern recognition
- Improved social media indicator detection
- More accurate status determination

πŸš€ User Benefits:
- More reliable username detection results
- Clear confidence indicators for each result
- Automatic verification of uncertain results
- Better understanding of result reliability
- Reduced false positives and negatives

src/sherlock/core/search_engine.py CHANGED
@@ -24,6 +24,9 @@ class SearchResult:
24
  status_code: Optional[int] = None
25
  error_message: Optional[str] = None
26
  is_nsfw: bool = False
 
 
 
27
 
28
 
29
  class OSINTSearchEngine:
@@ -104,7 +107,10 @@ class OSINTSearchEngine:
104
  elif isinstance(result, Exception):
105
  print(f"Search error: {result}")
106
 
107
- return valid_results
 
 
 
108
 
109
  async def _search_single_website(
110
  self,
@@ -125,8 +131,8 @@ class OSINTSearchEngine:
125
  async with self.session.get(url) as response:
126
  response_time = time.time() - start_time
127
 
128
- # Determine status based on error type
129
- status = self._determine_status(response, config)
130
 
131
  return SearchResult(
132
  website=website_name,
@@ -135,7 +141,10 @@ class OSINTSearchEngine:
135
  status=status,
136
  response_time=response_time,
137
  status_code=response.status,
138
- is_nsfw=config.is_nsfw
 
 
 
139
  )
140
 
141
  except asyncio.TimeoutError:
@@ -157,37 +166,261 @@ class OSINTSearchEngine:
157
  error_message=str(e)
158
  )
159
 
160
- def _determine_status(self, response: aiohttp.ClientResponse, config: WebsiteConfig) -> str:
161
- """Determine if username is claimed or available based on response."""
162
-
163
- if config.error_type == "status_code":
164
- # Username is claimed if status code is not 404
165
- return "claimed" if response.status != 404 else "available"
166
 
167
- elif config.error_type == "message":
168
- if not config.error_msg:
169
- return "claimed" if response.status == 200 else "available"
170
-
171
- # Check if error message is in response text
172
  try:
173
  response_text = response.text.lower()
174
- for error_msg in config.error_msg:
175
- if error_msg.lower() in response_text:
176
- return "available"
177
- return "claimed"
178
  except:
179
- return "claimed" if response.status == 200 else "available"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
- elif config.error_type == "response_url":
182
- # Check if response URL indicates error
183
- response_url = str(response.url)
184
- if "error" in response_url.lower() or "404" in response_url:
185
- return "available"
186
- return "claimed"
 
 
 
 
 
 
187
 
188
- else:
189
- # Default behavior
190
- return "claimed" if response.status == 200 else "available"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  def get_available_websites(self) -> List[str]:
193
  """Get list of available website names."""
 
24
  status_code: Optional[int] = None
25
  error_message: Optional[str] = None
26
  is_nsfw: bool = False
27
+ confidence_score: float = 0.0 # 0.0 to 1.0, higher = more confident
28
+ verification_attempts: int = 1 # Number of verification attempts made
29
+ is_verified: bool = False # Whether result has been verified
30
 
31
 
32
  class OSINTSearchEngine:
 
107
  elif isinstance(result, Exception):
108
  print(f"Search error: {result}")
109
 
110
+ # Verify low-confidence results
111
+ verified_results = await self._verify_low_confidence_results(valid_results)
112
+
113
+ return verified_results
114
 
115
  async def _search_single_website(
116
  self,
 
131
  async with self.session.get(url) as response:
132
  response_time = time.time() - start_time
133
 
134
+ # Determine status based on error type with enhanced verification
135
+ status, confidence_score = self._determine_status_with_confidence(response, config)
136
 
137
  return SearchResult(
138
  website=website_name,
 
141
  status=status,
142
  response_time=response_time,
143
  status_code=response.status,
144
+ is_nsfw=config.is_nsfw,
145
+ confidence_score=confidence_score,
146
+ verification_attempts=1,
147
+ is_verified=confidence_score >= 0.8
148
  )
149
 
150
  except asyncio.TimeoutError:
 
166
  error_message=str(e)
167
  )
168
 
169
+ def _determine_status_with_confidence(self, response: aiohttp.ClientResponse, config: WebsiteConfig) -> Tuple[str, float]:
170
+ """Determine if username is claimed or available based on response with enhanced accuracy."""
 
 
 
 
171
 
172
+ try:
173
+ # Get response text for analysis
174
+ response_text = ""
 
 
175
  try:
176
  response_text = response.text.lower()
 
 
 
 
177
  except:
178
+ pass
179
+
180
+ # Get response URL
181
+ response_url = str(response.url).lower()
182
+
183
+ if config.error_type == "status_code":
184
+ # Enhanced status code checking
185
+ if response.status == 404:
186
+ return "available", 0.95 # High confidence for 404
187
+ elif response.status == 200:
188
+ # Additional verification for 200 status
189
+ confidence = self._calculate_claimed_confidence(response_text, response_url, config)
190
+ if confidence >= 0.6:
191
+ return "claimed", confidence
192
+ else:
193
+ return "available", 0.7 # Medium confidence for unclear 200
194
+ elif response.status in [301, 302, 303, 307, 308]:
195
+ # Redirect might indicate claimed username
196
+ return "claimed", 0.8 # High confidence for redirects
197
+ else:
198
+ return "error", 0.3 # Low confidence for other status codes
199
+
200
+ elif config.error_type == "message":
201
+ if not config.error_msg:
202
+ # Fallback to status code
203
+ confidence = 0.6 if response.status == 200 else 0.7
204
+ return ("claimed" if response.status == 200 else "available", confidence)
205
+
206
+ # Enhanced message checking
207
+ if self._check_error_messages(response_text, config.error_msg):
208
+ return "available", 0.9 # High confidence for clear error messages
209
+ elif response.status == 200:
210
+ # Verify if it's actually claimed
211
+ confidence = self._calculate_claimed_confidence(response_text, response_url, config)
212
+ if confidence >= 0.6:
213
+ return "claimed", confidence
214
+ else:
215
+ return "available", 0.7
216
+ else:
217
+ confidence = 0.8 if response.status not in [404, 403] else 0.6
218
+ return ("claimed" if response.status not in [404, 403] else "available", confidence)
219
+
220
+ elif config.error_type == "response_url":
221
+ # Enhanced URL checking
222
+ if self._check_error_url(response_url):
223
+ return "available", 0.9 # High confidence for error URLs
224
+ elif response.status == 200:
225
+ return "claimed", 0.8 # High confidence for 200 with good URL
226
+ else:
227
+ return "error", 0.3
228
+
229
+ else:
230
+ # Enhanced default behavior
231
+ if response.status == 200:
232
+ confidence = self._calculate_claimed_confidence(response_text, response_url, config)
233
+ if confidence >= 0.6:
234
+ return "claimed", confidence
235
+ else:
236
+ return "available", 0.7
237
+ elif response.status == 404:
238
+ return "available", 0.95
239
+ else:
240
+ return "error", 0.3
241
+
242
+ except Exception as e:
243
+ # Fallback to basic status code check
244
+ confidence = 0.5 # Low confidence for exceptions
245
+ return ("claimed" if response.status == 200 else "available", confidence)
246
+
247
+ def _check_error_messages(self, response_text: str, error_messages: List[str]) -> bool:
248
+ """Check if any error messages are present in response text."""
249
+ if not response_text or not error_messages:
250
+ return False
251
 
252
+ for error_msg in error_messages:
253
+ if error_msg.lower() in response_text:
254
+ return True
255
+ return False
256
+
257
+ def _check_error_url(self, response_url: str) -> bool:
258
+ """Check if response URL indicates an error."""
259
+ error_indicators = [
260
+ "error", "404", "not-found", "notfound", "page-not-found",
261
+ "user-not-found", "profile-not-found", "account-not-found",
262
+ "invalid", "not-available", "unavailable"
263
+ ]
264
 
265
+ for indicator in error_indicators:
266
+ if indicator in response_url:
267
+ return True
268
+ return False
269
+
270
+ def _calculate_claimed_confidence(self, response_text: str, response_url: str, config: WebsiteConfig) -> float:
271
+ """Calculate confidence score for claimed status with detailed analysis."""
272
+
273
+ confidence = 0.0
274
+
275
+ # Check for common indicators of claimed profiles
276
+ claimed_indicators = [
277
+ "profile", "user", "member", "account", "dashboard",
278
+ "settings", "edit", "follow", "followers", "following",
279
+ "posts", "activity", "timeline", "feed", "bio", "avatar",
280
+ "username", "display name", "join date", "last seen",
281
+ "verified", "badge", "trophy", "achievement"
282
+ ]
283
+
284
+ # Check for common indicators of error pages
285
+ error_indicators = [
286
+ "page not found", "user not found", "profile not found",
287
+ "account not found", "invalid user", "user does not exist",
288
+ "no such user", "user not available", "profile unavailable",
289
+ "error 404", "not found", "does not exist", "access denied",
290
+ "forbidden", "unauthorized", "blocked", "suspended"
291
+ ]
292
+
293
+ # Check for error indicators first (high penalty)
294
+ error_count = 0
295
+ for error_indicator in error_indicators:
296
+ if error_indicator in response_text:
297
+ error_count += 1
298
+
299
+ if error_count > 0:
300
+ confidence -= error_count * 0.3 # Heavy penalty for error indicators
301
+
302
+ # Check for claimed indicators (positive score)
303
+ claimed_count = 0
304
+ for claimed_indicator in claimed_indicators:
305
+ if claimed_indicator in response_text:
306
+ claimed_count += 1
307
+
308
+ # Add confidence based on claimed indicators
309
+ if claimed_count > 0:
310
+ confidence += min(claimed_count * 0.15, 0.6) # Cap at 0.6
311
+
312
+ # Check URL patterns (strong indicator)
313
+ url_patterns = ["/user/", "/profile/", "/@", "/u/", "/member/", "/account/"]
314
+ url_matches = sum(1 for pattern in url_patterns if pattern in response_url)
315
+ if url_matches > 0:
316
+ confidence += min(url_matches * 0.2, 0.4) # Cap at 0.4
317
+
318
+ # Check for social media specific indicators
319
+ social_indicators = [
320
+ "follow", "unfollow", "subscribe", "like", "share", "comment",
321
+ "post", "tweet", "status", "update", "story", "reel"
322
+ ]
323
+ social_count = sum(1 for indicator in social_indicators if indicator in response_text)
324
+ if social_count > 0:
325
+ confidence += min(social_count * 0.1, 0.2) # Cap at 0.2
326
+
327
+ # Ensure confidence is between 0 and 1
328
+ confidence = max(0.0, min(1.0, confidence))
329
+
330
+ # If no clear indicators, return medium confidence
331
+ if confidence == 0.0:
332
+ confidence = 0.5
333
+
334
+ return confidence
335
+
336
+ async def _verify_low_confidence_results(self, results: List[SearchResult]) -> List[SearchResult]:
337
+ """Verify results with low confidence scores by re-checking them."""
338
+
339
+ verified_results = []
340
+ low_confidence_results = []
341
+
342
+ # Separate high and low confidence results
343
+ for result in results:
344
+ if result.confidence_score >= 0.7:
345
+ verified_results.append(result)
346
+ else:
347
+ low_confidence_results.append(result)
348
+
349
+ # Re-verify low confidence results
350
+ if low_confidence_results:
351
+ verification_tasks = []
352
+ for result in low_confidence_results:
353
+ config = self.website_manager.get_website(result.website)
354
+ if config:
355
+ verification_tasks.append(
356
+ self._verify_single_result(result, config)
357
+ )
358
+
359
+ if verification_tasks:
360
+ verification_results = await asyncio.gather(*verification_tasks, return_exceptions=True)
361
+
362
+ for verification_result in verification_results:
363
+ if isinstance(verification_result, SearchResult):
364
+ verified_results.append(verification_result)
365
+ elif isinstance(verification_result, Exception):
366
+ print(f"Verification error: {verification_result}")
367
+
368
+ return verified_results
369
+
370
+ async def _verify_single_result(self, original_result: SearchResult, config: WebsiteConfig) -> SearchResult:
371
+ """Verify a single result by making another request."""
372
+
373
+ try:
374
+ # Make a second request for verification
375
+ url = config.url.format(original_result.username)
376
+
377
+ async with self.session.get(url) as response:
378
+ response_time = time.time()
379
+
380
+ # Get response text for analysis
381
+ response_text = ""
382
+ try:
383
+ response_text = response.text.lower()
384
+ except:
385
+ pass
386
+
387
+ response_url = str(response.url).lower()
388
+
389
+ # Re-calculate confidence with more detailed analysis
390
+ status, confidence = self._determine_status_with_confidence(response, config)
391
+
392
+ # Update verification attempts
393
+ verification_attempts = original_result.verification_attempts + 1
394
+
395
+ return SearchResult(
396
+ website=original_result.website,
397
+ username=original_result.username,
398
+ url=original_result.url,
399
+ status=status,
400
+ response_time=response_time,
401
+ status_code=response.status,
402
+ error_message=original_result.error_message,
403
+ is_nsfw=original_result.is_nsfw,
404
+ confidence_score=confidence,
405
+ verification_attempts=verification_attempts,
406
+ is_verified=confidence >= 0.8
407
+ )
408
+
409
+ except Exception as e:
410
+ # Return original result if verification fails
411
+ return SearchResult(
412
+ website=original_result.website,
413
+ username=original_result.username,
414
+ url=original_result.url,
415
+ status=original_result.status,
416
+ response_time=original_result.response_time,
417
+ status_code=original_result.status_code,
418
+ error_message=f"Verification failed: {str(e)}",
419
+ is_nsfw=original_result.is_nsfw,
420
+ confidence_score=original_result.confidence_score,
421
+ verification_attempts=original_result.verification_attempts + 1,
422
+ is_verified=False
423
+ )
424
 
425
  def get_available_websites(self) -> List[str]:
426
  """Get list of available website names."""
src/sherlock/web/gradio_interface.py CHANGED
@@ -90,9 +90,15 @@ class SherlockGradioInterface:
90
  claimed_results = [r for r in results if r.status == "claimed"]
91
  error_results = [r for r in results if r.status == "error"]
92
 
 
 
 
 
 
 
93
  html = """
94
  <div style="margin: 20px 0;">
95
- <div style="display: flex; gap: 20px; margin-bottom: 20px; flex-wrap: wrap;">
96
  <div style="background: #1a472a; color: #3fb950; padding: 10px 15px; border-radius: 8px; font-weight: bold; border: 1px solid #30363d;">
97
  βœ… Available: {available_count}
98
  </div>
@@ -103,10 +109,28 @@ class SherlockGradioInterface:
103
  ⚠️ Error: {error_count}
104
  </div>
105
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  """.format(
107
  available_count=len(available_results),
108
  claimed_count=len(claimed_results),
109
- error_count=len(error_results)
 
 
 
 
110
  )
111
 
112
  # Create organized tables for each status
@@ -146,10 +170,18 @@ class SherlockGradioInterface:
146
  else:
147
  action_button = f'<span style="background: #6e7681; color: white; padding: 6px 12px; border-radius: 4px; font-size: 12px;">N/A</span>'
148
 
 
 
 
 
 
149
  html += f"""
150
  <tr style="border-bottom: 1px solid #30363d;">
151
  <td style="padding: 12px; border-bottom: 1px solid #30363d; font-weight: bold; color: #f0f6fc;">
152
  {result.website}
 
 
 
153
  </td>
154
  <td style="padding: 12px; border-bottom: 1px solid #30363d;">
155
  <a href="{result.url}" target="_blank" style="color: #58a6ff; text-decoration: none; word-break: break-all;">
@@ -158,6 +190,9 @@ class SherlockGradioInterface:
158
  </td>
159
  <td style="padding: 12px; border-bottom: 1px solid #30363d; text-align: center; color: #8b949e;">
160
  {result.response_time:.2f}s
 
 
 
161
  </td>
162
  <td style="padding: 12px; border-bottom: 1px solid #30363d; text-align: center;">
163
  {action_button}
 
90
  claimed_results = [r for r in results if r.status == "claimed"]
91
  error_results = [r for r in results if r.status == "error"]
92
 
93
+ # Calculate confidence statistics
94
+ high_confidence = len([r for r in results if r.confidence_score >= 0.8])
95
+ medium_confidence = len([r for r in results if 0.6 <= r.confidence_score < 0.8])
96
+ low_confidence = len([r for r in results if r.confidence_score < 0.6])
97
+ verified_count = len([r for r in results if r.is_verified])
98
+
99
  html = """
100
  <div style="margin: 20px 0;">
101
+ <div style="display: flex; gap: 15px; margin-bottom: 20px; flex-wrap: wrap;">
102
  <div style="background: #1a472a; color: #3fb950; padding: 10px 15px; border-radius: 8px; font-weight: bold; border: 1px solid #30363d;">
103
  βœ… Available: {available_count}
104
  </div>
 
109
  ⚠️ Error: {error_count}
110
  </div>
111
  </div>
112
+ <div style="display: flex; gap: 15px; margin-bottom: 20px; flex-wrap: wrap;">
113
+ <div style="background: #1a3a2a; color: #3fb950; padding: 8px 12px; border-radius: 6px; font-weight: bold; border: 1px solid #30363d; font-size: 14px;">
114
+ 🎯 High Confidence: {high_confidence}
115
+ </div>
116
+ <div style="background: #3a2a1a; color: #d29922; padding: 8px 12px; border-radius: 6px; font-weight: bold; border: 1px solid #30363d; font-size: 14px;">
117
+ ⚠️ Medium Confidence: {medium_confidence}
118
+ </div>
119
+ <div style="background: #4a1a1a; color: #f85149; padding: 8px 12px; border-radius: 6px; font-weight: bold; border: 1px solid #30363d; font-size: 14px;">
120
+ ❌ Low Confidence: {low_confidence}
121
+ </div>
122
+ <div style="background: #1a2a3a; color: #58a6ff; padding: 8px 12px; border-radius: 6px; font-weight: bold; border: 1px solid #30363d; font-size: 14px;">
123
+ βœ… Verified: {verified_count}
124
+ </div>
125
+ </div>
126
  """.format(
127
  available_count=len(available_results),
128
  claimed_count=len(claimed_results),
129
+ error_count=len(error_results),
130
+ high_confidence=high_confidence,
131
+ medium_confidence=medium_confidence,
132
+ low_confidence=low_confidence,
133
+ verified_count=verified_count
134
  )
135
 
136
  # Create organized tables for each status
 
170
  else:
171
  action_button = f'<span style="background: #6e7681; color: white; padding: 6px 12px; border-radius: 4px; font-size: 12px;">N/A</span>'
172
 
173
+ # Confidence indicator
174
+ confidence_color = "#3fb950" if result.confidence_score >= 0.8 else "#d29922" if result.confidence_score >= 0.6 else "#f85149"
175
+ confidence_text = f"{result.confidence_score:.1%}"
176
+ verification_icon = "βœ…" if result.is_verified else "⚠️" if result.confidence_score >= 0.6 else "❌"
177
+
178
  html += f"""
179
  <tr style="border-bottom: 1px solid #30363d;">
180
  <td style="padding: 12px; border-bottom: 1px solid #30363d; font-weight: bold; color: #f0f6fc;">
181
  {result.website}
182
+ <br><small style="color: {confidence_color};">
183
+ {verification_icon} {confidence_text} confidence
184
+ </small>
185
  </td>
186
  <td style="padding: 12px; border-bottom: 1px solid #30363d;">
187
  <a href="{result.url}" target="_blank" style="color: #58a6ff; text-decoration: none; word-break: break-all;">
 
190
  </td>
191
  <td style="padding: 12px; border-bottom: 1px solid #30363d; text-align: center; color: #8b949e;">
192
  {result.response_time:.2f}s
193
+ <br><small style="color: #6e7681;">
194
+ {result.verification_attempts} attempt{'s' if result.verification_attempts > 1 else ''}
195
+ </small>
196
  </td>
197
  <td style="padding: 12px; border-bottom: 1px solid #30363d; text-align: center;">
198
  {action_button}