rottg committed on
Commit
9fa91af
verified
1 Parent(s): c4decf6

Update code

Browse files
Files changed (3) hide show
  1. requirements.txt +2 -0
  2. stylometry.py +509 -217
  3. templates/maintenance.html +130 -15
requirements.txt CHANGED
@@ -6,3 +6,5 @@ huggingface_hub>=0.20
6
  rank_bm25>=0.2.2
7
  google-genai>=1.0.0
8
  sentence-transformers>=2.2.0
 
 
 
6
  rank_bm25>=0.2.2
7
  google-genai>=1.0.0
8
  sentence-transformers>=2.2.0
9
+ scikit-learn>=1.3.0
10
+ numpy>=1.24.0
stylometry.py CHANGED
@@ -1,17 +1,28 @@
1
  """
2
- Stylometry Analysis Module for Hebrew Text
3
  Detects potential duplicate accounts based on writing style patterns.
 
 
 
 
 
4
  """
5
 
6
  import re
7
  import sqlite3
8
  import math
 
 
9
  from collections import Counter, defaultdict
10
  from datetime import datetime, timedelta
11
- from typing import Dict, List, Tuple, Optional
12
- import json
 
 
 
 
13
 
14
- # Hebrew character range
15
  HEBREW_PATTERN = re.compile(r'[\u0590-\u05FF]')
16
  ENGLISH_PATTERN = re.compile(r'[a-zA-Z]')
17
  EMOJI_PATTERN = re.compile(
@@ -26,55 +37,108 @@ EMOJI_PATTERN = re.compile(
26
  flags=re.UNICODE
27
  )
28
 
29
- # Common Hebrew slang and expressions
30
- HEBREW_SLANG = ['讗讞诇讛', '住讘讘讛', '讬讗诇诇讛', '讜讜讗诇讛', '讘讗住讛', '讞讘诇', '诪讙谞讬讘', '讗砖讻专讛', '讞讞讞讞', '讞讞讞', '讛讛讛讛', '诪诪诪诪']
31
- HEBREW_ACRONYMS = ['讝讛砖', '讗讻讗', '诇讜诇', '讘讟讞', '', '转谞爪讘讛', '讝讗转']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
 
 
33
 
34
- class StyleFeatures:
35
- """Features extracted from a user's messages."""
 
36
 
37
  def __init__(self, user_id: int, user_name: str):
38
  self.user_id = user_id
39
  self.user_name = user_name
40
  self.message_count = 0
41
 
42
- # Length features
43
  self.avg_message_length = 0.0
44
- self.avg_word_length = 0.0
45
  self.std_message_length = 0.0
 
 
46
 
47
- # Character ratios
48
  self.hebrew_ratio = 0.0
49
  self.english_ratio = 0.0
50
  self.digit_ratio = 0.0
51
  self.emoji_ratio = 0.0
 
52
 
53
- # Punctuation patterns
54
  self.comma_rate = 0.0
55
  self.period_rate = 0.0
56
  self.question_rate = 0.0
57
  self.exclamation_rate = 0.0
58
- self.ellipsis_rate = 0.0 # ...
 
59
 
60
- # Special patterns
61
- self.caps_ratio = 0.0
62
- self.repeated_chars_rate = 0.0 # 讻谉谉谉谉谉
63
  self.slang_rate = 0.0
 
 
 
64
 
65
- # Time patterns (24 hours distribution)
66
- self.hour_distribution = [0.0] * 24
 
 
 
 
 
 
 
 
67
  self.weekend_ratio = 0.0
 
68
 
69
- # Word patterns
70
- self.unique_word_ratio = 0.0
71
- self.short_message_ratio = 0.0 # < 5 words
72
 
73
- # Top character bigrams (normalized)
74
  self.char_bigrams: Dict[str, float] = {}
 
 
75
 
76
- # Feature vector for similarity calculation
77
- self.feature_vector: List[float] = []
 
 
 
 
 
 
78
 
79
  def to_dict(self) -> dict:
80
  return {
@@ -85,23 +149,54 @@ class StyleFeatures:
85
  'avg_word_length': round(self.avg_word_length, 2),
86
  'hebrew_ratio': round(self.hebrew_ratio, 3),
87
  'english_ratio': round(self.english_ratio, 3),
88
- 'emoji_ratio': round(self.emoji_ratio, 3),
 
 
89
  'question_rate': round(self.question_rate, 3),
90
  'exclamation_rate': round(self.exclamation_rate, 3),
91
- 'ellipsis_rate': round(self.ellipsis_rate, 3),
92
  'repeated_chars_rate': round(self.repeated_chars_rate, 3),
93
  'weekend_ratio': round(self.weekend_ratio, 3),
 
94
  'unique_word_ratio': round(self.unique_word_ratio, 3),
95
  }
96
 
97
 
98
- class StylometryAnalyzer:
99
- """Analyzes writing styles to detect potential duplicate accounts."""
 
 
 
 
 
100
 
101
  def __init__(self, db_path: str = 'telegram_data.db'):
102
  self.db_path = db_path
103
- self.user_features: Dict[int, StyleFeatures] = {}
104
- self.similarity_threshold = 0.85 # Adjustable threshold
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  def get_active_users(self, min_messages: int = 300, days: int = 365) -> List[Tuple[int, str, int]]:
107
  """Get users active in the last N days with at least min_messages."""
@@ -147,225 +242,333 @@ class StylometryAnalyzer:
147
 
148
  return messages
149
 
150
- def extract_features(self, user_id: int, user_name: str, messages: List[Tuple[str, str]]) -> StyleFeatures:
151
- """Extract stylometric features from user messages."""
152
- features = StyleFeatures(user_id, user_name)
 
153
  features.message_count = len(messages)
154
 
155
  if not messages:
156
  return features
157
 
158
- # Collect statistics
159
- message_lengths = []
160
- word_lengths = []
 
 
 
 
 
 
161
  all_words = []
162
- unique_words = set()
163
- short_messages = 0
164
-
165
- hebrew_chars = 0
166
- english_chars = 0
167
- digit_chars = 0
168
- total_chars = 0
169
- caps_chars = 0
170
-
171
- commas = 0
172
- periods = 0
173
- questions = 0
174
- exclamations = 0
175
- ellipsis = 0
176
-
177
- repeated_char_msgs = 0
178
- slang_count = 0
179
- emoji_count = 0
180
-
181
- hour_counts = [0] * 24
182
- weekend_msgs = 0
183
 
184
- char_bigram_counter = Counter()
 
 
 
185
 
186
- for text, date_str in messages:
187
- if not text:
188
- continue
 
 
 
 
 
189
 
190
- # Message length
191
- msg_len = len(text)
192
- message_lengths.append(msg_len)
193
- total_chars += msg_len
 
194
 
195
- # Word analysis
196
- words = text.split()
197
- if len(words) < 5:
198
- short_messages += 1
199
- for word in words:
200
- word_lengths.append(len(word))
201
- all_words.append(word.lower())
202
- unique_words.add(word.lower())
203
-
204
- # Character analysis
205
- hebrew_chars += len(HEBREW_PATTERN.findall(text))
206
- english_chars += len(ENGLISH_PATTERN.findall(text))
207
- digit_chars += sum(1 for c in text if c.isdigit())
208
- caps_chars += sum(1 for c in text if c.isupper())
209
-
210
- # Emoji analysis
211
- emojis = EMOJI_PATTERN.findall(text)
212
- emoji_count += len(emojis)
213
-
214
- # Punctuation
215
- commas += text.count(',')
216
- periods += text.count('.')
217
- questions += text.count('?')
218
- exclamations += text.count('!')
219
- ellipsis += text.count('...')
220
-
221
- # Repeated characters pattern (like 讻谉谉谉谉谉 or 讗讛讛讛讛讛)
222
- if re.search(r'(.)\1{3,}', text):
223
- repeated_char_msgs += 1
224
-
225
- # Slang detection
226
- text_lower = text.lower()
227
- for slang in HEBREW_SLANG:
228
- if slang in text:
229
- slang_count += 1
230
- break
231
-
232
- # Time analysis
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  try:
234
  if 'T' in date_str:
235
  dt = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
236
  else:
237
  dt = datetime.strptime(date_str[:19], '%Y-%m-%d %H:%M:%S')
 
238
  hour_counts[dt.hour] += 1
 
 
 
 
239
  if dt.weekday() >= 5: # Saturday=5, Sunday=6
240
  weekend_msgs += 1
241
  except:
242
  pass
243
 
244
- # Character bigrams
245
- clean_text = re.sub(r'\s+', ' ', text.lower())
246
- for i in range(len(clean_text) - 1):
247
- bigram = clean_text[i:i+2]
248
- if bigram.strip():
249
- char_bigram_counter[bigram] += 1
250
-
251
- n_msgs = len(messages)
252
-
253
- # Calculate averages
254
- if message_lengths:
255
- features.avg_message_length = sum(message_lengths) / len(message_lengths)
256
- variance = sum((x - features.avg_message_length) ** 2 for x in message_lengths) / len(message_lengths)
257
- features.std_message_length = math.sqrt(variance)
258
 
259
- if word_lengths:
260
- features.avg_word_length = sum(word_lengths) / len(word_lengths)
261
-
262
- # Character ratios
263
- if total_chars > 0:
264
- features.hebrew_ratio = hebrew_chars / total_chars
265
- features.english_ratio = english_chars / total_chars
266
- features.digit_ratio = digit_chars / total_chars
267
- features.emoji_ratio = emoji_count / total_chars
268
- features.caps_ratio = caps_chars / max(1, english_chars)
269
 
270
- # Punctuation rates (per message)
271
- features.comma_rate = commas / n_msgs
272
- features.period_rate = periods / n_msgs
273
- features.question_rate = questions / n_msgs
274
- features.exclamation_rate = exclamations / n_msgs
275
- features.ellipsis_rate = ellipsis / n_msgs
276
-
277
- # Special patterns
278
- features.repeated_chars_rate = repeated_char_msgs / n_msgs
279
- features.slang_rate = slang_count / n_msgs
280
-
281
- # Time patterns
282
- total_hour_msgs = sum(hour_counts)
283
- if total_hour_msgs > 0:
284
- features.hour_distribution = [h / total_hour_msgs for h in hour_counts]
285
- features.weekend_ratio = weekend_msgs / n_msgs
286
-
287
- # Word patterns
288
- if all_words:
289
- features.unique_word_ratio = len(unique_words) / len(all_words)
290
- features.short_message_ratio = short_messages / n_msgs
291
 
292
- # Top character bigrams (normalized)
293
  total_bigrams = sum(char_bigram_counter.values())
294
  if total_bigrams > 0:
295
- top_bigrams = char_bigram_counter.most_common(50)
296
- features.char_bigrams = {bg: count / total_bigrams for bg, count in top_bigrams}
297
 
298
- # Build feature vector for similarity calculation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  features.feature_vector = self._build_feature_vector(features)
300
 
301
  return features
302
 
303
- def _build_feature_vector(self, f: StyleFeatures) -> List[float]:
304
  """Build normalized feature vector for similarity comparison."""
305
  vector = [
306
- f.avg_message_length / 100, # Normalize to ~1
 
 
307
  f.avg_word_length / 10,
 
 
 
308
  f.hebrew_ratio,
309
  f.english_ratio,
310
- f.emoji_ratio * 10, # Scale up small values
 
 
 
 
 
 
311
  f.question_rate,
312
  f.exclamation_rate,
313
  f.ellipsis_rate * 5,
314
- f.repeated_chars_rate * 10,
315
- f.weekend_ratio,
 
 
 
 
 
 
 
 
316
  f.unique_word_ratio,
 
317
  f.short_message_ratio,
318
- f.caps_ratio,
319
- f.slang_rate,
320
- f.comma_rate,
321
- f.period_rate,
 
322
  ]
323
 
324
  # Add hour distribution (24 values)
325
- vector.extend(f.hour_distribution)
326
-
327
- return vector
328
 
329
- def calculate_similarity(self, f1: StyleFeatures, f2: StyleFeatures) -> float:
330
- """Calculate cosine similarity between two feature vectors."""
331
- v1 = f1.feature_vector
332
- v2 = f2.feature_vector
333
 
334
- if not v1 or not v2 or len(v1) != len(v2):
335
- return 0.0
 
336
 
337
- # Cosine similarity
338
- dot_product = sum(a * b for a, b in zip(v1, v2))
339
- norm1 = math.sqrt(sum(a * a for a in v1))
340
- norm2 = math.sqrt(sum(b * b for b in v2))
341
 
342
- if norm1 == 0 or norm2 == 0:
343
- return 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
- cosine_sim = dot_product / (norm1 * norm2)
 
 
 
 
346
 
347
- # Also compare character bigrams (Jaccard-like)
348
- bigram_sim = self._compare_bigrams(f1.char_bigrams, f2.char_bigrams)
 
 
349
 
350
- # Weighted combination
351
- return 0.7 * cosine_sim + 0.3 * bigram_sim
352
 
353
- def _compare_bigrams(self, bg1: Dict[str, float], bg2: Dict[str, float]) -> float:
354
- """Compare character bigram distributions."""
355
- if not bg1 or not bg2:
356
  return 0.0
357
 
358
- all_bigrams = set(bg1.keys()) | set(bg2.keys())
359
- if not all_bigrams:
360
  return 0.0
361
 
362
- # Calculate similarity based on shared bigrams
363
  intersection = 0.0
364
  union = 0.0
365
 
366
- for bg in all_bigrams:
367
- v1 = bg1.get(bg, 0)
368
- v2 = bg2.get(bg, 0)
369
  intersection += min(v1, v2)
370
  union += max(v1, v2)
371
 
@@ -374,8 +577,59 @@ class StylometryAnalyzer:
374
 
375
  return intersection / union
376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  def analyze_all_users(self, min_messages: int = 300, days: int = 365,
378
- progress_callback=None) -> Dict:
379
  """Analyze all active users and find potential duplicates."""
380
 
381
  # Get active users
@@ -393,7 +647,7 @@ class StylometryAnalyzer:
393
  self.user_features[user_id] = features
394
 
395
  if progress_callback:
396
- progress_callback('user_processed', idx + 1, total_users, user_name)
397
 
398
  # Find similar pairs
399
  if progress_callback:
@@ -409,14 +663,16 @@ class StylometryAnalyzer:
409
  uid1, uid2 = user_ids[i], user_ids[j]
410
  f1, f2 = self.user_features[uid1], self.user_features[uid2]
411
 
412
- similarity = self.calculate_similarity(f1, f2)
413
 
414
  if similarity >= self.similarity_threshold:
415
  similar_pairs.append({
416
  'user1': f1.to_dict(),
417
  'user2': f2.to_dict(),
418
  'similarity': round(similarity * 100, 1),
419
- 'details': self._get_similarity_details(f1, f2)
 
 
420
  })
421
 
422
  comparison_count += 1
@@ -426,62 +682,98 @@ class StylometryAnalyzer:
426
  # Sort by similarity (highest first)
427
  similar_pairs.sort(key=lambda x: x['similarity'], reverse=True)
428
 
 
 
 
 
 
 
 
 
 
 
429
  return {
430
  'total_users_analyzed': total_users,
431
  'threshold': self.similarity_threshold * 100,
432
  'potential_duplicates': len(similar_pairs),
433
  'pairs': similar_pairs,
434
- 'all_users': [f.to_dict() for f in self.user_features.values()]
 
 
435
  }
436
 
437
- def _get_similarity_details(self, f1: StyleFeatures, f2: StyleFeatures) -> List[str]:
438
- """Get human-readable similarity details."""
 
439
  details = []
440
 
441
- # Message length similarity
 
 
 
 
442
  len_diff = abs(f1.avg_message_length - f2.avg_message_length)
443
- if len_diff < 10:
444
  details.append(f"讗讜专讱 讛讜讚注讛 讚讜诪讛 ({f1.avg_message_length:.0f} vs {f2.avg_message_length:.0f})")
445
 
446
  # Hebrew/English ratio
447
  heb_diff = abs(f1.hebrew_ratio - f2.hebrew_ratio)
448
  if heb_diff < 0.1:
449
- details.append(f"讬讞住 注讘专讬转/讗谞讙诇讬转 讚讜诪讛 ({f1.hebrew_ratio:.0%} vs {f2.hebrew_ratio:.0%})")
450
 
451
  # Emoji usage
452
  emoji_diff = abs(f1.emoji_ratio - f2.emoji_ratio)
453
- if emoji_diff < 0.01:
454
  details.append("砖讬诪讜砖 讚讜诪讛 讘讗讬诪讜讙'讬")
455
 
456
- # Question marks
457
- q_diff = abs(f1.question_rate - f2.question_rate)
458
- if q_diff < 0.1:
459
- details.append("砖讬诪讜砖 讚讜诪讛 讘住讬诪谞讬 砖讗诇讛")
 
 
 
460
 
461
- # Weekend activity
462
- weekend_diff = abs(f1.weekend_ratio - f2.weekend_ratio)
463
- if weekend_diff < 0.1:
464
- details.append("驻注 讚讜诪讛 讘住讜驻\"砖")
465
 
466
  # Repeated characters
467
  if abs(f1.repeated_chars_rate - f2.repeated_chars_rate) < 0.05:
468
  if f1.repeated_chars_rate > 0.1:
469
- details.append("砖谞讬讛诐 诪砖转诪砖讬诐 讘转讜讜讬诐 讞讜讝专讬诐 (讻诪讜 讻谉谉谉谉谉)")
470
 
471
  # Time patterns
472
- hour_sim = sum(min(h1, h2) for h1, h2 in zip(f1.hour_distribution, f2.hour_distribution))
473
- if hour_sim > 0.7:
474
- details.append("讚驻讜住 砖注讜转 驻注讬诇讜转 讚讜诪讛")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
475
 
476
  return details
477
 
478
 
479
  # Singleton instance
480
- _analyzer_instance: Optional[StylometryAnalyzer] = None
481
 
482
- def get_stylometry_analyzer() -> StylometryAnalyzer:
483
  """Get or create the stylometry analyzer singleton."""
484
  global _analyzer_instance
485
  if _analyzer_instance is None:
486
- _analyzer_instance = StylometryAnalyzer()
487
  return _analyzer_instance
 
1
  """
2
+ Advanced Stylometry Analysis Module for Hebrew Text
3
  Detects potential duplicate accounts based on writing style patterns.
4
+
5
+ Uses:
6
+ - sentence-transformers for Hebrew embeddings (writing style fingerprint)
7
+ - scikit-learn for DBSCAN clustering + TF-IDF on function words
8
+ - Hebrew-specific linguistic features (gender, formality, slang)
9
  """
10
 
11
  import re
12
  import sqlite3
13
  import math
14
+ import pickle
15
+ import os
16
  from collections import Counter, defaultdict
17
  from datetime import datetime, timedelta
18
+ from typing import Dict, List, Tuple, Optional, Set
19
+ import numpy as np
20
+
21
+ # ==========================================
22
+ # HEBREW LINGUISTIC PATTERNS
23
+ # ==========================================
24
 
25
+ # Hebrew character ranges
26
  HEBREW_PATTERN = re.compile(r'[\u0590-\u05FF]')
27
  ENGLISH_PATTERN = re.compile(r'[a-zA-Z]')
28
  EMOJI_PATTERN = re.compile(
 
37
  flags=re.UNICODE
38
  )
39
 
40
+ # Hebrew function words (high frequency, style indicators)
41
+ HEBREW_FUNCTION_WORDS = [
42
+ '砖诇', '讗转', '注', '注诐', '诇', '诪谉', '讬谉', '诇', '讗讞专讬', '转讞转',
43
+ '讗谞讬', '讗转讛', '讗转', '讛讜讗', '讛讬讗', '讗谞讞谞讜', '讗转诐', '讗转谉', '讛诐', '讛谉',
44
+ '讝讛', '讝讗转', '讝讜', '讗诇讛', '讗诇讜',
45
+ '讻讬', '讗诐', '讗讜', '讙诐', '专拽', '讗讘诇', '讗诇讗', '诇诪专讜转', '讘讙诇诇', '讻讚讬',
46
+ '诪讛', '诪讬', '讗讬驻讛', '诪转讬', '诇诪讛', '讗讬讱', '讻诪讛',
47
+ '讻诇', '讛专讘讛', '拽爪转', '诪讗讜讚', '讬讜转专', '驻讞讜转', '讻诪讜',
48
+ '诇讗', '讻谉', '讗讬谉', '讬砖', '讛讬讛', '诇讛讬讜转', '注讜讚', '讻讘专',
49
+ ]
50
+
51
+ # Formal vs informal markers
52
+ FORMAL_MARKERS = ['讗谞讜讻讬', '讛谞谞讬', '注诇讬讻诐', '讘讘拽砖讛', '转讜讚讛 专讘讛', '讘讻讘讜讚 专讘', '诇讻讘讜讚']
53
+ INFORMAL_MARKERS = ['讗讞讬', '讙讘专', '讗讞诇讛', '住讘讘讛', '讬讗诇诇讛', '讜讜讗诇讛', '讘讗住讛', '讞讞讞讞', '讞讞讞', '诇讜诇', 'wtf', 'omg']
54
+
55
+ # Hebrew slang and expressions
56
+ HEBREW_SLANG = [
57
+ '讗讞诇讛', '住讘讘讛', '讬讗诇诇讛', '讜讜讗诇讛', '讘讗住讛', '讞讘诇', '诪讙谞讬讘', '讗砖讻专讛',
58
+ '讞讞讞讞', '讞讞讞', '讛讛讛讛', '诪诪诪诪', '讗讛讛讛', '谞讜', '讟讜讘', '讘住讚专',
59
+ '驻讬爪讜抓', '诪砖讛讜', '讻讗讬诇讜', '住转诐', '诪诪砖', '驻砖讜讟', '谞讜专讗', '诪诇讗',
60
+ ]
61
+
62
+ # Hebrew acronyms
63
+ HEBREW_ACRONYMS = ['讘注讝讛砖', '讗讻讗', '谞诇注谞讚', '转谞爪讘讛', '讝爪诇', '讘住"讚', '讘注"讛', '讗讬"讛', '讘诇"谞']
64
+
65
+ # Gender markers in verbs (past tense patterns)
66
+ MALE_VERB_ENDINGS = ['转讬', '转', '谞讜', '转诐'] # 讛诇讻转讬, 讛诇讻转, 讛诇讻谞讜
67
+ FEMALE_VERB_ENDINGS = ['转讬', '转', '谞讜', '转谉'] # 讛诇讻转讬, 讛诇讻转 (female), 讛诇讻谞讜
68
+
69
+ # Repeated character pattern (emotional expression)
70
+ REPEATED_CHARS_PATTERN = re.compile(r'(.)\1{2,}')
71
 
72
+ # Word with numbers pattern (l33t speak)
73
+ LEET_PATTERN = re.compile(r'\b\w*\d+\w*\b')
74
 
75
+
76
+ class AdvancedStyleFeatures:
77
+ """Enhanced features extracted from a user's messages."""
78
 
79
  def __init__(self, user_id: int, user_name: str):
80
  self.user_id = user_id
81
  self.user_name = user_name
82
  self.message_count = 0
83
 
84
+ # === Basic Statistics ===
85
  self.avg_message_length = 0.0
 
86
  self.std_message_length = 0.0
87
+ self.avg_word_length = 0.0
88
+ self.avg_words_per_message = 0.0
89
 
90
+ # === Character Ratios ===
91
  self.hebrew_ratio = 0.0
92
  self.english_ratio = 0.0
93
  self.digit_ratio = 0.0
94
  self.emoji_ratio = 0.0
95
+ self.punctuation_ratio = 0.0
96
 
97
+ # === Punctuation Patterns ===
98
  self.comma_rate = 0.0
99
  self.period_rate = 0.0
100
  self.question_rate = 0.0
101
  self.exclamation_rate = 0.0
102
+ self.ellipsis_rate = 0.0
103
+ self.quote_rate = 0.0
104
 
105
+ # === Hebrew-Specific Features ===
106
+ self.formality_score = 0.0 # -1 (informal) to +1 (formal)
 
107
  self.slang_rate = 0.0
108
+ self.acronym_rate = 0.0
109
+ self.repeated_chars_rate = 0.0
110
+ self.leet_speak_rate = 0.0
111
 
112
+ # === Linguistic Patterns ===
113
+ self.function_word_freq: Dict[str, float] = {}
114
+ self.unique_word_ratio = 0.0
115
+ self.hapax_ratio = 0.0 # Words used only once
116
+ self.short_message_ratio = 0.0
117
+ self.long_message_ratio = 0.0
118
+
119
+ # === Time Patterns ===
120
+ self.hour_distribution = np.zeros(24)
121
+ self.weekday_distribution = np.zeros(7)
122
  self.weekend_ratio = 0.0
123
+ self.night_owl_ratio = 0.0 # Messages between 00:00-06:00
124
 
125
+ # === Response Patterns ===
126
+ self.reply_rate = 0.0
127
+ self.avg_response_words = 0.0
128
 
129
+ # === N-gram Features ===
130
  self.char_bigrams: Dict[str, float] = {}
131
+ self.char_trigrams: Dict[str, float] = {}
132
+ self.word_bigrams: Dict[str, float] = {}
133
 
134
+ # === Embedding (from sentence-transformers) ===
135
+ self.style_embedding: Optional[np.ndarray] = None
136
+
137
+ # === TF-IDF Vector ===
138
+ self.tfidf_vector: Optional[np.ndarray] = None
139
+
140
+ # === Combined Feature Vector ===
141
+ self.feature_vector: Optional[np.ndarray] = None
142
 
143
  def to_dict(self) -> dict:
144
  return {
 
149
  'avg_word_length': round(self.avg_word_length, 2),
150
  'hebrew_ratio': round(self.hebrew_ratio, 3),
151
  'english_ratio': round(self.english_ratio, 3),
152
+ 'emoji_ratio': round(self.emoji_ratio, 4),
153
+ 'formality_score': round(self.formality_score, 2),
154
+ 'slang_rate': round(self.slang_rate, 3),
155
  'question_rate': round(self.question_rate, 3),
156
  'exclamation_rate': round(self.exclamation_rate, 3),
 
157
  'repeated_chars_rate': round(self.repeated_chars_rate, 3),
158
  'weekend_ratio': round(self.weekend_ratio, 3),
159
+ 'night_owl_ratio': round(self.night_owl_ratio, 3),
160
  'unique_word_ratio': round(self.unique_word_ratio, 3),
161
  }
162
 
163
 
164
+ class AdvancedStylometryAnalyzer:
165
+ """
166
+ ML-powered stylometry analyzer using:
167
+ - sentence-transformers for Hebrew writing style embeddings
168
+ - scikit-learn for TF-IDF and DBSCAN clustering
169
+ - Hebrew linguistic feature extraction
170
+ """
171
 
172
  def __init__(self, db_path: str = 'telegram_data.db'):
173
  self.db_path = db_path
174
+ self.user_features: Dict[int, AdvancedStyleFeatures] = {}
175
+ self.similarity_threshold = 0.85
176
+
177
+ # ML components (lazy loaded)
178
+ self._embedding_model = None
179
+ self._tfidf_vectorizer = None
180
+ self._scaler = None
181
+
182
+ # Cache directory
183
+ self.cache_dir = os.path.dirname(os.path.abspath(__file__))
184
+
185
+ @property
186
+ def embedding_model(self):
187
+ """Lazy load sentence-transformers model."""
188
+ if self._embedding_model is None:
189
+ try:
190
+ from sentence_transformers import SentenceTransformer
191
+ # Use multilingual model that supports Hebrew well
192
+ # Alternative: 'imvladikon/sentence-transformers-alephbert' for pure Hebrew
193
+ print("Loading Hebrew embedding model...")
194
+ self._embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
195
+ print("Embedding model loaded.")
196
+ except Exception as e:
197
+ print(f"Could not load embedding model: {e}")
198
+ self._embedding_model = False # Mark as failed
199
+ return self._embedding_model if self._embedding_model else None
200
 
201
  def get_active_users(self, min_messages: int = 300, days: int = 365) -> List[Tuple[int, str, int]]:
202
  """Get users active in the last N days with at least min_messages."""
 
242
 
243
  return messages
244
 
245
+ def extract_features(self, user_id: int, user_name: str,
246
+ messages: List[Tuple[str, str]]) -> AdvancedStyleFeatures:
247
+ """Extract comprehensive stylometric features from user messages."""
248
+ features = AdvancedStyleFeatures(user_id, user_name)
249
  features.message_count = len(messages)
250
 
251
  if not messages:
252
  return features
253
 
254
+ # Collect all text for analysis
255
+ all_texts = [msg[0] for msg in messages if msg[0]]
256
+ all_text_combined = ' '.join(all_texts)
257
+
258
+ # === Basic Statistics ===
259
+ message_lengths = [len(text) for text in all_texts]
260
+ features.avg_message_length = np.mean(message_lengths)
261
+ features.std_message_length = np.std(message_lengths)
262
+
263
  all_words = []
264
+ word_counts_per_msg = []
265
+ for text in all_texts:
266
+ words = text.split()
267
+ all_words.extend(words)
268
+ word_counts_per_msg.append(len(words))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
+ if all_words:
271
+ word_lengths = [len(w) for w in all_words]
272
+ features.avg_word_length = np.mean(word_lengths)
273
+ features.avg_words_per_message = np.mean(word_counts_per_msg)
274
 
275
+ # === Character Ratios ===
276
+ total_chars = len(all_text_combined)
277
+ if total_chars > 0:
278
+ hebrew_chars = len(HEBREW_PATTERN.findall(all_text_combined))
279
+ english_chars = len(ENGLISH_PATTERN.findall(all_text_combined))
280
+ digit_chars = sum(1 for c in all_text_combined if c.isdigit())
281
+ punct_chars = sum(1 for c in all_text_combined if c in '.,!?;:()[]{}')
282
+ emoji_count = len(EMOJI_PATTERN.findall(all_text_combined))
283
 
284
+ features.hebrew_ratio = hebrew_chars / total_chars
285
+ features.english_ratio = english_chars / total_chars
286
+ features.digit_ratio = digit_chars / total_chars
287
+ features.punctuation_ratio = punct_chars / total_chars
288
+ features.emoji_ratio = emoji_count / total_chars
289
 
290
+ # === Punctuation Patterns ===
291
+ n_msgs = len(messages)
292
+ features.comma_rate = all_text_combined.count(',') / n_msgs
293
+ features.period_rate = all_text_combined.count('.') / n_msgs
294
+ features.question_rate = all_text_combined.count('?') / n_msgs
295
+ features.exclamation_rate = all_text_combined.count('!') / n_msgs
296
+ features.ellipsis_rate = all_text_combined.count('...') / n_msgs
297
+ features.quote_rate = (all_text_combined.count('"') + all_text_combined.count("'")) / n_msgs
298
+
299
+ # === Hebrew-Specific Features ===
300
+ text_lower = all_text_combined.lower()
301
+
302
+ # Formality score
303
+ formal_count = sum(1 for marker in FORMAL_MARKERS if marker in all_text_combined)
304
+ informal_count = sum(1 for marker in INFORMAL_MARKERS if marker in text_lower)
305
+ total_markers = formal_count + informal_count
306
+ if total_markers > 0:
307
+ features.formality_score = (formal_count - informal_count) / total_markers
308
+
309
+ # Slang rate
310
+ slang_count = sum(1 for text in all_texts for slang in HEBREW_SLANG if slang in text)
311
+ features.slang_rate = slang_count / n_msgs
312
+
313
+ # Acronym rate
314
+ acronym_count = sum(1 for text in all_texts for acr in HEBREW_ACRONYMS if acr in text)
315
+ features.acronym_rate = acronym_count / n_msgs
316
+
317
+ # Repeated characters (emotional expression like 讞讞讞讞)
318
+ repeated_msgs = sum(1 for text in all_texts if REPEATED_CHARS_PATTERN.search(text))
319
+ features.repeated_chars_rate = repeated_msgs / n_msgs
320
+
321
+ # Leet speak rate
322
+ leet_count = sum(len(LEET_PATTERN.findall(text)) for text in all_texts)
323
+ features.leet_speak_rate = leet_count / n_msgs
324
+
325
+ # === Linguistic Patterns ===
326
+ # Function word frequency
327
+ word_counter = Counter(w.lower() for w in all_words)
328
+ total_words = len(all_words)
329
+ for fw in HEBREW_FUNCTION_WORDS:
330
+ features.function_word_freq[fw] = word_counter.get(fw, 0) / max(1, total_words)
331
+
332
+ # Vocabulary richness
333
+ unique_words = set(w.lower() for w in all_words)
334
+ features.unique_word_ratio = len(unique_words) / max(1, total_words)
335
+
336
+ # Hapax legomena (words appearing only once)
337
+ hapax_count = sum(1 for w, c in word_counter.items() if c == 1)
338
+ features.hapax_ratio = hapax_count / max(1, len(unique_words))
339
+
340
+ # Message length categories
341
+ features.short_message_ratio = sum(1 for wc in word_counts_per_msg if wc < 5) / n_msgs
342
+ features.long_message_ratio = sum(1 for wc in word_counts_per_msg if wc > 30) / n_msgs
343
+
344
+ # === Time Patterns ===
345
+ hour_counts = np.zeros(24)
346
+ weekday_counts = np.zeros(7)
347
+ night_msgs = 0
348
+ weekend_msgs = 0
349
+
350
+ for text, date_str in messages:
351
  try:
352
  if 'T' in date_str:
353
  dt = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
354
  else:
355
  dt = datetime.strptime(date_str[:19], '%Y-%m-%d %H:%M:%S')
356
+
357
  hour_counts[dt.hour] += 1
358
+ weekday_counts[dt.weekday()] += 1
359
+
360
+ if 0 <= dt.hour < 6:
361
+ night_msgs += 1
362
  if dt.weekday() >= 5: # Saturday=5, Sunday=6
363
  weekend_msgs += 1
364
  except:
365
  pass
366
 
367
+ # Normalize
368
+ if hour_counts.sum() > 0:
369
+ features.hour_distribution = hour_counts / hour_counts.sum()
370
+ if weekday_counts.sum() > 0:
371
+ features.weekday_distribution = weekday_counts / weekday_counts.sum()
 
 
 
 
 
 
 
 
 
372
 
373
+ features.weekend_ratio = weekend_msgs / n_msgs
374
+ features.night_owl_ratio = night_msgs / n_msgs
 
 
 
 
 
 
 
 
375
 
376
+ # === N-gram Features ===
377
+ # Character bigrams
378
+ char_bigram_counter = Counter()
379
+ for text in all_texts:
380
+ clean_text = re.sub(r'\s+', ' ', text.lower())
381
+ for i in range(len(clean_text) - 1):
382
+ bg = clean_text[i:i+2]
383
+ if bg.strip():
384
+ char_bigram_counter[bg] += 1
 
 
 
 
 
 
 
 
 
 
 
 
385
 
 
386
  total_bigrams = sum(char_bigram_counter.values())
387
  if total_bigrams > 0:
388
+ for bg, count in char_bigram_counter.most_common(100):
389
+ features.char_bigrams[bg] = count / total_bigrams
390
 
391
+ # Character trigrams
392
+ char_trigram_counter = Counter()
393
+ for text in all_texts:
394
+ clean_text = re.sub(r'\s+', ' ', text.lower())
395
+ for i in range(len(clean_text) - 2):
396
+ tg = clean_text[i:i+3]
397
+ if tg.strip():
398
+ char_trigram_counter[tg] += 1
399
+
400
+ total_trigrams = sum(char_trigram_counter.values())
401
+ if total_trigrams > 0:
402
+ for tg, count in char_trigram_counter.most_common(100):
403
+ features.char_trigrams[tg] = count / total_trigrams
404
+
405
+ # Word bigrams
406
+ word_bigram_counter = Counter()
407
+ for text in all_texts:
408
+ words = text.lower().split()
409
+ for i in range(len(words) - 1):
410
+ wb = f"{words[i]} {words[i+1]}"
411
+ word_bigram_counter[wb] += 1
412
+
413
+ total_word_bigrams = sum(word_bigram_counter.values())
414
+ if total_word_bigrams > 0:
415
+ for wb, count in word_bigram_counter.most_common(50):
416
+ features.word_bigrams[wb] = count / total_word_bigrams
417
+
418
+ # === Generate Style Embedding ===
419
+ if self.embedding_model:
420
+ try:
421
+ # Sample messages for embedding (limit for performance)
422
+ sample_texts = all_texts[:100] if len(all_texts) > 100 else all_texts
423
+ # Combine into a style sample
424
+ style_sample = ' '.join(sample_texts)[:5000] # Limit length
425
+ features.style_embedding = self.embedding_model.encode(style_sample, show_progress_bar=False)
426
+ except Exception as e:
427
+ print(f"Embedding error for user {user_id}: {e}")
428
+
429
+ # === Build Numeric Feature Vector ===
430
  features.feature_vector = self._build_feature_vector(features)
431
 
432
  return features
433
 
434
+ def _build_feature_vector(self, f: AdvancedStyleFeatures) -> np.ndarray:
435
  """Build normalized feature vector for similarity comparison."""
436
  vector = [
437
+ # Basic stats (normalized)
438
+ f.avg_message_length / 200,
439
+ f.std_message_length / 100,
440
  f.avg_word_length / 10,
441
+ f.avg_words_per_message / 20,
442
+
443
+ # Character ratios
444
  f.hebrew_ratio,
445
  f.english_ratio,
446
+ f.digit_ratio * 10,
447
+ f.emoji_ratio * 100,
448
+ f.punctuation_ratio * 10,
449
+
450
+ # Punctuation patterns
451
+ f.comma_rate / 2,
452
+ f.period_rate / 2,
453
  f.question_rate,
454
  f.exclamation_rate,
455
  f.ellipsis_rate * 5,
456
+ f.quote_rate,
457
+
458
+ # Hebrew-specific
459
+ f.formality_score,
460
+ f.slang_rate * 5,
461
+ f.acronym_rate * 10,
462
+ f.repeated_chars_rate * 5,
463
+ f.leet_speak_rate * 10,
464
+
465
+ # Linguistic
466
  f.unique_word_ratio,
467
+ f.hapax_ratio,
468
  f.short_message_ratio,
469
+ f.long_message_ratio,
470
+
471
+ # Time patterns
472
+ f.weekend_ratio,
473
+ f.night_owl_ratio * 5,
474
  ]
475
 
476
  # Add hour distribution (24 values)
477
+ vector.extend(f.hour_distribution.tolist())
 
 
478
 
479
+ # Add weekday distribution (7 values)
480
+ vector.extend(f.weekday_distribution.tolist())
 
 
481
 
482
+ # Add top function word frequencies (20 values)
483
+ for fw in HEBREW_FUNCTION_WORDS[:20]:
484
+ vector.append(f.function_word_freq.get(fw, 0) * 100)
485
 
486
+ return np.array(vector)
 
 
 
487
 
488
def calculate_similarity(self, f1: "AdvancedStyleFeatures", f2: "AdvancedStyleFeatures") -> Tuple[float, Dict]:
    """
    Score how alike two users' writing profiles are.

    Returns (overall, breakdown): *overall* is a weighted blend of the
    component scores, *breakdown* maps component name -> score. The
    'embedding_cosine' entry is None when no embedding model was available;
    in that case its weight is spread over the handcrafted components.
    """

    def _cosine(a, b):
        # Cosine similarity of two vectors; None when either is missing,
        # 0.0 when either has zero norm.
        if a is None or b is None:
            return None
        norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)
        if norm_a > 0 and norm_b > 0:
            return float(np.dot(a, b) / (norm_a * norm_b))
        return 0.0

    # 1. Handcrafted feature-vector similarity (missing vectors score 0.0).
    feature_cos = _cosine(f1.feature_vector, f2.feature_vector)
    scores: Dict = {'feature_cosine': 0.0 if feature_cos is None else feature_cos}

    # 2. AI embedding similarity (None when unavailable — kept distinct from 0.0).
    scores['embedding_cosine'] = _cosine(f1.style_embedding, f2.style_embedding)

    # 3–5. N-gram distribution overlaps (weighted-Jaccard style).
    scores['bigram_overlap'] = self._ngram_similarity(f1.char_bigrams, f2.char_bigrams)
    scores['trigram_overlap'] = self._ngram_similarity(f1.char_trigrams, f2.char_trigrams)
    scores['word_bigram_overlap'] = self._ngram_similarity(f1.word_bigrams, f2.word_bigrams)

    # 6. Hour-of-day activity overlap (dot product of the two distributions).
    if f1.hour_distribution.sum() > 0 and f2.hour_distribution.sum() > 0:
        scores['time_pattern'] = float(np.dot(f1.hour_distribution, f2.hour_distribution))
    else:
        scores['time_pattern'] = 0.0

    # === Weighted combination ===
    # The two tables below are the closed form of "base weights plus
    # redistribution of the embedding weight when it is missing".
    if scores['embedding_cosine'] is not None:
        weights = {
            'feature_cosine': 0.25,
            'embedding_cosine': 0.30,
            'bigram_overlap': 0.15,
            'trigram_overlap': 0.10,
            'word_bigram_overlap': 0.10,
            'time_pattern': 0.10,
        }
    else:
        weights = {
            'feature_cosine': 0.40,
            'bigram_overlap': 0.25,
            'trigram_overlap': 0.15,
            'word_bigram_overlap': 0.10,
            'time_pattern': 0.10,
        }

    overall = sum(w * scores[key] for key, w in weights.items()
                  if scores.get(key) is not None)

    return overall, scores
 
556
 
557
+ def _ngram_similarity(self, ng1: Dict[str, float], ng2: Dict[str, float]) -> float:
558
+ """Calculate similarity between n-gram distributions."""
559
+ if not ng1 or not ng2:
560
  return 0.0
561
 
562
+ all_ngrams = set(ng1.keys()) | set(ng2.keys())
563
+ if not all_ngrams:
564
  return 0.0
565
 
 
566
  intersection = 0.0
567
  union = 0.0
568
 
569
+ for ng in all_ngrams:
570
+ v1 = ng1.get(ng, 0)
571
+ v2 = ng2.get(ng, 0)
572
  intersection += min(v1, v2)
573
  union += max(v1, v2)
574
 
 
577
 
578
  return intersection / union
579
 
580
def cluster_users(self, min_cluster_size: int = 2) -> List[List[int]]:
    """
    Cluster users with similar writing styles via DBSCAN.

    Returns a list of clusters, each a list of user_ids; DBSCAN noise
    (label -1) is omitted. Returns [] when fewer than two users were
    analyzed, when scikit-learn is unavailable, or when no user has a
    computed feature vector.
    """
    if len(self.user_features) < 2:
        return []

    try:
        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler
    except ImportError:
        print("scikit-learn not available for clustering")
        return []

    # Build one combined vector per user: handcrafted features plus the
    # style embedding when present. Users can differ in which parts they
    # have, so vectors are zero-padded to a common width below — the
    # previous fixed np.zeros(50) fallback could mismatch the other rows
    # and make np.array() produce a ragged/object matrix.
    user_ids = list(self.user_features.keys())
    raw_vectors = []
    for uid in user_ids:
        f = self.user_features[uid]
        if f.feature_vector is None:
            raw_vectors.append(None)
        elif f.style_embedding is not None:
            raw_vectors.append(np.concatenate([f.feature_vector, f.style_embedding]))
        else:
            raw_vectors.append(f.feature_vector)

    dims = [v.shape[0] for v in raw_vectors if v is not None]
    if not dims:
        # Nobody has features to cluster on; clustering zero placeholders
        # would only produce spurious groups.
        return []
    width = max(dims)

    feature_matrix = np.zeros((len(user_ids), width))
    for i, v in enumerate(raw_vectors):
        if v is not None:
            feature_matrix[i, :v.shape[0]] = v

    # Normalize features column-wise before distance computation.
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(feature_matrix)

    # DBSCAN: eps is the maximum distance between samples in a cluster,
    # min_samples the minimum members needed to form one.
    dbscan = DBSCAN(eps=0.5, min_samples=min_cluster_size, metric='cosine')
    labels = dbscan.fit_predict(features_scaled)

    # Group users by cluster label (-1 means noise, i.e. no cluster).
    clusters = defaultdict(list)
    for i, label in enumerate(labels):
        if label >= 0:
            clusters[label].append(user_ids[i])

    return [users for users in clusters.values() if len(users) >= min_cluster_size]
630
+
631
  def analyze_all_users(self, min_messages: int = 300, days: int = 365,
632
+ progress_callback=None) -> Dict:
633
  """Analyze all active users and find potential duplicates."""
634
 
635
  # Get active users
 
647
  self.user_features[user_id] = features
648
 
649
  if progress_callback:
650
+ progress_callback('user_processed', idx + 1, total_users, user_name or f"User_{user_id}")
651
 
652
  # Find similar pairs
653
  if progress_callback:
 
663
  uid1, uid2 = user_ids[i], user_ids[j]
664
  f1, f2 = self.user_features[uid1], self.user_features[uid2]
665
 
666
+ similarity, score_breakdown = self.calculate_similarity(f1, f2)
667
 
668
  if similarity >= self.similarity_threshold:
669
  similar_pairs.append({
670
  'user1': f1.to_dict(),
671
  'user2': f2.to_dict(),
672
  'similarity': round(similarity * 100, 1),
673
+ 'scores': {k: round(v * 100, 1) if v is not None else None
674
+ for k, v in score_breakdown.items()},
675
+ 'details': self._get_similarity_details(f1, f2, score_breakdown)
676
  })
677
 
678
  comparison_count += 1
 
682
  # Sort by similarity (highest first)
683
  similar_pairs.sort(key=lambda x: x['similarity'], reverse=True)
684
 
685
+ # Run clustering
686
+ clusters = self.cluster_users(min_cluster_size=2)
687
+ cluster_info = []
688
+ for cluster in clusters:
689
+ cluster_users = [self.user_features[uid].to_dict() for uid in cluster]
690
+ cluster_info.append({
691
+ 'users': cluster_users,
692
+ 'size': len(cluster)
693
+ })
694
+
695
  return {
696
  'total_users_analyzed': total_users,
697
  'threshold': self.similarity_threshold * 100,
698
  'potential_duplicates': len(similar_pairs),
699
  'pairs': similar_pairs,
700
+ 'clusters': cluster_info,
701
+ 'all_users': [f.to_dict() for f in self.user_features.values()],
702
+ 'embedding_model_used': self.embedding_model is not None,
703
  }
704
 
705
+ def _get_similarity_details(self, f1: AdvancedStyleFeatures, f2: AdvancedStyleFeatures,
706
+ scores: Dict) -> List[str]:
707
+ """Get human-readable similarity details in Hebrew."""
708
  details = []
709
 
710
+ # High embedding similarity
711
+ if scores.get('embedding_cosine') and scores['embedding_cosine'] > 0.85:
712
+ details.append("住讙谞讜谉 讻转讬讘讛 讚讜诪讛 诪讗讜讚 (AI embedding)")
713
+
714
+ # Message length
715
  len_diff = abs(f1.avg_message_length - f2.avg_message_length)
716
+ if len_diff < 15:
717
  details.append(f"讗讜专讱 讛讜讚注讛 讚讜诪讛 ({f1.avg_message_length:.0f} vs {f2.avg_message_length:.0f})")
718
 
719
  # Hebrew/English ratio
720
  heb_diff = abs(f1.hebrew_ratio - f2.hebrew_ratio)
721
  if heb_diff < 0.1:
722
+ details.append(f"讬讞住 注讘专讬转 讚讜诪讛 ({f1.hebrew_ratio:.0%} vs {f2.hebrew_ratio:.0%})")
723
 
724
  # Emoji usage
725
  emoji_diff = abs(f1.emoji_ratio - f2.emoji_ratio)
726
+ if emoji_diff < 0.005 and (f1.emoji_ratio > 0.001 or f2.emoji_ratio > 0.001):
727
  details.append("砖讬诪讜砖 讚讜诪讛 讘讗讬诪讜讙'讬")
728
 
729
+ # Formality
730
+ form_diff = abs(f1.formality_score - f2.formality_score)
731
+ if form_diff < 0.3:
732
+ if f1.formality_score > 0.3:
733
+ details.append("砖谞讬讛诐 讻讜转讘讬诐 讘住讙谞讜谉 驻讜专诪诇讬")
734
+ elif f1.formality_score < -0.3:
735
+ details.append("砖谞讬讛诐 讻讜转讘讬诐 讘住讙谞讜谉 诇讗 驻讜专诪诇讬")
736
 
737
+ # Slang usage
738
+ if abs(f1.slang_rate - f2.slang_rate) < 0.1:
739
+ if f1.slang_rate > 0.2:
740
+ details.append(" 讚讜诪讛 讘住诇谞讙")
741
 
742
  # Repeated characters
743
  if abs(f1.repeated_chars_rate - f2.repeated_chars_rate) < 0.05:
744
  if f1.repeated_chars_rate > 0.1:
745
+ details.append("砖谞讬讛诐 诪砖转诪砖讬诐 讘转讜讜讬诐 讞讜讝专讬诐 (讻诪讜 讞讞讞讞)")
746
 
747
  # Time patterns
748
+ if scores.get('time_pattern', 0) > 0.8:
749
+ details.append("讚驻讜住 砖注讜转 驻注讬诇讜转 讚讜诪讛 诪讗讜讚")
750
+
751
+ # Weekend activity
752
+ weekend_diff = abs(f1.weekend_ratio - f2.weekend_ratio)
753
+ if weekend_diff < 0.1:
754
+ details.append("驻注讬诇讜转 讚讜诪讛 讘住讜驻\"砖")
755
+
756
+ # Night owl
757
+ if abs(f1.night_owl_ratio - f2.night_owl_ratio) < 0.05:
758
+ if f1.night_owl_ratio > 0.1:
759
+ details.append("砖谞讬讛诐 驻注讬诇讬诐 讘砖注讜转 讛诇讬诇讛")
760
+
761
+ # N-gram overlap
762
+ if scores.get('bigram_overlap', 0) > 0.6:
763
+ details.append("讚驻讜住讬 讗讜转讬讜转 讚讜诪讬诐 诪讗讜讚")
764
+
765
+ if scores.get('word_bigram_overlap', 0) > 0.4:
766
+ details.append("爪讬专讜驻讬 诪讬诇讬诐 讚讜诪讬诐")
767
 
768
  return details
769
 
770
 
771
# Singleton instance (lazily created on first access)
_analyzer_instance: Optional["AdvancedStylometryAnalyzer"] = None


def get_stylometry_analyzer() -> "AdvancedStylometryAnalyzer":
    """Return the process-wide AdvancedStylometryAnalyzer, creating it on first use."""
    global _analyzer_instance
    if _analyzer_instance is not None:
        return _analyzer_instance
    _analyzer_instance = AdvancedStylometryAnalyzer()
    return _analyzer_instance
templates/maintenance.html CHANGED
@@ -275,6 +275,50 @@
275
  margin-top: 5px;
276
  }
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  /* Pairs List */
279
  .pairs-list {
280
  display: flex;
@@ -472,10 +516,12 @@
472
  <main class="main-content locked" id="main-content">
473
  <!-- Stylometry Analysis Section -->
474
  <section class="section">
475
- <h2>讝讬讛讜讬 诪砖转诪砖讬诐 讻驻讜诇讬诐 (Stylometry)</h2>
476
  <p>
477
- 诪注专讻转 诪讞转 讗转 住讙谞讜谉 讛讻转讬讘讛 砖诇 讻诇 诪砖 讻讚讬 诇讝讛讜 讞砖讘讜谞讜转 砖讬讬转讬讻讬诐 诇讗讜讜 讗讚诐.
478
- 讛谞讬转讜讞 讻讜诇诇: 讗讜专讱 讛讜讚注讜转, 砖讬诪讜砖 讘讗讬诪讜讙'讬, 住讬诪谞讬 驻讬住讜拽, 讬讞住 注讘专讬转/讗谞讙诇讬转, 砖注讜转 驻注讬诇讜转 讜注讜讚.
 
 
479
  </p>
480
 
481
  <div class="controls">
@@ -628,6 +674,10 @@
628
  container.classList.add('active');
629
 
630
  // Stats
 
 
 
 
631
  statsGrid.innerHTML = `
632
  <div class="stat-card">
633
  <div class="value">${data.total_users_analyzed}</div>
@@ -635,12 +685,20 @@
635
  </div>
636
  <div class="stat-card">
637
  <div class="value">${data.potential_duplicates}</div>
638
- <div class="label">讞砖讜讚讬诐 讻讻驻讜诇讬诐</div>
 
 
 
 
639
  </div>
640
  <div class="stat-card">
641
  <div class="value">${data.threshold}%</div>
642
  <div class="label">住祝 讚诪讬讜谉</div>
643
  </div>
 
 
 
 
644
  `;
645
 
646
  // Pairs
@@ -700,37 +758,94 @@
700
  <td>${(pair.user1.hebrew_ratio * 100).toFixed(1)}%</td>
701
  <td>${(pair.user2.hebrew_ratio * 100).toFixed(1)}%</td>
702
  </tr>
703
- <tr>
704
- <td>讬讞住 讗谞讙诇讬转</td>
705
- <td>${(pair.user1.english_ratio * 100).toFixed(1)}%</td>
706
- <td>${(pair.user2.english_ratio * 100).toFixed(1)}%</td>
707
- </tr>
708
  <tr>
709
  <td>砖讬诪讜砖 讘讗讬诪讜讙'讬</td>
710
  <td>${(pair.user1.emoji_ratio * 100).toFixed(2)}%</td>
711
  <td>${(pair.user2.emoji_ratio * 100).toFixed(2)}%</td>
712
  </tr>
713
  <tr>
714
- <td>住讬谞讬 砖讗讛 (诇讻诇 讛讚注讛)</td>
715
- <td>${pair.user1.question_rate.toFixed(2)}</td>
716
- <td>${pair.user2.question_rate.toFixed(2)}</td>
 
 
 
 
 
717
  </tr>
718
  <tr>
719
- <td>诪谞讬 专讬讗讛 (诇讻诇 讛讜讚注讛)</td>
720
- <td>${pair.user1.exclamation_rate.toFixed(2)}</td>
721
- <td>${pair.user2.exclamation_rate.toFixed(2)}</td>
722
  </tr>
723
  <tr>
724
  <td>驻注讬诇讜转 讘住讜驻"砖</td>
725
  <td>${(pair.user1.weekend_ratio * 100).toFixed(1)}%</td>
726
  <td>${(pair.user2.weekend_ratio * 100).toFixed(1)}%</td>
727
  </tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
728
  </table>
729
  </div>
730
  `;
731
  }
732
 
733
  pairsHTML += '</div>';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734
  pairsContainer.innerHTML = pairsHTML;
735
  }
736
  }
 
275
  margin-top: 5px;
276
  }
277
 
278
+ .stat-card .value.available {
279
+ color: #66bb6a;
280
+ }
281
+
282
+ .stat-card .value.unavailable {
283
+ color: #ff6b6b;
284
+ }
285
+
286
+ /* Clusters Section */
287
+ .clusters-section {
288
+ margin-top: 30px;
289
+ }
290
+
291
+ .clusters-section h3 {
292
+ color: #ff6b6b;
293
+ margin-bottom: 15px;
294
+ }
295
+
296
+ .cluster-card {
297
+ background: rgba(102, 187, 106, 0.1);
298
+ border: 1px solid rgba(102, 187, 106, 0.3);
299
+ border-radius: 10px;
300
+ padding: 15px;
301
+ margin-bottom: 15px;
302
+ }
303
+
304
+ .cluster-card h4 {
305
+ color: #66bb6a;
306
+ margin-bottom: 10px;
307
+ }
308
+
309
+ .cluster-users {
310
+ display: flex;
311
+ flex-wrap: wrap;
312
+ gap: 10px;
313
+ }
314
+
315
+ .cluster-user {
316
+ background: rgba(0, 0, 0, 0.3);
317
+ padding: 8px 15px;
318
+ border-radius: 20px;
319
+ font-size: 0.9rem;
320
+ }
321
+
322
  /* Pairs List */
323
  .pairs-list {
324
  display: flex;
 
516
  <main class="main-content locked" id="main-content">
517
  <!-- Stylometry Analysis Section -->
518
  <section class="section">
519
+ <h2>讝讬讛讜讬 诪砖转诪砖讬诐 讻驻讜诇讬诐 (Advanced Stylometry + AI)</h2>
520
  <p>
521
+ 诪注专讻转 诪转拽讚诪转 诇讝讬讛讜讬 讞砖讘讜谞讜转 讻驻讜诇讬诐 讛诪砖诇讘转:
522
+ <strong>AI Embeddings</strong> (sentence-transformers),
523
+ <strong>DBSCAN Clustering</strong> (scikit-learn),
524
+ 讜谞讬转讜讞 诇砖讜谞讬 注讘专讬 诪转拽讚诐 (驻讜专诪诇讬讜转, 住诇谞讙, 专讗砖讬 转讬讘讜转, 讚驻讜住讬 讝诪谉).
525
  </p>
526
 
527
  <div class="controls">
 
674
  container.classList.add('active');
675
 
676
  // Stats
677
+ const clusterCount = data.clusters ? data.clusters.length : 0;
678
+ const aiUsed = data.embedding_model_used ? '&#10003;' : '&#10007;';
679
+ const aiClass = data.embedding_model_used ? 'available' : 'unavailable';
680
+
681
  statsGrid.innerHTML = `
682
  <div class="stat-card">
683
  <div class="value">${data.total_users_analyzed}</div>
 
685
  </div>
686
  <div class="stat-card">
687
  <div class="value">${data.potential_duplicates}</div>
688
+ <div class="label">讝讜讙讜转 讞砖讜讚讬诐</div>
689
+ </div>
690
+ <div class="stat-card">
691
+ <div class="value">${clusterCount}</div>
692
+ <div class="label">拽讘讜爪讜转 DBSCAN</div>
693
  </div>
694
  <div class="stat-card">
695
  <div class="value">${data.threshold}%</div>
696
  <div class="label">住祝 讚诪讬讜谉</div>
697
  </div>
698
+ <div class="stat-card">
699
+ <div class="value ${aiClass}">${aiUsed}</div>
700
+ <div class="label">AI Embeddings</div>
701
+ </div>
702
  `;
703
 
704
  // Pairs
 
758
  <td>${(pair.user1.hebrew_ratio * 100).toFixed(1)}%</td>
759
  <td>${(pair.user2.hebrew_ratio * 100).toFixed(1)}%</td>
760
  </tr>
 
 
 
 
 
761
  <tr>
762
  <td>砖讬诪讜砖 讘讗讬诪讜讙'讬</td>
763
  <td>${(pair.user1.emoji_ratio * 100).toFixed(2)}%</td>
764
  <td>${(pair.user2.emoji_ratio * 100).toFixed(2)}%</td>
765
  </tr>
766
  <tr>
767
+ <td>专诪转 驻讜专诪诇讬讜转</td>
768
+ <td>${pair.user1.formality_score > 0 ? '驻讜专诪诇讬' : (pair.user1.formality_score < 0 ? '诇讗 驻讜专诪诇讬' : '谞讬讬讟专诇讬')}</td>
769
+ <td>${pair.user2.formality_score > 0 ? '驻讜专诪诇讬' : (pair.user2.formality_score < 0 ? '诇讗 驻讜专诪诇讬' : '谞讬讬讟专诇讬')}</td>
770
+ </tr>
771
+ <tr>
772
+ <td>砖讬诪讜砖 讘住诇谞讙</td>
773
+ <td>${(pair.user1.slang_rate * 100).toFixed(1)}%</td>
774
+ <td>${(pair.user2.slang_rate * 100).toFixed(1)}%</td>
775
  </tr>
776
  <tr>
777
+ <td>转讜讜 讞讜讝专讬 (讞讞讞讞)</td>
778
+ <td>${(pair.user1.repeated_chars_rate * 100).toFixed(1)}%</td>
779
+ <td>${(pair.user2.repeated_chars_rate * 100).toFixed(1)}%</td>
780
  </tr>
781
  <tr>
782
  <td>驻注讬诇讜转 讘住讜驻"砖</td>
783
  <td>${(pair.user1.weekend_ratio * 100).toFixed(1)}%</td>
784
  <td>${(pair.user2.weekend_ratio * 100).toFixed(1)}%</td>
785
  </tr>
786
+ <tr>
787
+ <td>驻注讬诇讜转 诇讬诇讬转 (00-06)</td>
788
+ <td>${(pair.user1.night_owl_ratio * 100).toFixed(1)}%</td>
789
+ <td>${(pair.user2.night_owl_ratio * 100).toFixed(1)}%</td>
790
+ </tr>
791
+ <tr>
792
+ <td>注讜砖专 讗讜爪专 诪讬诇讬诐</td>
793
+ <td>${(pair.user1.unique_word_ratio * 100).toFixed(1)}%</td>
794
+ <td>${(pair.user2.unique_word_ratio * 100).toFixed(1)}%</td>
795
+ </tr>
796
+ ${pair.scores ? `
797
+ <tr style="background: rgba(255,107,107,0.1);">
798
+ <td colspan="3" style="text-align: center; color: #ff6b6b; font-weight: bold;">爪讬讜谞讬 讚诪讬讜谉 诇驻讬 专讻讬讘</td>
799
+ </tr>
800
+ <tr>
801
+ <td>Feature Vector</td>
802
+ <td colspan="2" style="text-align: center;">${pair.scores.feature_cosine || 0}%</td>
803
+ </tr>
804
+ <tr>
805
+ <td>AI Embedding</td>
806
+ <td colspan="2" style="text-align: center;">${pair.scores.embedding_cosine !== null ? pair.scores.embedding_cosine + '%' : 'N/A'}</td>
807
+ </tr>
808
+ <tr>
809
+ <td>Character Bigrams</td>
810
+ <td colspan="2" style="text-align: center;">${pair.scores.bigram_overlap || 0}%</td>
811
+ </tr>
812
+ <tr>
813
+ <td>Word Patterns</td>
814
+ <td colspan="2" style="text-align: center;">${pair.scores.word_bigram_overlap || 0}%</td>
815
+ </tr>
816
+ <tr>
817
+ <td>Time Pattern</td>
818
+ <td colspan="2" style="text-align: center;">${pair.scores.time_pattern || 0}%</td>
819
+ </tr>
820
+ ` : ''}
821
  </table>
822
  </div>
823
  `;
824
  }
825
 
826
  pairsHTML += '</div>';
827
+
828
+ // Add clusters section if available
829
+ if (data.clusters && data.clusters.length > 0) {
830
+ pairsHTML += `
831
+ <div class="clusters-section">
832
+ <h3>拽讘讜爪讜转 诪砖转诪砖讬诐 讚讜诪讬诐 (DBSCAN Clustering)</h3>
833
+ `;
834
+ data.clusters.forEach((cluster, idx) => {
835
+ pairsHTML += `
836
+ <div class="cluster-card">
837
+ <h4>拽讘讜爪讛 ${idx + 1} (${cluster.size} 诪砖转诪砖讬诐)</h4>
838
+ <div class="cluster-users">
839
+ ${cluster.users.map(u => `
840
+ <span class="cluster-user">${escapeHtml(u.user_name)} (${u.message_count})</span>
841
+ `).join('')}
842
+ </div>
843
+ </div>
844
+ `;
845
+ });
846
+ pairsHTML += '</div>';
847
+ }
848
+
849
  pairsContainer.innerHTML = pairsHTML;
850
  }
851
  }