agus1111 committed
Commit 4330fcc (verified) · Parent: b1fcbd1

Update botsignal.py

Files changed (1): botsignal.py (+107 −49)
botsignal.py CHANGED
@@ -35,6 +35,7 @@ SUPPORT_CHATS = [
     "https://t.me/TheDonALPHAJournal",
     "https://t.me/savascalls",
     "https://t.me/Tanjirocall",
+    "https://t.me/Zen_call",
     "https://t.me/ChapoInsider",
     "https://t.me/millionsgems",
     "https://t.me/Milagrosdegencalls",
@@ -93,6 +94,8 @@ def build_client() -> TelegramClient:
 client = build_client()
 recent_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE)
 recent_content_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE)  # content-only dedup
+# New: entity-based dedup (CA/$ticker)
+recent_entity_keys: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE)
 
 # Peta id_chat -> "core" / "support"
 chat_roles: Dict[int, str] = {}  # diisi saat startup setelah resolve entity
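
Note: the new recent_entity_keys buffer reuses the pattern of the two existing deques: membership in a fixed-size deque is the dedup test, and appending past maxlen silently evicts the oldest entry, so old tokens eventually become postable again. A minimal sketch of that behavior (not part of the commit; the DEDUP_BUFFER_SIZE value is assumed here):

from collections import deque

DEDUP_BUFFER_SIZE = 3  # assumed value, only for this sketch
recent_entity_keys: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE)

def seen_before(key: str) -> bool:
    """Return True if the key is still in the buffer; otherwise remember it."""
    if key in recent_entity_keys:      # O(n) scan over a small, bounded buffer
        return True
    recent_entity_keys.append(key)     # the oldest entry is evicted automatically at maxlen
    return False

assert seen_before("ca:sol:abc") is False
assert seen_before("ca:sol:abc") is True
seen_before("t1"); seen_before("t2"); seen_before("t3")   # pushes "ca:sol:abc" out
assert seen_before("ca:sol:abc") is False                 # forgotten once maxlen newer keys arrive
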
@@ -109,7 +112,8 @@ def _db():
 
 def _init_db():
     conn = _db()
-    conn.executescript("""
+    conn.executescript(
+        """
     CREATE TABLE IF NOT EXISTS last_posted (
         keyword TEXT PRIMARY KEY,
         msg_id INTEGER NOT NULL,
@@ -121,7 +125,8 @@ def _init_db():
         last_ts INTEGER NOT NULL,
         PRIMARY KEY (keyword, group_key)
     );
-    """)
+    """
+    )
     conn.commit()
     conn.close()
 
@@ -139,17 +144,21 @@ def db_load_state():
 
 def db_save_last_posted(keyword: str, msg_id: int, tier: str):
     conn = _db()
-    conn.execute("INSERT INTO last_posted(keyword, msg_id, tier) VALUES(?,?,?) "
-                 "ON CONFLICT(keyword) DO UPDATE SET msg_id=excluded.msg_id, tier=excluded.tier",
-                 (keyword, msg_id, tier))
+    conn.execute(
+        "INSERT INTO last_posted(keyword, msg_id, tier) VALUES(?,?,?) "
+        "ON CONFLICT(keyword) DO UPDATE SET msg_id=excluded.msg_id, tier=excluded.tier",
+        (keyword, msg_id, tier),
+    )
     conn.commit()
     conn.close()
 
 def db_upsert_kw_seen(keyword: str, group_key: str, ts: datetime):
     conn = _db()
-    conn.execute("INSERT INTO kw_group_seen(keyword, group_key, last_ts) VALUES(?,?,?) "
-                 "ON CONFLICT(keyword, group_key) DO UPDATE SET last_ts=excluded.last_ts",
-                 (keyword, group_key, int(ts.timestamp())))
+    conn.execute(
+        "INSERT INTO kw_group_seen(keyword, group_key, last_ts) VALUES(?,?,?) "
+        "ON CONFLICT(keyword, group_key) DO UPDATE SET last_ts=excluded.last_ts",
+        (keyword, group_key, int(ts.timestamp())),
+    )
     conn.commit()
     conn.close()
 
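
Note: both DB helpers use SQLite's upsert form (INSERT ... ON CONFLICT ... DO UPDATE), so a second write for the same key updates the row instead of raising. A self-contained sketch of that pattern, assuming a tier TEXT column in last_posted (the hunks above only show part of the table definition):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE IF NOT EXISTS last_posted (
        keyword TEXT PRIMARY KEY,
        msg_id INTEGER NOT NULL,
        tier TEXT NOT NULL
    );
    """
)
for msg_id, tier in [(10, "rendah"), (11, "kuat")]:
    conn.execute(
        "INSERT INTO last_posted(keyword, msg_id, tier) VALUES(?,?,?) "
        "ON CONFLICT(keyword) DO UPDATE SET msg_id=excluded.msg_id, tier=excluded.tier",
        ("pepe", msg_id, tier),
    )
print(conn.execute("SELECT * FROM last_posted").fetchall())  # [('pepe', 11, 'kuat')]
conn.close()
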
@@ -177,19 +186,21 @@ def _tokenize_words(s: str) -> List[str]:
 
 def _windows(tokens: List[str], size: int = 20):
     for i in range(0, len(tokens), size):
-        yield " ".join(tokens[i:i+size])
+        yield " ".join(tokens[i : i + size])
 
 # --- Tambahan: bersihkan URL/CA untuk kepentingan SCORING relevansi ---
 CA_SOL_RE = re.compile(r"\b[1-9A-HJ-NP-Za-km-z]{32,48}\b")  # Solana base58 (perkiraan)
-CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b") # EVM address
+CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b")  # EVM address
 CA_LABEL_RE = re.compile(r"\bCA\s*[:=]\s*\S+", re.IGNORECASE)  # "CA: ..." potong tokennya
 
+
 def _strip_urls_and_mentions(s: str) -> str:
     s = re.sub(r"https?://\S+", "", s)
     s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
     s = re.sub(r"@[A-Za-z0-9_]+", "", s)
     return re.sub(r"\s+", " ", s).strip()
 
+
 def strip_contracts_for_scoring(s: str) -> str:
     """
     Hilangkan URL/mention, alamat kontrak, dan token setelah 'CA:'
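
Note: the two address patterns only approximate real contract addresses (the Solana one is an intentional approximation, as its comment says). A quick standalone check of what they match; the sample addresses are made up and only shaped like real values:

import re

CA_SOL_RE = re.compile(r"\b[1-9A-HJ-NP-Za-km-z]{32,48}\b")   # Solana base58 (approximate)
CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b")             # EVM address

sample = "CA: 0x" + "ab" * 20 + " and 9xQeWvG816bUx9EPjHmaT23yvVM2ZWbrrpZb9PusVFin"
print(CA_EVM_RE.findall(sample))   # one 0x-prefixed 40-hex-digit address
print(CA_SOL_RE.findall(sample))   # the 44-character base58-shaped string
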
@@ -201,6 +212,7 @@ def strip_contracts_for_scoring(s: str) -> str:
     s3 = CA_SOL_RE.sub(" ", s2)
     return re.sub(r"\s+", " ", s3).strip()
 
+
 def score_relevance(text: str, keywords: List[str]) -> float:
     """Skor: exact keyword + fuzzy windowed (top-3 rata-rata) agar adil untuk teks panjang."""
     if not text:
@@ -233,6 +245,7 @@ def score_relevance(text: str, keywords: List[str]) -> float:
 
     return exact_score + fuzzy_score
 
+
 def hash_for_dedup(text: str, msg) -> str:
     """Hash campuran (lama) – menahan duplikat per pesan+media."""
     parts = [text or ""]
@@ -249,37 +262,17 @@ def hash_for_dedup(text: str, msg) -> str:
     raw = "|".join(parts).encode("utf-8", errors="ignore")
     return hashlib.sha1(raw).hexdigest()
 
+
 def content_only_hash(text: str) -> str:
     """Hash berbasis isi saja (untuk lintas-grup crosspost)."""
     norm = _strip_urls_and_mentions(normalize_for_filter(text))
     return hashlib.sha1(norm.encode("utf-8", errors="ignore")).hexdigest()
 
-def is_image_message(msg) -> bool:
-    if getattr(msg, "photo", None) is not None:
-        return True
-    doc = getattr(msg, "document", None)
-    if doc and getattr(doc, "mime_type", None):
-        mt = (doc.mime_type or "").lower()
-        if mt.startswith("image/"):
-            if SKIP_STICKERS and ("webp" in mt or "sticker" in mt):
-                return False
-            return True
-        if not ALLOW_GIFS_VIDEOS:
-            return False
-        if mt in ("video/mp4", "image/gif"):
-            return True
-    return False
-
-def media_too_big(msg) -> bool:
-    doc = getattr(msg, "document", None)
-    if doc and getattr(doc, "size", None):
-        return (doc.size or 0) > MAX_MEDIA_MB * 1024 * 1024
-    return False
-
 
 # ========= Class aggregator (windowed unique groups) =========
 keyword_group_last_seen: defaultdict[str, dict[str, datetime]] = defaultdict(dict)
 
+
 def _prune_expired(now: datetime) -> None:
     window = timedelta(minutes=CLASS_WINDOW_MINUTES)
     cutoff = now - window
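
Note: content_only_hash is what catches crossposts: once URLs and @mentions are stripped, two copies of the same call hash identically even if each group appended its own invite link. A sketch of that effect (normalize_for_filter is defined elsewhere in the file; a plain lowercase stands in for it here):

import hashlib
import re

def _strip_urls_and_mentions(s: str) -> str:
    s = re.sub(r"https?://\S+", "", s)
    s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
    s = re.sub(r"@[A-Za-z0-9_]+", "", s)
    return re.sub(r"\s+", " ", s).strip()

def content_only_hash(text: str) -> str:
    norm = _strip_urls_and_mentions(text.lower())   # stand-in for normalize_for_filter
    return hashlib.sha1(norm.encode("utf-8", errors="ignore")).hexdigest()

a = content_only_hash("Big call on $ABC, join t.me/groupone")
b = content_only_hash("big call on $ABC, join t.me/grouptwo")
print(a == b)  # True – same content, different invite link
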
@@ -293,6 +286,7 @@ def _prune_expired(now: datetime) -> None:
     # db prune
     db_prune_expired(cutoff)
 
+
 def update_and_classify(keyword: str, group_key: str, now: Optional[datetime] = None) -> Tuple[str, int]:
     if not now:
         now = datetime.now(timezone.utc)
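
Note: update_and_classify (its body is not shown in this diff) turns the number of distinct groups that mentioned the key inside the rolling window into one of the tiers used later ("rendah" / "sedang" / "kuat"). A sketch of that mapping with assumed thresholds, purely to show the shape of the output rather than the bot's real cutoffs:

def classify_by_unique_groups(unique_groups: int) -> str:
    if unique_groups >= 3:   # assumed threshold
        return "kuat"
    if unique_groups == 2:   # assumed threshold
        return "sedang"
    return "rendah"

print(classify_by_unique_groups(1), classify_by_unique_groups(2), classify_by_unique_groups(4))
# rendah sedang kuat
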
@@ -326,7 +320,7 @@ INVITE_PATTERNS = [
     r"(t\.me\/joinchat|t\.me\/\+|telegram\.me\/|discord\.gg\/|wa\.me\/|whatsapp\.com\/)",
     r"(bit\.ly|tinyurl\.com|linktr\.ee)",
     # perluasan: link t.me biasa
-    r"t\.me\/[A-Za-z0-9_]+"
+    r"t\.me\/[A-Za-z0-9_]+",
 ]
 INVITE_REGEXES = [re.compile(p, re.IGNORECASE) for p in INVITE_PATTERNS]
 
@@ -338,6 +332,7 @@ WHITELIST_STRONG_SIGNAL = [
 ]
 WHITELIST_REGEXES = [re.compile(p, re.IGNORECASE) for p in WHITELIST_STRONG_SIGNAL]
 
+
 def _is_invite_sentence(s: str) -> bool:
     t = s.strip()
     if not t:
@@ -348,10 +343,11 @@ def _is_invite_sentence(s: str) -> bool:
     # Jika ada 1+ pola ajakan, buang
     return any(r.search(t) for r in INVITE_REGEXES)
 
+
 def filter_invite_sentences(text: str) -> str:
     if not text:
         return text
-    parts = re.split(r'(?<=[\.\!\?])\s+|\n+', text, flags=re.UNICODE)
+    parts = re.split(r'(?<=[\.!\?])\s+|\n+', text, flags=re.UNICODE)
     kept = [p.strip() for p in parts if p and not _is_invite_sentence(p)]
     cleaned = "\n".join(kept).strip()
     cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
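
Note: the invite filter works sentence by sentence: the text is split on sentence ends and newlines, and any piece matching an invite pattern is dropped before re-joining. A toy illustration with a single pattern standing in for the full INVITE_REGEXES / whitelist logic:

import re

INVITE_REGEXES = [re.compile(r"t\.me\/[A-Za-z0-9_]+", re.IGNORECASE)]

def filter_invite_sentences(text: str) -> str:
    parts = re.split(r'(?<=[\.!\?])\s+|\n+', text, flags=re.UNICODE)
    kept = [p.strip() for p in parts if p and not any(r.search(p) for r in INVITE_REGEXES)]
    return "\n".join(kept).strip()

print(filter_invite_sentences("Strong entry on $ABC. Join us at t.me/somegroup! Target is 2x."))
# Strong entry on $ABC.
# Target is 2x.
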
@@ -362,6 +358,7 @@ def filter_invite_sentences(text: str) -> str:
 TIER_ORDER = {"rendah": 0, "sedang": 1, "kuat": 2}
 last_posted: Dict[str, Dict[str, object]] = {}  # keyword -> {"msg_id": int, "tier": str}
 
+
 async def _send_initial(msg, text: str) -> int:
     if DRY_RUN:
         print("[DRY_RUN] send_initial:", text[:140])
@@ -370,7 +367,9 @@ async def _send_initial(msg, text: str) -> int:
     if INCLUDE_MEDIA and is_image_message(msg) and not media_too_big(msg):
         try:
             if getattr(msg, "photo", None):
-                m = await client.send_file(TARGET_CHAT, msg.photo, caption=text, caption_entities=None, force_document=False)
+                m = await client.send_file(
+                    TARGET_CHAT, msg.photo, caption=text, caption_entities=None, force_document=False
+                )
                 return m.id
             doc = getattr(msg, "document", None)
             if doc:
@@ -385,7 +384,9 @@ async def _send_initial(msg, text: str) -> int:
                     ext_guess = ".jpg"
                 ext = ext_guess
                 bio.name = f"media{ext}"
-                m = await client.send_file(TARGET_CHAT, bio, caption=text, caption_entities=None, force_document=False)
+                m = await client.send_file(
+                    TARGET_CHAT, bio, caption=text, caption_entities=None, force_document=False
+                )
                 return m.id
         except FloodWaitError as e:
             await asyncio.sleep(e.seconds + 1)
@@ -399,6 +400,7 @@ async def _send_initial(msg, text: str) -> int:
             await asyncio.sleep(e.seconds + 1)
             return await _send_initial(msg, text)
 
+
 async def post_or_update(keyword: str, body: str, new_tier: str, src_msg) -> None:
     prefix = f"[{new_tier.upper()}] "
     text = prefix + body
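
Note: every send path in the file handles Telethon's FloodWaitError the same way: sleep for the server-mandated number of seconds (plus one) and retry the call. The shape of that pattern as a standalone helper (the helper name is hypothetical, not from the commit):

import asyncio
from telethon.errors import FloodWaitError

async def send_with_floodwait_retry(send_coro_factory):
    """send_coro_factory: any zero-argument callable returning a fresh send coroutine."""
    try:
        return await send_coro_factory()
    except FloodWaitError as e:
        await asyncio.sleep(e.seconds + 1)   # wait out the cooldown Telegram asked for
        return await send_coro_factory()     # then retry once, mirroring the diff's recursion
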
@@ -445,7 +447,9 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
     if INCLUDE_MEDIA and is_image_message(msg) and not media_too_big(msg):
         try:
             if getattr(msg, "photo", None):
-                await client.send_file(TARGET_CHAT, msg.photo, caption=orig_text, caption_entities=entities, force_document=False)
+                await client.send_file(
+                    TARGET_CHAT, msg.photo, caption=orig_text, caption_entities=entities, force_document=False
+                )
                 return
             doc = getattr(msg, "document", None)
             if doc:
@@ -460,7 +464,9 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
                     ext_guess = ".jpg"
                 ext = ext_guess
                 bio.name = f"media{ext}"
-                await client.send_file(TARGET_CHAT, bio, caption=orig_text, caption_entities=entities, force_document=False)
+                await client.send_file(
+                    TARGET_CHAT, bio, caption=orig_text, caption_entities=entities, force_document=False
+                )
                 return
         except FloodWaitError as e:
             await asyncio.sleep(e.seconds + 1)
@@ -478,6 +484,7 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
 TICKER_CLEAN_RE = re.compile(r"\$[A-Za-z0-9]{2,12}")
 TICKER_NOISY_RE = re.compile(r"\$[A-Za-z0-9](?:[^A-Za-z0-9]+[A-Za-z0-9]){1,11}")
 
+
 def _extract_tickers(text_norm: str) -> List[str]:
     """
     Ambil $TICKER dengan dua cara:
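
Note: the two ticker patterns split the job: TICKER_CLEAN_RE catches an ordinary cashtag, while TICKER_NOISY_RE catches tickers padded with separators to dodge keyword filters. A quick check (not part of the commit):

import re

TICKER_CLEAN_RE = re.compile(r"\$[A-Za-z0-9]{2,12}")
TICKER_NOISY_RE = re.compile(r"\$[A-Za-z0-9](?:[^A-Za-z0-9]+[A-Za-z0-9]){1,11}")

print(TICKER_CLEAN_RE.findall("aping $PEPE and $WIF today"))   # ['$PEPE', '$WIF']
print(TICKER_NOISY_RE.findall("stealth call: $P.E.P.E"))       # ['$P.E.P.E']
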
@@ -507,6 +514,7 @@ def _extract_tickers(text_norm: str) -> List[str]:
         seen.add(x)
     return uniq
 
+
 def _extract_all_keywords(text_norm: str) -> List[str]:
     """
     Deteksi SEMUA keyword dari THEME_KEYWORDS + $ticker.
@@ -533,6 +541,7 @@ def _extract_all_keywords(text_norm: str) -> List[str]:
         seen.add(kw)
     return uniq
 
+
 def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
     if not kws:
         return None
@@ -547,10 +556,12 @@ def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
     chosen = sorted(score.items(), key=lambda x: (x[1][0], x[1][1], x[1][2]), reverse=True)[0][0]
     return chosen
 
+
 def _role_of(chat_id: int) -> str:
     # DEFAULT KE SUPPORT agar tidak salah meloloskan chat yang tidak tertag
     return chat_roles.get(chat_id, "support")
 
+
 def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
     """
     Hitung jumlah grup unik yang menyebut 'keyword' dalam window aktif,
@@ -564,6 +575,30 @@ def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
     return len(core_ids), len(sup_ids)
 
 
+# ========= NEW: Entity-key extraction (CA > $ticker) =========
+def extract_entity_key(text: str) -> Optional[str]:
+    """Kembalikan kunci entitas kanonik untuk penentuan 'kesamaan':
+    - Jika ada CA -> 'ca:evm:<0x...>' atau 'ca:sol:<base58>'
+    - Else jika ada $ticker -> 'ticker:<lowercase>'
+    - Else None
+    """
+    t = normalize_for_filter(text)
+
+    # Prefer CA lebih dulu
+    m = CA_EVM_RE.search(t) or CA_SOL_RE.search(t)
+    if m:
+        addr = m.group(0)
+        kind = "evm" if addr.lower().startswith("0x") else "sol"
+        return f"ca:{kind}:{addr.lower()}"
+
+    # Fall back ke $ticker (pakai deteksi yang sudah ada)
+    tickers = _extract_tickers(t.lower())
+    if tickers:
+        return f"ticker:{tickers[0][1:].lower()}"
+
+    return None
+
+
 async def process_message(msg, source_chat_id: int) -> None:
     """
     Filter, content-dedup, relevansi, multi-kw -> pilih dominan,
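
Note: extract_entity_key is the heart of this commit: it gives the bot a canonical "same token" notion in which a contract address beats a cashtag, and the key is lowercased so it stays stable across messages. A simplified, self-contained restatement of that priority (normalize_for_filter is skipped here, and a bare cashtag regex stands in for _extract_tickers):

import re
from typing import Optional

CA_SOL_RE = re.compile(r"\b[1-9A-HJ-NP-Za-km-z]{32,48}\b")
CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b")
TICKER_RE = re.compile(r"\$[A-Za-z0-9]{2,12}")

def extract_entity_key_simplified(text: str) -> Optional[str]:
    m = CA_EVM_RE.search(text) or CA_SOL_RE.search(text)
    if m:                                    # a contract address wins over any cashtag
        addr = m.group(0)
        kind = "evm" if addr.lower().startswith("0x") else "sol"
        return f"ca:{kind}:{addr.lower()}"
    tickers = TICKER_RE.findall(text)
    if tickers:                              # otherwise fall back to the first $ticker
        return f"ticker:{tickers[0][1:].lower()}"
    return None

print(extract_entity_key_simplified("$WIF is sending, CA: 0x" + "AB" * 20))  # ca:evm:0xabab...
print(extract_entity_key_simplified("watching $WIF closely"))                # ticker:wif
print(extract_entity_key_simplified("no token mentioned"))                   # None
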
@@ -578,6 +613,12 @@ async def process_message(msg, source_chat_id: int) -> None:
         debug_log("Dilewati karena EXCLUDE_PHRASES", orig_text)
         return
 
+    # === NEW: entity-based dedup (CA/$ticker) lebih awal ===
+    entity_key = extract_entity_key(orig_text)
+    if entity_key and entity_key in recent_entity_keys:
+        debug_log("Entity-duplicate (CA/TICKER), dilewati", orig_text)
+        return
+
     # Content-only dedup (lintas grup)
     ch = content_only_hash(orig_text)
     if ch in recent_content_hashes:
@@ -600,28 +641,34 @@ async def process_message(msg, source_chat_id: int) -> None:
 
     role = _role_of(source_chat_id)  # 'core' / 'support'
 
-    # Multi-kw -> pilih satu dominan untuk agregasi
+    # Multi-kw -> pilih satu dominan untuk agregasi (fallback jika tak ada entity)
     all_kws = _extract_all_keywords(text_norm)
     main_kw = _choose_dominant_keyword(text_norm, all_kws)
-    if not main_kw:
-        debug_log("Tak ada keyword cocok, dilewati", orig_text)
+
+    # === NEW: topic key = entity_key (CA/$ticker) jika ada, else main_kw ===
+    topic_key = entity_key or main_kw
+    if not topic_key:
+        debug_log("Tak ada keyword/entitas cocok, dilewati", orig_text)
         return
 
-    # Agregasi & kelas
+    # Agregasi & kelas (berdasar topic_key)
     group_key = str(source_chat_id)
     now = datetime.now(timezone.utc)
-    class_label, unique_groups = update_and_classify(main_kw, group_key, now)
+    class_label, unique_groups = update_and_classify(topic_key, group_key, now)
 
     # Gating SUPPORT (CORE-anchored)
     if role != "core":
-        core_u, sup_u = _unique_counts_by_role(main_kw)
+        core_u, sup_u = _unique_counts_by_role(topic_key)
         # Aturan:
-        # - Jika sudah ada minimal 1 sebutan dari CORE untuk keyword ini -> izinkan.
+        # - Jika sudah ada minimal 1 sebutan dari CORE untuk key ini -> izinkan.
         # - Jika belum ada anchor CORE, SUPPORT harus >= SUPPORT_MIN_UNIQUE.
         if core_u >= 1:
             pass
         elif sup_u < SUPPORT_MIN_UNIQUE:
-            debug_log(f"Support ditahan (core_u={core_u}, sup_u={sup_u} < {SUPPORT_MIN_UNIQUE})", orig_text)
+            debug_log(
+                f"Support ditahan (core_u={core_u}, sup_u={sup_u} < {SUPPORT_MIN_UNIQUE})",
+                orig_text,
+            )
             return
 
     # Filter kalimat ajakan (whitelist-aware)
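
Note: the support gate reduces to one rule: a mention from a support group passes once the key already has at least one CORE anchor in the window; with no CORE anchor it needs SUPPORT_MIN_UNIQUE distinct support groups on its own. As a standalone restatement (hypothetical helper, SUPPORT_MIN_UNIQUE value assumed):

SUPPORT_MIN_UNIQUE = 2  # assumed value, only for this sketch

def support_message_passes(core_u: int, sup_u: int) -> bool:
    """core_u / sup_u: distinct core / support groups that mentioned the key in the window."""
    if core_u >= 1:                        # a single CORE mention anchors the topic
        return True
    return sup_u >= SUPPORT_MIN_UNIQUE     # otherwise support groups must corroborate each other

for core_u, sup_u in [(1, 0), (0, 1), (0, 2)]:
    print(core_u, sup_u, support_message_passes(core_u, sup_u))
# 1 0 True   – anchored by core
# 0 1 False  – held back
# 0 2 True   – enough independent support groups
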
@@ -631,15 +678,24 @@ async def process_message(msg, source_chat_id: int) -> None:
             return
 
     # Backfill safety: saat startup, hindari pesan yang terlalu lama
-    cutoff = startup_time_utc - timedelta(minutes=CLASS_WINDOW_MINUTES + BACKFILL_BUFFER_MINUTES)
+    cutoff = startup_time_utc - timedelta(
+        minutes=CLASS_WINDOW_MINUTES + BACKFILL_BUFFER_MINUTES
+    )
     if getattr(msg, "date", None):
         msg_dt = msg.date
         if isinstance(msg_dt, datetime) and msg_dt.replace(tzinfo=timezone.utc) < cutoff:
             debug_log("Lama (lewat cutoff backfill safety), dilewati", orig_text)
             return
 
-    await post_or_update(main_kw, cleaned_body, class_label, msg)
-    debug_log(f"Posted/Edited (role={role}, unique_groups={unique_groups}, kw={main_kw}, tier={class_label})", orig_text)
+    # === NEW: simpan entity-key setelah sukses (untuk dedup) ===
+    if entity_key:
+        recent_entity_keys.append(entity_key)
+
+    await post_or_update(topic_key, cleaned_body, class_label, msg)
+    debug_log(
+        f"Posted/Edited (role={role}, unique_groups={unique_groups}, key={topic_key}, tier={class_label})",
+        orig_text,
+    )
 
 
 async def backfill_history(entity, limit: int) -> None:
@@ -674,6 +730,7 @@ async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
             print(f"Gagal resolve sumber {src}: {e}")
     return resolved
 
+
 async def start_bot_background() -> None:
     await client.start()
     _init_db()
@@ -695,6 +752,7 @@ async def start_bot_background() -> None:
     print("Kurator berjalan (background task). Menunggu pesan baru...")
     asyncio.create_task(client.run_until_disconnected())
 
+
 async def app_main() -> None:
     await client.start()
     _init_db()
 