agus1111 committed on
Commit
5c821c2
·
verified ·
1 Parent(s): 5b6413e

Update botsignal.py

Browse files
Files changed (1) hide show
  1. botsignal.py +84 -22
botsignal.py CHANGED
@@ -49,7 +49,7 @@ SOURCE_CHATS = CORE_CHATS + SUPPORT_CHATS
49
 
50
  TARGET_CHAT = os.environ.get("TARGET_CHAT", "https://t.me/MidasTouchsignalll")
51
 
52
- # Kata kunci topik (biarkan simbol "$" tetap dipakai sesuai kebutuhanmu)
53
  THEME_KEYWORDS = [
54
  "call", "signal", "entry", "buy", "sell", "tp", "sl",
55
  "pump", "spot", "futures", "setup",
@@ -81,6 +81,9 @@ DRY_RUN = os.environ.get("DRY_RUN", "0") == "1"
81
  # Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
82
  BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
83
 
 
 
 
84
 
85
  # ========= Client bootstrap =========
86
  def build_client() -> TelegramClient:
@@ -179,11 +182,35 @@ def _windows(tokens: List[str], size: int = 20):
179
  for i in range(0, len(tokens), size):
180
  yield " ".join(tokens[i:i+size])
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  def score_relevance(text: str, keywords: List[str]) -> float:
183
  """Skor: exact keyword + fuzzy windowed (top-3 rata-rata) agar adil untuk teks panjang."""
184
  if not text:
185
  return 0.0
186
- t = text.lower()
 
 
187
 
188
  # exact hits (unik)
189
  exact_hits = 0
@@ -198,7 +225,6 @@ def score_relevance(text: str, keywords: List[str]) -> float:
198
  return exact_score
199
  scores = []
200
  for w in _windows(tokens, 20):
201
- # untuk setiap window, ambil skor tertinggi atas semua kw
202
  best = 0.0
203
  for kw in keywords:
204
  sc = fuzz.partial_ratio(kw, w) / 100.0
@@ -210,12 +236,6 @@ def score_relevance(text: str, keywords: List[str]) -> float:
210
 
211
  return exact_score + fuzzy_score
212
 
213
- def _strip_urls_and_mentions(s: str) -> str:
214
- s = re.sub(r"https?://\S+", "", s)
215
- s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
216
- s = re.sub(r"@[A-Za-z0-9_]+", "", s)
217
- return re.sub(r"\s+", " ", s).strip()
218
-
219
  def hash_for_dedup(text: str, msg) -> str:
220
  """Hash campuran (lama) – menahan duplikat per pesan+media."""
221
  parts = [text or ""]
@@ -383,7 +403,7 @@ async def _send_initial(msg, text: str) -> int:
383
  return await _send_initial(msg, text)
384
 
385
  async def post_or_update(keyword: str, body: str, new_tier: str, src_msg) -> None:
386
- prefix = f"[{new_tier.upper()}] "
387
  text = prefix + body
388
  prev = last_posted.get(keyword)
389
  if not prev:
@@ -460,17 +480,56 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
460
  await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
461
 
462
 
463
- # ========= Keyword extraction =========
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  def _extract_all_keywords(text_norm: str) -> List[str]:
465
  """
466
- Deteksi SEMUA keyword dari THEME_KEYWORDS (support $ticker).
467
  Tidak menghapus simbol '$' (sesuai permintaan).
468
  """
469
- t = re.sub(r"\$([a-z0-9]+)", r"\1", text_norm, flags=re.I) # toleran untuk pencarian kw
 
 
470
  found = []
471
  for kw in THEME_KEYWORDS:
472
  if re.search(rf"(^|\W){re.escape(kw)}(\W|$)", t, flags=re.I):
473
  found.append(kw.lower())
 
 
 
 
 
474
  # unik dengan urutan muncul pertama
475
  uniq = []
476
  seen = set()
@@ -483,15 +542,15 @@ def _extract_all_keywords(text_norm: str) -> List[str]:
483
  def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
484
  if not kws:
485
  return None
486
- # pilih berdasarkan frekuensi kemunculan + preferensi earliest
487
  score = {}
488
  for kw in kws:
489
  cnt = len(re.findall(rf"(^|\W){re.escape(kw)}(\W|$)", text_norm, flags=re.I))
490
  first = re.search(rf"(^|\W){re.escape(kw)}(\W|$)", text_norm, flags=re.I)
491
  first_idx = first.start() if first else 1_000_000
492
- score[kw] = (cnt, -first_idx)
493
- # sort: freq desc, earliest asc
494
- chosen = sorted(score.items(), key=lambda x: (x[1][0], x[1][1]), reverse=True)[0][0]
495
  return chosen
496
 
497
  def _role_of(chat_id: int) -> str:
@@ -526,7 +585,7 @@ async def process_message(msg, source_chat_id: int) -> None:
526
  return
527
  recent_hashes.append(h)
528
 
529
- # Relevansi
530
  score = score_relevance(text_norm, THEME_KEYWORDS)
531
  debug_log(f"Skor relevansi={score:.2f}", orig_text)
532
  if score < RELEVANCE_THRESHOLD:
@@ -546,10 +605,13 @@ async def process_message(msg, source_chat_id: int) -> None:
546
  now = datetime.now(timezone.utc)
547
  class_label, unique_groups = update_and_classify(main_kw, group_key, now)
548
 
549
- # Gating SUPPORT (tetap aturan semula; tidak memaksa CORE presence)
550
- if role == "support" and unique_groups < SUPPORT_MIN_UNIQUE:
551
- debug_log(f"Support ditahan (unique_groups={unique_groups} < {SUPPORT_MIN_UNIQUE})", orig_text)
552
- return
 
 
 
553
 
554
  # Filter kalimat ajakan (whitelist-aware)
555
  cleaned_body = filter_invite_sentences(orig_text)
 
49
 
50
  TARGET_CHAT = os.environ.get("TARGET_CHAT", "https://t.me/MidasTouchsignalll")
51
 
52
+ # Kata kunci topik + simbol '$' tetap dipakai
53
  THEME_KEYWORDS = [
54
  "call", "signal", "entry", "buy", "sell", "tp", "sl",
55
  "pump", "spot", "futures", "setup",
 
81
  # Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
82
  BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
83
 
84
+ # Opsional: bypass gating support bila keyword dominan adalah $ticker (default ON)
85
+ SUPPORT_TICKER_BYPASS = os.environ.get("SUPPORT_TICKER_BYPASS", "1") == "1"
86
+
87
 
88
  # ========= Client bootstrap =========
89
  def build_client() -> TelegramClient:
 
182
  for i in range(0, len(tokens), size):
183
  yield " ".join(tokens[i:i+size])
184
 
185
+ # --- Tambahan: bersihkan URL/CA untuk kepentingan SCORING relevansi ---
186
+ CA_SOL_RE = re.compile(r"\b[1-9A-HJ-NP-Za-km-z]{32,48}\b") # Solana base58 (perkiraan)
187
+ CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b") # EVM address
188
+ CA_LABEL_RE = re.compile(r"\bCA\s*[:=]\s*\S+", re.IGNORECASE) # "CA: ..." potong tokennya
189
+
190
+ def _strip_urls_and_mentions(s: str) -> str:
191
+ s = re.sub(r"https?://\S+", "", s)
192
+ s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
193
+ s = re.sub(r"@[A-Za-z0-9_]+", "", s)
194
+ return re.sub(r"\s+", " ", s).strip()
195
+
196
+ def strip_contracts_for_scoring(s: str) -> str:
197
+ """
198
+ Hilangkan URL/mention, alamat kontrak, dan token setelah 'CA:'
199
+ agar kata 'pump' pada CA/URL (mis. pump.fun) tidak memengaruhi skor.
200
+ """
201
+ s0 = _strip_urls_and_mentions(s)
202
+ s1 = CA_LABEL_RE.sub(" ", s0)
203
+ s2 = CA_EVM_RE.sub(" ", s1)
204
+ s3 = CA_SOL_RE.sub(" ", s2)
205
+ return re.sub(r"\s+", " ", s3).strip()
206
+
207
  def score_relevance(text: str, keywords: List[str]) -> float:
208
  """Skor: exact keyword + fuzzy windowed (top-3 rata-rata) agar adil untuk teks panjang."""
209
  if not text:
210
  return 0.0
211
+
212
+ # Gunakan versi yang TIDAK mengandung URL/CA agar 'pump' di CA tidak ikut dihitung
213
+ t = strip_contracts_for_scoring(text).lower()
214
 
215
  # exact hits (unik)
216
  exact_hits = 0
 
225
  return exact_score
226
  scores = []
227
  for w in _windows(tokens, 20):
 
228
  best = 0.0
229
  for kw in keywords:
230
  sc = fuzz.partial_ratio(kw, w) / 100.0
 
236
 
237
  return exact_score + fuzzy_score
238
 
 
 
 
 
 
 
239
  def hash_for_dedup(text: str, msg) -> str:
240
  """Hash campuran (lama) – menahan duplikat per pesan+media."""
241
  parts = [text or ""]
 
403
  return await _send_initial(msg, text)
404
 
405
  async def post_or_update(keyword: str, body: str, new_tier: str, src_msg) -> None:
406
+ prefix = f"[{new_tier.UPPER()}] " if hasattr(new_tier, "UPPER") else f"[{new_tier.upper()}] "
407
  text = prefix + body
408
  prev = last_posted.get(keyword)
409
  if not prev:
 
480
  await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
481
 
482
 
483
+ # ========= Keyword extraction ($ticker-aware) =========
484
+ TICKER_CLEAN_RE = re.compile(r"\$[A-Za-z0-9]{2,12}")
485
+ TICKER_NOISY_RE = re.compile(r"\$[A-Za-z0-9](?:[^A-Za-z0-9]+[A-Za-z0-9]){1,11}")
486
+
487
+ def _extract_tickers(text_norm: str) -> List[str]:
488
+ """
489
+ Ambil $TICKER dengan dua cara:
490
+ - Bersih: $ABC, $JBCOIN
491
+ - Noisy: $J*BCOIN -> dinormalisasi jadi $JBCOIN untuk *keyword* saja.
492
+ (Teks asli tetap dikirim apa adanya.)
493
+ """
494
+ found = []
495
+
496
+ # bersih
497
+ for m in TICKER_CLEAN_RE.finditer(text_norm):
498
+ found.append(m.group(0).lower())
499
+
500
+ # noisy -> normalisasi internal
501
+ for m in TICKER_NOISY_RE.finditer(text_norm):
502
+ raw = m.group(0)
503
+ norm = "$" + re.sub(r"[^A-Za-z0-9]+", "", raw[1:])
504
+ if 3 <= len(norm) <= 13: # termasuk '$'
505
+ found.append(norm.lower())
506
+
507
+ # unik & pertahankan urutan
508
+ seen = set()
509
+ uniq = []
510
+ for x in found:
511
+ if x not in seen:
512
+ uniq.append(x)
513
+ seen.add(x)
514
+ return uniq
515
+
516
  def _extract_all_keywords(text_norm: str) -> List[str]:
517
  """
518
+ Deteksi SEMUA keyword dari THEME_KEYWORDS + $ticker.
519
  Tidak menghapus simbol '$' (sesuai permintaan).
520
  """
521
+ # toleran untuk pencarian keyword tema (seperti semula)
522
+ t = re.sub(r"\$([a-z0-9]+)", r"\1", text_norm, flags=re.I)
523
+
524
  found = []
525
  for kw in THEME_KEYWORDS:
526
  if re.search(rf"(^|\W){re.escape(kw)}(\W|$)", t, flags=re.I):
527
  found.append(kw.lower())
528
+
529
+ # gabungkan hasil $ticker
530
+ tickers = _extract_tickers(text_norm)
531
+ found.extend(tickers)
532
+
533
  # unik dengan urutan muncul pertama
534
  uniq = []
535
  seen = set()
 
542
  def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
543
  if not kws:
544
  return None
545
+ # pilih berdasarkan frekuensi kemunculan + preferensi $ticker + posisi paling awal
546
  score = {}
547
  for kw in kws:
548
  cnt = len(re.findall(rf"(^|\W){re.escape(kw)}(\W|$)", text_norm, flags=re.I))
549
  first = re.search(rf"(^|\W){re.escape(kw)}(\W|$)", text_norm, flags=re.I)
550
  first_idx = first.start() if first else 1_000_000
551
+ bonus = 1 if kw.startswith("$") else 0 # prefer $ticker saat imbang
552
+ score[kw] = (cnt, bonus, -first_idx)
553
+ chosen = sorted(score.items(), key=lambda x: (x[1][0], x[1][1], x[1][2]), reverse=True)[0][0]
554
  return chosen
555
 
556
  def _role_of(chat_id: int) -> str:
 
585
  return
586
  recent_hashes.append(h)
587
 
588
+ # Relevansi (pakai teks yang CA/URL-nya dinetralkan)
589
  score = score_relevance(text_norm, THEME_KEYWORDS)
590
  debug_log(f"Skor relevansi={score:.2f}", orig_text)
591
  if score < RELEVANCE_THRESHOLD:
 
605
  now = datetime.now(timezone.utc)
606
  class_label, unique_groups = update_and_classify(main_kw, group_key, now)
607
 
608
+ # Gating SUPPORT: izinkan $ticker bila SUPPORT_TICKER_BYPASS aktif
609
+ if role == "support":
610
+ if main_kw.startswith("$") and SUPPORT_TICKER_BYPASS:
611
+ pass
612
+ elif unique_groups < SUPPORT_MIN_UNIQUE:
613
+ debug_log(f"Support ditahan (unique_groups={unique_groups} < {SUPPORT_MIN_UNIQUE})", orig_text)
614
+ return
615
 
616
  # Filter kalimat ajakan (whitelist-aware)
617
  cleaned_body = filter_invite_sentences(orig_text)