Spaces:
Sleeping
Sleeping
Update botsignal.py
Browse files- botsignal.py +84 -22
botsignal.py
CHANGED
|
@@ -49,7 +49,7 @@ SOURCE_CHATS = CORE_CHATS + SUPPORT_CHATS
|
|
| 49 |
|
| 50 |
TARGET_CHAT = os.environ.get("TARGET_CHAT", "https://t.me/MidasTouchsignalll")
|
| 51 |
|
| 52 |
-
# Kata kunci topik
|
| 53 |
THEME_KEYWORDS = [
|
| 54 |
"call", "signal", "entry", "buy", "sell", "tp", "sl",
|
| 55 |
"pump", "spot", "futures", "setup",
|
|
@@ -81,6 +81,9 @@ DRY_RUN = os.environ.get("DRY_RUN", "0") == "1"
|
|
| 81 |
# Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
|
| 82 |
BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
|
| 83 |
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
# ========= Client bootstrap =========
|
| 86 |
def build_client() -> TelegramClient:
|
|
@@ -179,11 +182,35 @@ def _windows(tokens: List[str], size: int = 20):
|
|
| 179 |
for i in range(0, len(tokens), size):
|
| 180 |
yield " ".join(tokens[i:i+size])
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
def score_relevance(text: str, keywords: List[str]) -> float:
|
| 183 |
"""Skor: exact keyword + fuzzy windowed (top-3 rata-rata) agar adil untuk teks panjang."""
|
| 184 |
if not text:
|
| 185 |
return 0.0
|
| 186 |
-
|
|
|
|
|
|
|
| 187 |
|
| 188 |
# exact hits (unik)
|
| 189 |
exact_hits = 0
|
|
@@ -198,7 +225,6 @@ def score_relevance(text: str, keywords: List[str]) -> float:
|
|
| 198 |
return exact_score
|
| 199 |
scores = []
|
| 200 |
for w in _windows(tokens, 20):
|
| 201 |
-
# untuk setiap window, ambil skor tertinggi atas semua kw
|
| 202 |
best = 0.0
|
| 203 |
for kw in keywords:
|
| 204 |
sc = fuzz.partial_ratio(kw, w) / 100.0
|
|
@@ -210,12 +236,6 @@ def score_relevance(text: str, keywords: List[str]) -> float:
|
|
| 210 |
|
| 211 |
return exact_score + fuzzy_score
|
| 212 |
|
| 213 |
-
def _strip_urls_and_mentions(s: str) -> str:
|
| 214 |
-
s = re.sub(r"https?://\S+", "", s)
|
| 215 |
-
s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
|
| 216 |
-
s = re.sub(r"@[A-Za-z0-9_]+", "", s)
|
| 217 |
-
return re.sub(r"\s+", " ", s).strip()
|
| 218 |
-
|
| 219 |
def hash_for_dedup(text: str, msg) -> str:
|
| 220 |
"""Hash campuran (lama) – menahan duplikat per pesan+media."""
|
| 221 |
parts = [text or ""]
|
|
@@ -383,7 +403,7 @@ async def _send_initial(msg, text: str) -> int:
|
|
| 383 |
return await _send_initial(msg, text)
|
| 384 |
|
| 385 |
async def post_or_update(keyword: str, body: str, new_tier: str, src_msg) -> None:
|
| 386 |
-
prefix = f"[{new_tier.upper()}] "
|
| 387 |
text = prefix + body
|
| 388 |
prev = last_posted.get(keyword)
|
| 389 |
if not prev:
|
|
@@ -460,17 +480,56 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
|
|
| 460 |
await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
|
| 461 |
|
| 462 |
|
| 463 |
-
# ========= Keyword extraction =========
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
def _extract_all_keywords(text_norm: str) -> List[str]:
|
| 465 |
"""
|
| 466 |
-
Deteksi SEMUA keyword dari THEME_KEYWORDS
|
| 467 |
Tidak menghapus simbol '$' (sesuai permintaan).
|
| 468 |
"""
|
| 469 |
-
|
|
|
|
|
|
|
| 470 |
found = []
|
| 471 |
for kw in THEME_KEYWORDS:
|
| 472 |
if re.search(rf"(^|\W){re.escape(kw)}(\W|$)", t, flags=re.I):
|
| 473 |
found.append(kw.lower())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
# unik dengan urutan muncul pertama
|
| 475 |
uniq = []
|
| 476 |
seen = set()
|
|
@@ -483,15 +542,15 @@ def _extract_all_keywords(text_norm: str) -> List[str]:
|
|
| 483 |
def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
|
| 484 |
if not kws:
|
| 485 |
return None
|
| 486 |
-
# pilih berdasarkan frekuensi kemunculan + preferensi
|
| 487 |
score = {}
|
| 488 |
for kw in kws:
|
| 489 |
cnt = len(re.findall(rf"(^|\W){re.escape(kw)}(\W|$)", text_norm, flags=re.I))
|
| 490 |
first = re.search(rf"(^|\W){re.escape(kw)}(\W|$)", text_norm, flags=re.I)
|
| 491 |
first_idx = first.start() if first else 1_000_000
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
chosen = sorted(score.items(), key=lambda x: (x[1][0], x[1][1]), reverse=True)[0][0]
|
| 495 |
return chosen
|
| 496 |
|
| 497 |
def _role_of(chat_id: int) -> str:
|
|
@@ -526,7 +585,7 @@ async def process_message(msg, source_chat_id: int) -> None:
|
|
| 526 |
return
|
| 527 |
recent_hashes.append(h)
|
| 528 |
|
| 529 |
-
# Relevansi
|
| 530 |
score = score_relevance(text_norm, THEME_KEYWORDS)
|
| 531 |
debug_log(f"Skor relevansi={score:.2f}", orig_text)
|
| 532 |
if score < RELEVANCE_THRESHOLD:
|
|
@@ -546,10 +605,13 @@ async def process_message(msg, source_chat_id: int) -> None:
|
|
| 546 |
now = datetime.now(timezone.utc)
|
| 547 |
class_label, unique_groups = update_and_classify(main_kw, group_key, now)
|
| 548 |
|
| 549 |
-
# Gating SUPPORT
|
| 550 |
-
if role == "support"
|
| 551 |
-
|
| 552 |
-
|
|
|
|
|
|
|
|
|
|
| 553 |
|
| 554 |
# Filter kalimat ajakan (whitelist-aware)
|
| 555 |
cleaned_body = filter_invite_sentences(orig_text)
|
|
|
|
| 49 |
|
| 50 |
TARGET_CHAT = os.environ.get("TARGET_CHAT", "https://t.me/MidasTouchsignalll")
|
| 51 |
|
| 52 |
+
# Kata kunci topik + simbol '$' tetap dipakai
|
| 53 |
THEME_KEYWORDS = [
|
| 54 |
"call", "signal", "entry", "buy", "sell", "tp", "sl",
|
| 55 |
"pump", "spot", "futures", "setup",
|
|
|
|
| 81 |
# Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
|
| 82 |
BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
|
| 83 |
|
| 84 |
+
# Opsional: bypass gating support bila keyword dominan adalah $ticker (default ON)
|
| 85 |
+
SUPPORT_TICKER_BYPASS = os.environ.get("SUPPORT_TICKER_BYPASS", "1") == "1"
|
| 86 |
+
|
| 87 |
|
| 88 |
# ========= Client bootstrap =========
|
| 89 |
def build_client() -> TelegramClient:
|
|
|
|
| 182 |
for i in range(0, len(tokens), size):
|
| 183 |
yield " ".join(tokens[i:i+size])
|
| 184 |
|
| 185 |
+
# --- Addition: strip URLs / contract addresses (CA) for relevance SCORING ---
# Solana address in base58 (approximate): 32-48 characters.
CA_SOL_RE = re.compile(r"\b[1-9A-HJ-NP-Za-km-z]{32,48}\b")
# EVM address: "0x" followed by 40 hex digits.
CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b")
# "CA: ..." / "CA=..." label; the token following the label is cut as well.
CA_LABEL_RE = re.compile(r"\bCA\s*[:=]\s*\S+", flags=re.I)
|
| 189 |
+
|
| 190 |
+
def _strip_urls_and_mentions(s: str) -> str:
|
| 191 |
+
s = re.sub(r"https?://\S+", "", s)
|
| 192 |
+
s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
|
| 193 |
+
s = re.sub(r"@[A-Za-z0-9_]+", "", s)
|
| 194 |
+
return re.sub(r"\s+", " ", s).strip()
|
| 195 |
+
|
| 196 |
+
def strip_contracts_for_scoring(s: str) -> str:
    """Return *s* with link and contract-address noise removed for scoring.

    URLs/mentions, the token following a "CA:" label, EVM addresses and
    Solana-style base58 addresses are removed so that a keyword such as
    'pump' appearing only inside a contract address or URL (e.g. pump.fun)
    does not influence the relevance score. Whitespace is collapsed at the
    end.
    """
    cleaned = _strip_urls_and_mentions(s)
    # Applied in this order: labelled CA token, EVM address, Solana address.
    for pattern in (CA_LABEL_RE, CA_EVM_RE, CA_SOL_RE):
        cleaned = pattern.sub(" ", cleaned)
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()
|
| 206 |
+
|
| 207 |
def score_relevance(text: str, keywords: List[str]) -> float:
|
| 208 |
"""Skor: exact keyword + fuzzy windowed (top-3 rata-rata) agar adil untuk teks panjang."""
|
| 209 |
if not text:
|
| 210 |
return 0.0
|
| 211 |
+
|
| 212 |
+
# Gunakan versi yang TIDAK mengandung URL/CA agar 'pump' di CA tidak ikut dihitung
|
| 213 |
+
t = strip_contracts_for_scoring(text).lower()
|
| 214 |
|
| 215 |
# exact hits (unik)
|
| 216 |
exact_hits = 0
|
|
|
|
| 225 |
return exact_score
|
| 226 |
scores = []
|
| 227 |
for w in _windows(tokens, 20):
|
|
|
|
| 228 |
best = 0.0
|
| 229 |
for kw in keywords:
|
| 230 |
sc = fuzz.partial_ratio(kw, w) / 100.0
|
|
|
|
| 236 |
|
| 237 |
return exact_score + fuzzy_score
|
| 238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
def hash_for_dedup(text: str, msg) -> str:
|
| 240 |
"""Hash campuran (lama) – menahan duplikat per pesan+media."""
|
| 241 |
parts = [text or ""]
|
|
|
|
| 403 |
return await _send_initial(msg, text)
|
| 404 |
|
| 405 |
async def post_or_update(keyword: str, body: str, new_tier: str, src_msg) -> None:
|
| 406 |
+
prefix = f"[{new_tier.UPPER()}] " if hasattr(new_tier, "UPPER") else f"[{new_tier.upper()}] "
|
| 407 |
text = prefix + body
|
| 408 |
prev = last_posted.get(keyword)
|
| 409 |
if not prev:
|
|
|
|
| 480 |
await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
|
| 481 |
|
| 482 |
|
| 483 |
+
# ========= Keyword extraction ($ticker-aware) =========
|
| 484 |
+
TICKER_CLEAN_RE = re.compile(r"\$[A-Za-z0-9]{2,12}")
|
| 485 |
+
TICKER_NOISY_RE = re.compile(r"\$[A-Za-z0-9](?:[^A-Za-z0-9]+[A-Za-z0-9]){1,11}")
|
| 486 |
+
|
| 487 |
+
def _extract_tickers(text_norm: str) -> List[str]:
|
| 488 |
+
"""
|
| 489 |
+
Ambil $TICKER dengan dua cara:
|
| 490 |
+
- Bersih: $ABC, $JBCOIN
|
| 491 |
+
- Noisy: $J*BCOIN -> dinormalisasi jadi $JBCOIN untuk *keyword* saja.
|
| 492 |
+
(Teks asli tetap dikirim apa adanya.)
|
| 493 |
+
"""
|
| 494 |
+
found = []
|
| 495 |
+
|
| 496 |
+
# bersih
|
| 497 |
+
for m in TICKER_CLEAN_RE.finditer(text_norm):
|
| 498 |
+
found.append(m.group(0).lower())
|
| 499 |
+
|
| 500 |
+
# noisy -> normalisasi internal
|
| 501 |
+
for m in TICKER_NOISY_RE.finditer(text_norm):
|
| 502 |
+
raw = m.group(0)
|
| 503 |
+
norm = "$" + re.sub(r"[^A-Za-z0-9]+", "", raw[1:])
|
| 504 |
+
if 3 <= len(norm) <= 13: # termasuk '$'
|
| 505 |
+
found.append(norm.lower())
|
| 506 |
+
|
| 507 |
+
# unik & pertahankan urutan
|
| 508 |
+
seen = set()
|
| 509 |
+
uniq = []
|
| 510 |
+
for x in found:
|
| 511 |
+
if x not in seen:
|
| 512 |
+
uniq.append(x)
|
| 513 |
+
seen.add(x)
|
| 514 |
+
return uniq
|
| 515 |
+
|
| 516 |
def _extract_all_keywords(text_norm: str) -> List[str]:
|
| 517 |
"""
|
| 518 |
+
Deteksi SEMUA keyword dari THEME_KEYWORDS + $ticker.
|
| 519 |
Tidak menghapus simbol '$' (sesuai permintaan).
|
| 520 |
"""
|
| 521 |
+
# toleran untuk pencarian keyword tema (seperti semula)
|
| 522 |
+
t = re.sub(r"\$([a-z0-9]+)", r"\1", text_norm, flags=re.I)
|
| 523 |
+
|
| 524 |
found = []
|
| 525 |
for kw in THEME_KEYWORDS:
|
| 526 |
if re.search(rf"(^|\W){re.escape(kw)}(\W|$)", t, flags=re.I):
|
| 527 |
found.append(kw.lower())
|
| 528 |
+
|
| 529 |
+
# gabungkan hasil $ticker
|
| 530 |
+
tickers = _extract_tickers(text_norm)
|
| 531 |
+
found.extend(tickers)
|
| 532 |
+
|
| 533 |
# unik dengan urutan muncul pertama
|
| 534 |
uniq = []
|
| 535 |
seen = set()
|
|
|
|
| 542 |
def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
|
| 543 |
if not kws:
|
| 544 |
return None
|
| 545 |
+
# pilih berdasarkan frekuensi kemunculan + preferensi $ticker + posisi paling awal
|
| 546 |
score = {}
|
| 547 |
for kw in kws:
|
| 548 |
cnt = len(re.findall(rf"(^|\W){re.escape(kw)}(\W|$)", text_norm, flags=re.I))
|
| 549 |
first = re.search(rf"(^|\W){re.escape(kw)}(\W|$)", text_norm, flags=re.I)
|
| 550 |
first_idx = first.start() if first else 1_000_000
|
| 551 |
+
bonus = 1 if kw.startswith("$") else 0 # prefer $ticker saat imbang
|
| 552 |
+
score[kw] = (cnt, bonus, -first_idx)
|
| 553 |
+
chosen = sorted(score.items(), key=lambda x: (x[1][0], x[1][1], x[1][2]), reverse=True)[0][0]
|
| 554 |
return chosen
|
| 555 |
|
| 556 |
def _role_of(chat_id: int) -> str:
|
|
|
|
| 585 |
return
|
| 586 |
recent_hashes.append(h)
|
| 587 |
|
| 588 |
+
# Relevansi (pakai teks yang CA/URL-nya dinetralkan)
|
| 589 |
score = score_relevance(text_norm, THEME_KEYWORDS)
|
| 590 |
debug_log(f"Skor relevansi={score:.2f}", orig_text)
|
| 591 |
if score < RELEVANCE_THRESHOLD:
|
|
|
|
| 605 |
now = datetime.now(timezone.utc)
|
| 606 |
class_label, unique_groups = update_and_classify(main_kw, group_key, now)
|
| 607 |
|
| 608 |
+
# Gating SUPPORT: izinkan $ticker bila SUPPORT_TICKER_BYPASS aktif
|
| 609 |
+
if role == "support":
|
| 610 |
+
if main_kw.startswith("$") and SUPPORT_TICKER_BYPASS:
|
| 611 |
+
pass
|
| 612 |
+
elif unique_groups < SUPPORT_MIN_UNIQUE:
|
| 613 |
+
debug_log(f"Support ditahan (unique_groups={unique_groups} < {SUPPORT_MIN_UNIQUE})", orig_text)
|
| 614 |
+
return
|
| 615 |
|
| 616 |
# Filter kalimat ajakan (whitelist-aware)
|
| 617 |
cleaned_body = filter_invite_sentences(orig_text)
|