Spaces:
Running
Running
Update botsignal.py
Browse files- botsignal.py +376 -210
botsignal.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
# botsignal.py (patched)
|
| 2 |
import asyncio
|
| 3 |
import os
|
| 4 |
import re
|
|
@@ -15,40 +14,42 @@ from telethon import TelegramClient, events
|
|
| 15 |
from telethon.sessions import StringSession, MemorySession
|
| 16 |
from telethon.errors.rpcerrorlist import FloodWaitError
|
| 17 |
|
|
|
|
|
|
|
| 18 |
API_ID = int(os.environ.get("API_ID", "0"))
|
| 19 |
API_HASH = os.environ.get("API_HASH", "")
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
#
|
| 23 |
-
CORE_CHATS = [
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
SOURCE_CHATS = CORE_CHATS + SUPPORT_CHATS
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
# ====== Tokenisasi / relevansi ======
|
| 31 |
-
def _tokenize_words(s: str) -> List[str]:
|
| 32 |
-
return re.findall(r"[a-zA-Z0-9\$\#]{2,}", s or "")
|
| 33 |
-
|
| 34 |
-
def normalize_for_filter(s: str) -> str:
|
| 35 |
-
if not s:
|
| 36 |
-
return ""
|
| 37 |
-
t = s
|
| 38 |
-
# netralkan alamat kontrak (sol/eth panjang)
|
| 39 |
-
t = re.sub(r"\b[1-9A-HJ-NP-Za-km-z]{25,}\b", "CA", t)
|
| 40 |
-
# hapus url/mention
|
| 41 |
-
t = _strip_urls_and_mentions(t)
|
| 42 |
-
return t
|
| 43 |
-
|
| 44 |
-
URL_REGEX = re.compile(r"(https?:\/\/\S+)", re.IGNORECASE)
|
| 45 |
-
MENTION_REGEX = re.compile(r"@\w+", re.IGNORECASE)
|
| 46 |
-
|
| 47 |
-
def _strip_urls_and_mentions(s: str) -> str:
|
| 48 |
-
s = URL_REGEX.sub("", s)
|
| 49 |
-
s = MENTION_REGEX.sub("", s)
|
| 50 |
-
return s
|
| 51 |
|
|
|
|
| 52 |
THEME_KEYWORDS = [
|
| 53 |
"call", "signal", "entry", "buy", "sell", "tp", "sl",
|
| 54 |
"pump", "spot", "futures", "setup",
|
|
@@ -72,7 +73,6 @@ DEDUP_BUFFER_SIZE = int(os.environ.get("DEDUP_BUFFER_SIZE", "800"))
|
|
| 72 |
|
| 73 |
CLASS_WINDOW_MINUTES = int(os.environ.get("CLASS_WINDOW_MINUTES", "10"))
|
| 74 |
|
| 75 |
-
# Dulu dipakai untuk bypass support-only; sekarang hanya aktif bila ALLOW_SUPPORT_SOLO=1
|
| 76 |
SUPPORT_MIN_UNIQUE = int(os.environ.get("SUPPORT_MIN_UNIQUE", "2"))
|
| 77 |
|
| 78 |
# New: DRY RUN (tidak kirim apa pun ke TARGET_CHAT)
|
|
@@ -81,34 +81,30 @@ DRY_RUN = os.environ.get("DRY_RUN", "0") == "1"
|
|
| 81 |
# Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
|
| 82 |
BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
|
| 83 |
|
| 84 |
-
# >>> Tambahan ENV untuk mode support-only (default OFF, strict core-anchored) <<<
|
| 85 |
-
ALLOW_SUPPORT_SOLO = os.environ.get("ALLOW_SUPPORT_SOLO", "0") == "1"
|
| 86 |
-
SUPPORT_SOLO_MIN_UNIQUE = int(os.environ.get("SUPPORT_SOLO_MIN_UNIQUE", "99"))
|
| 87 |
|
| 88 |
-
#
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
| 93 |
|
| 94 |
-
client =
|
|
|
|
|
|
|
| 95 |
|
| 96 |
-
#
|
|
|
|
| 97 |
startup_time_utc = datetime.now(timezone.utc)
|
| 98 |
|
| 99 |
-
recent_hashes = deque(maxlen=DEDUP_BUFFER_SIZE)
|
| 100 |
-
recent_content_hashes = set()
|
| 101 |
|
| 102 |
-
#
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
# agregasi keyword -> group_id -> last_seen_utc
|
| 106 |
-
keyword_group_last_seen: defaultdict[str, dict[str, datetime]] = defaultdict(dict)
|
| 107 |
|
| 108 |
def _db():
|
| 109 |
conn = sqlite3.connect(DB_PATH)
|
| 110 |
conn.execute("PRAGMA journal_mode=WAL;")
|
| 111 |
-
conn.execute("PRAGMA synchronous=NORMAL;")
|
| 112 |
return conn
|
| 113 |
|
| 114 |
def _init_db():
|
|
@@ -141,29 +137,83 @@ def db_load_state():
|
|
| 141 |
conn.close()
|
| 142 |
return last, kw_map
|
| 143 |
|
| 144 |
-
def
|
| 145 |
conn = _db()
|
| 146 |
-
conn.execute(
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
)
|
| 150 |
conn.commit()
|
| 151 |
conn.close()
|
| 152 |
|
| 153 |
-
def
|
| 154 |
conn = _db()
|
| 155 |
-
conn.execute(
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
)
|
| 159 |
conn.commit()
|
| 160 |
conn.close()
|
| 161 |
|
| 162 |
-
def
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
exact_score = exact_hits * KEYWORD_WEIGHT
|
| 168 |
|
| 169 |
# fuzzy windowed: ambil top-3 skor di antara jendela 20 token
|
|
@@ -183,9 +233,6 @@ def score_relevance(t: str, keywords: List[str]) -> float:
|
|
| 183 |
|
| 184 |
return exact_score + fuzzy_score
|
| 185 |
|
| 186 |
-
def _strip_urls_mentions_only(s: str) -> str:
|
| 187 |
-
return _strip_urls_and_mentions(s)
|
| 188 |
-
|
| 189 |
def hash_for_dedup(text: str, msg) -> str:
|
| 190 |
"""Hash campuran (lama) – menahan duplikat per pesan+media."""
|
| 191 |
parts = [text or ""]
|
|
@@ -195,53 +242,91 @@ def hash_for_dedup(text: str, msg) -> str:
|
|
| 195 |
if doc and getattr(doc, "id", None) is not None:
|
| 196 |
parts.append(f"doc:{doc.id}")
|
| 197 |
if getattr(msg, "photo", None) is not None:
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
def content_only_hash(text: str) -> str:
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
t = _strip_urls_and_mentions(t)
|
| 208 |
-
return hashlib.sha256(t.encode("utf-8")).hexdigest()
|
| 209 |
-
|
| 210 |
-
def _windows(tokens: List[str], k: int):
|
| 211 |
-
for i in range(0, len(tokens), max(1, k // 2)):
|
| 212 |
-
yield " ".join(tokens[i:i + k])
|
| 213 |
-
|
| 214 |
-
# ====== Media helpers ======
|
| 215 |
def is_image_message(msg) -> bool:
|
| 216 |
if getattr(msg, "photo", None) is not None:
|
| 217 |
return True
|
| 218 |
doc = getattr(msg, "document", None)
|
| 219 |
-
if
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
def media_too_big(msg) -> bool:
|
| 229 |
doc = getattr(msg, "document", None)
|
| 230 |
-
if
|
| 231 |
-
return
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
-
|
|
|
|
| 238 |
INVITE_PATTERNS = [
|
| 239 |
-
r"\bjoin\b", r"\
|
| 240 |
-
r"\
|
| 241 |
-
r"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
r"\bpromo\b", r"\bpromosi\b", r"\biklan\b",
|
| 243 |
-
r"\badvert\b", r"\badvertise\b", r"\badvertisement\b"
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
]
|
| 246 |
INVITE_REGEXES = [re.compile(p, re.IGNORECASE) for p in INVITE_PATTERNS]
|
| 247 |
|
|
@@ -257,7 +342,7 @@ def _is_invite_sentence(s: str) -> bool:
|
|
| 257 |
t = s.strip()
|
| 258 |
if not t:
|
| 259 |
return False
|
| 260 |
-
#
|
| 261 |
if any(r.search(t) for r in WHITELIST_REGEXES):
|
| 262 |
return False
|
| 263 |
# Jika ada 1+ pola ajakan, buang
|
|
@@ -272,35 +357,77 @@ def filter_invite_sentences(text: str) -> str:
|
|
| 272 |
cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
|
| 273 |
return cleaned
|
| 274 |
|
| 275 |
-
# ====== Posting ======
|
| 276 |
-
async def post_or_update(keyword: str, body: str, class_label: str, msg):
|
| 277 |
-
"""
|
| 278 |
-
Post baru jika belum ada, jika sudah ada posting untuk keyword tsb → edit.
|
| 279 |
-
"""
|
| 280 |
-
# muat last posted dari DB
|
| 281 |
-
last_posted, _ = db_load_state()
|
| 282 |
-
last = last_posted.get(keyword)
|
| 283 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
if DRY_RUN:
|
| 285 |
-
print(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
return
|
| 287 |
|
| 288 |
-
if
|
| 289 |
-
# post baru
|
| 290 |
-
sent = await client.send_message(TARGET_CHAT, body)
|
| 291 |
-
db_save_last_posted(keyword, sent.id, class_label)
|
| 292 |
-
return
|
| 293 |
-
else:
|
| 294 |
-
# edit posting lama
|
| 295 |
-
last_msg_id = last["msg_id"]
|
| 296 |
try:
|
| 297 |
-
await client.edit_message(TARGET_CHAT,
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
except Exception as e:
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
|
| 305 |
# ========= Core actions (fallback kept) =========
|
| 306 |
async def send_as_is(msg, text_override: Optional[str] = None) -> None:
|
|
@@ -328,32 +455,78 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
|
|
| 328 |
ext = ".jpg"
|
| 329 |
mt = (getattr(doc, "mime_type", "") or "").lower()
|
| 330 |
if mt:
|
| 331 |
-
|
| 332 |
-
if
|
| 333 |
-
|
|
|
|
| 334 |
bio.name = f"media{ext}"
|
| 335 |
await client.send_file(TARGET_CHAT, bio, caption=orig_text, caption_entities=entities, force_document=False)
|
| 336 |
return
|
| 337 |
-
except FloodWaitError as
|
| 338 |
-
|
| 339 |
except Exception as e:
|
| 340 |
-
|
| 341 |
|
| 342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
|
| 344 |
-
# ====== Keyword extraction / aggregation ======
|
| 345 |
-
TICKER_RE = re.compile(r"\$[a-z0-9]{2,10}", re.IGNORECASE)
|
| 346 |
-
COINWORDS = ["btc","eth","sol","bnb","pepe","doge","meme","spot","futures","pump","entry","buy","sell","tp","sl","setup"]
|
| 347 |
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
found = []
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
seen = set()
|
| 356 |
uniq = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
for kw in found:
|
| 358 |
if kw not in seen:
|
| 359 |
uniq.append(kw)
|
|
@@ -375,8 +548,7 @@ def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
|
|
| 375 |
return chosen
|
| 376 |
|
| 377 |
def _role_of(chat_id: int) -> str:
|
| 378 |
-
#
|
| 379 |
-
# BARU: default ke "support" (atau treat unknown as support)
|
| 380 |
return chat_roles.get(chat_id, "support")
|
| 381 |
|
| 382 |
def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
|
|
@@ -387,38 +559,12 @@ def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
|
|
| 387 |
bucket = keyword_group_last_seen.get(keyword, {})
|
| 388 |
core_ids, sup_ids = set(), set()
|
| 389 |
for gk in bucket.keys():
|
| 390 |
-
role = chat_roles.get(int(gk), "support")
|
| 391 |
(core_ids if role == "core" else sup_ids).add(gk)
|
| 392 |
return len(core_ids), len(sup_ids)
|
| 393 |
|
| 394 |
-
def update_and_classify(keyword: str, group_key: str, ts: datetime) -> Tuple[str, int]:
|
| 395 |
-
"""
|
| 396 |
-
Simpan last_seen dan kembalikan label kelas (HOT/SEDANG/BARU) + jumlah grup unik.
|
| 397 |
-
"""
|
| 398 |
-
# purge window kadaluarsa
|
| 399 |
-
cutoff = ts - timedelta(minutes=CLASS_WINDOW_MINUTES)
|
| 400 |
-
bucket = keyword_group_last_seen[keyword]
|
| 401 |
-
expired = [g for g, t in bucket.items() if t < cutoff]
|
| 402 |
-
for g in expired:
|
| 403 |
-
del bucket[g]
|
| 404 |
-
|
| 405 |
-
# update
|
| 406 |
-
bucket[group_key] = ts
|
| 407 |
-
db_save_group_seen(keyword, group_key, ts)
|
| 408 |
-
|
| 409 |
-
# hitung grup unik
|
| 410 |
-
unique_groups = len(bucket)
|
| 411 |
|
| 412 |
-
|
| 413 |
-
if unique_groups >= 4:
|
| 414 |
-
label = "🔥 HOT"
|
| 415 |
-
elif unique_groups >= 2:
|
| 416 |
-
label = "☀️ SEDANG"
|
| 417 |
-
else:
|
| 418 |
-
label = "🌱 BARU"
|
| 419 |
-
return label, unique_groups
|
| 420 |
-
|
| 421 |
-
def process_message(msg, source_chat_id: int) -> None:
|
| 422 |
"""
|
| 423 |
Filter, content-dedup, relevansi, multi-kw -> pilih dominan,
|
| 424 |
agregasi tier, gating support (CORE-anchored), filter ajakan, dan POST/EDIT.
|
|
@@ -437,16 +583,12 @@ def process_message(msg, source_chat_id: int) -> None:
|
|
| 437 |
if ch in recent_content_hashes:
|
| 438 |
debug_log("Content-duplicate (global), dilewati", orig_text)
|
| 439 |
return
|
| 440 |
-
recent_content_hashes.
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
recent_content_hashes.add(ch)
|
| 445 |
-
|
| 446 |
-
# Hash per pesan (teks+media) untuk guard cepat
|
| 447 |
-
h = hash_for_dedup(orig_text, msg)
|
| 448 |
if h in recent_hashes:
|
| 449 |
-
debug_log("
|
| 450 |
return
|
| 451 |
recent_hashes.append(h)
|
| 452 |
|
|
@@ -470,19 +612,22 @@ def process_message(msg, source_chat_id: int) -> None:
|
|
| 470 |
now = datetime.now(timezone.utc)
|
| 471 |
class_label, unique_groups = update_and_classify(main_kw, group_key, now)
|
| 472 |
|
| 473 |
-
# Gating SUPPORT (
|
| 474 |
-
if role
|
| 475 |
core_u, sup_u = _unique_counts_by_role(main_kw)
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
|
|
|
|
|
|
|
|
|
| 481 |
|
| 482 |
-
# Filter ajakan
|
| 483 |
-
cleaned_body = filter_invite_sentences(orig_text)
|
| 484 |
-
if not cleaned_body:
|
| 485 |
-
debug_log("
|
| 486 |
return
|
| 487 |
|
| 488 |
# Backfill safety: saat startup, hindari pesan yang terlalu lama
|
|
@@ -493,21 +638,9 @@ def process_message(msg, source_chat_id: int) -> None:
|
|
| 493 |
debug_log("Lama (lewat cutoff backfill safety), dilewati", orig_text)
|
| 494 |
return
|
| 495 |
|
| 496 |
-
|
| 497 |
debug_log(f"Posted/Edited (role={role}, unique_groups={unique_groups}, kw={main_kw}, tier={class_label})", orig_text)
|
| 498 |
|
| 499 |
-
# ========= Event handlers =========
|
| 500 |
-
@client.on(events.NewMessage(chats=SOURCE_CHATS))
|
| 501 |
-
async def on_new_message(event):
|
| 502 |
-
try:
|
| 503 |
-
await process_message(event.message, source_chat_id=event.chat_id)
|
| 504 |
-
except Exception as e:
|
| 505 |
-
print("Process error:", e)
|
| 506 |
-
|
| 507 |
-
# ========= Utilities =========
|
| 508 |
-
def debug_log(tag: str, body: str):
|
| 509 |
-
ts = datetime.now().strftime("%H:%M:%S")
|
| 510 |
-
print(f"[{ts}] {tag}: {body[:200].replace(chr(10),' / ')}")
|
| 511 |
|
| 512 |
async def backfill_history(entity, limit: int) -> None:
|
| 513 |
if limit <= 0:
|
|
@@ -519,6 +652,17 @@ async def backfill_history(entity, limit: int) -> None:
|
|
| 519 |
except Exception as e:
|
| 520 |
debug_log("Error saat memproses backfill", str(e))
|
| 521 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
|
| 523 |
resolved = []
|
| 524 |
for src in raw_list:
|
|
@@ -527,16 +671,37 @@ async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
|
|
| 527 |
resolved.append(ent)
|
| 528 |
chat_roles[int(ent.id)] = role_label
|
| 529 |
except Exception as e:
|
| 530 |
-
print(f"Gagal resolve {src}: {e}")
|
| 531 |
return resolved
|
| 532 |
|
| 533 |
-
|
| 534 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
await client.start()
|
| 536 |
_init_db()
|
| 537 |
-
_ = db_load_state() # pre-load caches
|
| 538 |
|
| 539 |
-
|
|
|
|
|
|
|
| 540 |
resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
|
| 541 |
resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
|
| 542 |
resolved_sources = resolved_core + resolved_support
|
|
@@ -547,5 +712,6 @@ async def app_main():
|
|
| 547 |
print("Kurator berjalan. Menunggu pesan baru... (Stop dengan interrupt).")
|
| 548 |
await client.run_until_disconnected()
|
| 549 |
|
|
|
|
| 550 |
if __name__ == "__main__":
|
| 551 |
asyncio.run(app_main())
|
|
|
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import os
|
| 3 |
import re
|
|
|
|
| 14 |
from telethon.sessions import StringSession, MemorySession
|
| 15 |
from telethon.errors.rpcerrorlist import FloodWaitError
|
| 16 |
|
| 17 |
+
|
| 18 |
+
# ========= Configuration via Environment =========
|
| 19 |
API_ID = int(os.environ.get("API_ID", "0"))
|
| 20 |
API_HASH = os.environ.get("API_HASH", "")
|
| 21 |
+
STRING_SESSION = os.environ.get("STRING_SESSION", "")
|
| 22 |
+
|
| 23 |
+
# --- Definisikan sumber sebagai CORE vs SUPPORT (pakai data milikmu) ---
|
| 24 |
+
CORE_CHATS = [
|
| 25 |
+
"https://t.me/PEPE_Calls28",
|
| 26 |
+
"https://t.me/HenryGems",
|
| 27 |
+
"https://t.me/ChinaPumpCommunity",
|
| 28 |
+
"https://t.me/SephirothGemCalls1",
|
| 29 |
+
"https://t.me/GM_Degencalls",
|
| 30 |
+
"https://t.me/Enthucalls",
|
| 31 |
+
"https://t.me/kobecalls",
|
| 32 |
+
"https://t.me/Kulture_Kall",
|
| 33 |
+
]
|
| 34 |
+
SUPPORT_CHATS = [
|
| 35 |
+
"https://t.me/TheDonALPHAJournal",
|
| 36 |
+
"https://t.me/savascalls",
|
| 37 |
+
"https://t.me/Tanjirocall",
|
| 38 |
+
"https://t.me/ChapoInsider",
|
| 39 |
+
"https://t.me/millionsgems",
|
| 40 |
+
"https://t.me/Milagrosdegencalls",
|
| 41 |
+
"https://t.me/kariusgemscalls",
|
| 42 |
+
"https://t.me/Dwen_Exchange",
|
| 43 |
+
"https://t.me/bat_gamble",
|
| 44 |
+
"https://t.me/BatmanGamble",
|
| 45 |
+
"https://t.me/hulkgemscalls_real",
|
| 46 |
+
"https://t.me/MineGems",
|
| 47 |
+
]
|
| 48 |
SOURCE_CHATS = CORE_CHATS + SUPPORT_CHATS
|
| 49 |
|
| 50 |
+
TARGET_CHAT = os.environ.get("TARGET_CHAT", "https://t.me/MidasTouchsignalll")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
# Kata kunci topik + simbol '$' tetap dipakai
|
| 53 |
THEME_KEYWORDS = [
|
| 54 |
"call", "signal", "entry", "buy", "sell", "tp", "sl",
|
| 55 |
"pump", "spot", "futures", "setup",
|
|
|
|
| 73 |
|
| 74 |
CLASS_WINDOW_MINUTES = int(os.environ.get("CLASS_WINDOW_MINUTES", "10"))
|
| 75 |
|
|
|
|
| 76 |
SUPPORT_MIN_UNIQUE = int(os.environ.get("SUPPORT_MIN_UNIQUE", "2"))
|
| 77 |
|
| 78 |
# New: DRY RUN (tidak kirim apa pun ke TARGET_CHAT)
|
|
|
|
| 81 |
# Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
|
| 82 |
BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
|
| 83 |
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
+
# ========= Client bootstrap =========
|
| 86 |
+
def build_client() -> TelegramClient:
|
| 87 |
+
if STRING_SESSION:
|
| 88 |
+
print(">> Using StringSession (persistent).")
|
| 89 |
+
return TelegramClient(StringSession(STRING_SESSION), API_ID, API_HASH)
|
| 90 |
+
print(">> Using MemorySession (login tiap run).")
|
| 91 |
+
return TelegramClient(MemorySession(), API_ID, API_HASH)
|
| 92 |
|
| 93 |
+
client = build_client()
|
| 94 |
+
recent_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE)
|
| 95 |
+
recent_content_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE) # content-only dedup
|
| 96 |
|
| 97 |
+
# Peta id_chat -> "core" / "support"
|
| 98 |
+
chat_roles: Dict[int, str] = {} # diisi saat startup setelah resolve entity
|
| 99 |
startup_time_utc = datetime.now(timezone.utc)
|
| 100 |
|
|
|
|
|
|
|
| 101 |
|
| 102 |
+
# ========= Persistence (SQLite) =========
|
| 103 |
+
DB_PATH = os.environ.get("BOTSIGNAL_DB", "/tmp/botsignal.db")
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
def _db():
|
| 106 |
conn = sqlite3.connect(DB_PATH)
|
| 107 |
conn.execute("PRAGMA journal_mode=WAL;")
|
|
|
|
| 108 |
return conn
|
| 109 |
|
| 110 |
def _init_db():
|
|
|
|
| 137 |
conn.close()
|
| 138 |
return last, kw_map
|
| 139 |
|
| 140 |
+
def db_save_last_posted(keyword: str, msg_id: int, tier: str):
|
| 141 |
conn = _db()
|
| 142 |
+
conn.execute("INSERT INTO last_posted(keyword, msg_id, tier) VALUES(?,?,?) "
|
| 143 |
+
"ON CONFLICT(keyword) DO UPDATE SET msg_id=excluded.msg_id, tier=excluded.tier",
|
| 144 |
+
(keyword, msg_id, tier))
|
|
|
|
| 145 |
conn.commit()
|
| 146 |
conn.close()
|
| 147 |
|
| 148 |
+
def db_upsert_kw_seen(keyword: str, group_key: str, ts: datetime):
|
| 149 |
conn = _db()
|
| 150 |
+
conn.execute("INSERT INTO kw_group_seen(keyword, group_key, last_ts) VALUES(?,?,?) "
|
| 151 |
+
"ON CONFLICT(keyword, group_key) DO UPDATE SET last_ts=excluded.last_ts",
|
| 152 |
+
(keyword, group_key, int(ts.timestamp())))
|
|
|
|
| 153 |
conn.commit()
|
| 154 |
conn.close()
|
| 155 |
|
| 156 |
+
def db_prune_expired(cutoff: datetime):
|
| 157 |
+
conn = _db()
|
| 158 |
+
conn.execute("DELETE FROM kw_group_seen WHERE last_ts < ?", (int(cutoff.timestamp()),))
|
| 159 |
+
conn.commit()
|
| 160 |
+
conn.close()
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# ========= Utilities =========
|
| 164 |
+
def debug_log(reason: str, content: str = "") -> None:
|
| 165 |
+
short = (content or "").replace("\n", " ")[:160]
|
| 166 |
+
print(f"[DEBUG] {reason}: {short}")
|
| 167 |
+
|
| 168 |
+
def normalize_for_filter(text: str) -> str:
|
| 169 |
+
if not text:
|
| 170 |
+
return ""
|
| 171 |
+
s = re.sub(r"(?m)^>.*", "", text)
|
| 172 |
+
s = re.sub(r"\s+", " ", s).strip()
|
| 173 |
+
return s
|
| 174 |
+
|
| 175 |
+
def _tokenize_words(s: str) -> List[str]:
|
| 176 |
+
return re.findall(r"[a-z0-9\$\#]{1,64}", s.lower())
|
| 177 |
+
|
| 178 |
+
def _windows(tokens: List[str], size: int = 20):
|
| 179 |
+
for i in range(0, len(tokens), size):
|
| 180 |
+
yield " ".join(tokens[i:i+size])
|
| 181 |
+
|
| 182 |
+
# --- Tambahan: bersihkan URL/CA untuk kepentingan SCORING relevansi ---
|
| 183 |
+
CA_SOL_RE = re.compile(r"\b[1-9A-HJ-NP-Za-km-z]{32,48}\b") # Solana base58 (perkiraan)
|
| 184 |
+
CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b") # EVM address
|
| 185 |
+
CA_LABEL_RE = re.compile(r"\bCA\s*[:=]\s*\S+", re.IGNORECASE) # "CA: ..." potong tokennya
|
| 186 |
+
|
| 187 |
+
def _strip_urls_and_mentions(s: str) -> str:
|
| 188 |
+
s = re.sub(r"https?://\S+", "", s)
|
| 189 |
+
s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
|
| 190 |
+
s = re.sub(r"@[A-Za-z0-9_]+", "", s)
|
| 191 |
+
return re.sub(r"\s+", " ", s).strip()
|
| 192 |
+
|
| 193 |
+
def strip_contracts_for_scoring(s: str) -> str:
|
| 194 |
+
"""
|
| 195 |
+
Hilangkan URL/mention, alamat kontrak, dan token setelah 'CA:'
|
| 196 |
+
agar kata 'pump' pada CA/URL (mis. pump.fun) tidak memengaruhi skor.
|
| 197 |
+
"""
|
| 198 |
+
s0 = _strip_urls_and_mentions(s)
|
| 199 |
+
s1 = CA_LABEL_RE.sub(" ", s0)
|
| 200 |
+
s2 = CA_EVM_RE.sub(" ", s1)
|
| 201 |
+
s3 = CA_SOL_RE.sub(" ", s2)
|
| 202 |
+
return re.sub(r"\s+", " ", s3).strip()
|
| 203 |
+
|
| 204 |
+
def score_relevance(text: str, keywords: List[str]) -> float:
|
| 205 |
+
"""Skor: exact keyword + fuzzy windowed (top-3 rata-rata) agar adil untuk teks panjang."""
|
| 206 |
+
if not text:
|
| 207 |
+
return 0.0
|
| 208 |
+
|
| 209 |
+
# Gunakan versi yang TIDAK mengandung URL/CA agar 'pump' di CA tidak ikut dihitung
|
| 210 |
+
t = strip_contracts_for_scoring(text).lower()
|
| 211 |
+
|
| 212 |
+
# exact hits (unik)
|
| 213 |
+
exact_hits = 0
|
| 214 |
+
for kw in set(keywords):
|
| 215 |
+
if kw in t or re.search(rf"\b{re.escape(kw)}\b", t):
|
| 216 |
+
exact_hits += 1
|
| 217 |
exact_score = exact_hits * KEYWORD_WEIGHT
|
| 218 |
|
| 219 |
# fuzzy windowed: ambil top-3 skor di antara jendela 20 token
|
|
|
|
| 233 |
|
| 234 |
return exact_score + fuzzy_score
|
| 235 |
|
|
|
|
|
|
|
|
|
|
| 236 |
def hash_for_dedup(text: str, msg) -> str:
|
| 237 |
"""Hash campuran (lama) – menahan duplikat per pesan+media."""
|
| 238 |
parts = [text or ""]
|
|
|
|
| 242 |
if doc and getattr(doc, "id", None) is not None:
|
| 243 |
parts.append(f"doc:{doc.id}")
|
| 244 |
if getattr(msg, "photo", None) is not None:
|
| 245 |
+
ph = msg.photo
|
| 246 |
+
ph_id = getattr(ph, "id", None)
|
| 247 |
+
if ph_id is not None:
|
| 248 |
+
parts.append(f"photo:{ph_id}")
|
| 249 |
+
raw = "|".join(parts).encode("utf-8", errors="ignore")
|
| 250 |
+
return hashlib.sha1(raw).hexdigest()
|
| 251 |
|
| 252 |
def content_only_hash(text: str) -> str:
|
| 253 |
+
"""Hash berbasis isi saja (untuk lintas-grup crosspost)."""
|
| 254 |
+
norm = _strip_urls_and_mentions(normalize_for_filter(text))
|
| 255 |
+
return hashlib.sha1(norm.encode("utf-8", errors="ignore")).hexdigest()
|
| 256 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
def is_image_message(msg) -> bool:
|
| 258 |
if getattr(msg, "photo", None) is not None:
|
| 259 |
return True
|
| 260 |
doc = getattr(msg, "document", None)
|
| 261 |
+
if doc and getattr(doc, "mime_type", None):
|
| 262 |
+
mt = (doc.mime_type or "").lower()
|
| 263 |
+
if mt.startswith("image/"):
|
| 264 |
+
if SKIP_STICKERS and ("webp" in mt or "sticker" in mt):
|
| 265 |
+
return False
|
| 266 |
+
return True
|
| 267 |
+
if not ALLOW_GIFS_VIDEOS:
|
| 268 |
+
return False
|
| 269 |
+
if mt in ("video/mp4", "image/gif"):
|
| 270 |
+
return True
|
| 271 |
+
return False
|
| 272 |
|
| 273 |
def media_too_big(msg) -> bool:
|
| 274 |
doc = getattr(msg, "document", None)
|
| 275 |
+
if doc and getattr(doc, "size", None):
|
| 276 |
+
return (doc.size or 0) > MAX_MEDIA_MB * 1024 * 1024
|
| 277 |
+
return False
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
# ========= Class aggregator (windowed unique groups) =========
|
| 281 |
+
keyword_group_last_seen: defaultdict[str, dict[str, datetime]] = defaultdict(dict)
|
| 282 |
+
|
| 283 |
+
def _prune_expired(now: datetime) -> None:
|
| 284 |
+
window = timedelta(minutes=CLASS_WINDOW_MINUTES)
|
| 285 |
+
cutoff = now - window
|
| 286 |
+
# in-memory prune
|
| 287 |
+
for kw, m in list(keyword_group_last_seen.items()):
|
| 288 |
+
for gk, ts in list(m.items()):
|
| 289 |
+
if ts < cutoff:
|
| 290 |
+
del m[gk]
|
| 291 |
+
if not m:
|
| 292 |
+
del keyword_group_last_seen[kw]
|
| 293 |
+
# db prune
|
| 294 |
+
db_prune_expired(cutoff)
|
| 295 |
+
|
| 296 |
+
def update_and_classify(keyword: str, group_key: str, now: Optional[datetime] = None) -> Tuple[str, int]:
|
| 297 |
+
if not now:
|
| 298 |
+
now = datetime.now(timezone.utc)
|
| 299 |
+
_prune_expired(now)
|
| 300 |
+
|
| 301 |
+
bucket = keyword_group_last_seen[keyword]
|
| 302 |
+
bucket[group_key] = now
|
| 303 |
+
db_upsert_kw_seen(keyword, group_key, now)
|
| 304 |
+
|
| 305 |
+
unique_groups = len(bucket)
|
| 306 |
+
if unique_groups >= 4:
|
| 307 |
+
return "kuat", unique_groups
|
| 308 |
+
elif unique_groups >= 2:
|
| 309 |
+
return "sedang", unique_groups
|
| 310 |
+
else:
|
| 311 |
+
return "rendah", unique_groups
|
| 312 |
|
| 313 |
+
|
| 314 |
+
# ========= Sentence-level invite filter (smarter) =========
|
| 315 |
INVITE_PATTERNS = [
|
| 316 |
+
r"\bjoin\b", r"\bjoin (us|our|channel|group)\b",
|
| 317 |
+
r"\bdm\b", r"\bdm (me|gw|gue|gua|saya|admin)\b",
|
| 318 |
+
r"\bpm\b", r"\binbox\b", r"\bcontact\b", r"\bkontak\b", r"\bhubungi\b",
|
| 319 |
+
r"\bvip\b", r"\bpremium\b", r"\bberbayar\b", r"\bpaid\b", r"\bexclusive\b",
|
| 320 |
+
r"\bwhitelist\b", r"\bprivate( group| channel)?\b", r"\bmembership?\b",
|
| 321 |
+
r"\bsubscribe\b", r"\blangganan\b",
|
| 322 |
+
# kata kunci promo/iklan
|
| 323 |
r"\bpromo\b", r"\bpromosi\b", r"\biklan\b",
|
| 324 |
+
r"\badvert\b", r"\badvertise\b", r"\badvertisement\b",
|
| 325 |
+
# tautan undangan/shortener
|
| 326 |
+
r"(t\.me\/joinchat|t\.me\/\+|telegram\.me\/|discord\.gg\/|wa\.me\/|whatsapp\.com\/)",
|
| 327 |
+
r"(bit\.ly|tinyurl\.com|linktr\.ee)",
|
| 328 |
+
# perluasan: link t.me biasa
|
| 329 |
+
r"t\.me\/[A-Za-z0-9_]+"
|
| 330 |
]
|
| 331 |
INVITE_REGEXES = [re.compile(p, re.IGNORECASE) for p in INVITE_PATTERNS]
|
| 332 |
|
|
|
|
| 342 |
t = s.strip()
|
| 343 |
if not t:
|
| 344 |
return False
|
| 345 |
+
# Jika kalimat memuat sinyal kuat, jangan dibuang walau ada kata invite
|
| 346 |
if any(r.search(t) for r in WHITELIST_REGEXES):
|
| 347 |
return False
|
| 348 |
# Jika ada 1+ pola ajakan, buang
|
|
|
|
| 357 |
cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
|
| 358 |
return cleaned
|
| 359 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
|
| 361 |
+
# ========= Post-on-threshold with EDIT (persisted) =========
|
| 362 |
+
TIER_ORDER = {"rendah": 0, "sedang": 1, "kuat": 2}
|
| 363 |
+
last_posted: Dict[str, Dict[str, object]] = {} # keyword -> {"msg_id": int, "tier": str}
|
| 364 |
+
|
| 365 |
+
async def _send_initial(msg, text: str) -> int:
|
| 366 |
if DRY_RUN:
|
| 367 |
+
print("[DRY_RUN] send_initial:", text[:140])
|
| 368 |
+
return -1
|
| 369 |
+
# kirim media bila ada & allowed
|
| 370 |
+
if INCLUDE_MEDIA and is_image_message(msg) and not media_too_big(msg):
|
| 371 |
+
try:
|
| 372 |
+
if getattr(msg, "photo", None):
|
| 373 |
+
m = await client.send_file(TARGET_CHAT, msg.photo, caption=text, caption_entities=None, force_document=False)
|
| 374 |
+
return m.id
|
| 375 |
+
doc = getattr(msg, "document", None)
|
| 376 |
+
if doc:
|
| 377 |
+
data = await client.download_media(msg, file=bytes)
|
| 378 |
+
if data:
|
| 379 |
+
bio = io.BytesIO(data)
|
| 380 |
+
ext = ".jpg"
|
| 381 |
+
mt = (getattr(doc, "mime_type", "") or "").lower()
|
| 382 |
+
if mt:
|
| 383 |
+
ext_guess = guess_extension(mt) or ".jpg"
|
| 384 |
+
if ext_guess == ".jpe":
|
| 385 |
+
ext_guess = ".jpg"
|
| 386 |
+
ext = ext_guess
|
| 387 |
+
bio.name = f"media{ext}"
|
| 388 |
+
m = await client.send_file(TARGET_CHAT, bio, caption=text, caption_entities=None, force_document=False)
|
| 389 |
+
return m.id
|
| 390 |
+
except FloodWaitError as e:
|
| 391 |
+
await asyncio.sleep(e.seconds + 1)
|
| 392 |
+
return await _send_initial(msg, text)
|
| 393 |
+
except Exception as e:
|
| 394 |
+
debug_log("Gagal kirim media awal, fallback text", str(e))
|
| 395 |
+
try:
|
| 396 |
+
m = await client.send_message(TARGET_CHAT, text, link_preview=True)
|
| 397 |
+
return m.id
|
| 398 |
+
except FloodWaitError as e:
|
| 399 |
+
await asyncio.sleep(e.seconds + 1)
|
| 400 |
+
return await _send_initial(msg, text)
|
| 401 |
+
|
| 402 |
+
async def post_or_update(keyword: str, body: str, new_tier: str, src_msg) -> None:
|
| 403 |
+
prefix = f"[{new_tier.upper()}] "
|
| 404 |
+
text = prefix + body
|
| 405 |
+
prev = last_posted.get(keyword)
|
| 406 |
+
if not prev:
|
| 407 |
+
msg_id = await _send_initial(src_msg, text)
|
| 408 |
+
last_posted[keyword] = {"msg_id": msg_id, "tier": new_tier}
|
| 409 |
+
if msg_id != -1:
|
| 410 |
+
db_save_last_posted(keyword, msg_id, new_tier)
|
| 411 |
return
|
| 412 |
|
| 413 |
+
if TIER_ORDER.get(new_tier, 0) > TIER_ORDER.get(prev["tier"], 0):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
try:
|
| 415 |
+
await client.edit_message(TARGET_CHAT, prev["msg_id"], text)
|
| 416 |
+
prev["tier"] = new_tier
|
| 417 |
+
if prev["msg_id"] != -1:
|
| 418 |
+
db_save_last_posted(keyword, prev["msg_id"], new_tier)
|
| 419 |
+
except FloodWaitError as e:
|
| 420 |
+
await asyncio.sleep(e.seconds + 1)
|
| 421 |
+
await post_or_update(keyword, body, new_tier, src_msg)
|
| 422 |
except Exception as e:
|
| 423 |
+
debug_log("Edit gagal, fallback kirim baru", str(e))
|
| 424 |
+
msg_id = await _send_initial(src_msg, text)
|
| 425 |
+
last_posted[keyword] = {"msg_id": msg_id, "tier": new_tier}
|
| 426 |
+
if msg_id != -1:
|
| 427 |
+
db_save_last_posted(keyword, msg_id, new_tier)
|
| 428 |
+
else:
|
| 429 |
+
pass # no-op
|
| 430 |
+
|
| 431 |
|
| 432 |
# ========= Core actions (fallback kept) =========
|
| 433 |
async def send_as_is(msg, text_override: Optional[str] = None) -> None:
|
|
|
|
| 455 |
ext = ".jpg"
|
| 456 |
mt = (getattr(doc, "mime_type", "") or "").lower()
|
| 457 |
if mt:
|
| 458 |
+
ext_guess = guess_extension(mt) or ".jpg"
|
| 459 |
+
if ext_guess == ".jpe":
|
| 460 |
+
ext_guess = ".jpg"
|
| 461 |
+
ext = ext_guess
|
| 462 |
bio.name = f"media{ext}"
|
| 463 |
await client.send_file(TARGET_CHAT, bio, caption=orig_text, caption_entities=entities, force_document=False)
|
| 464 |
return
|
| 465 |
+
except FloodWaitError as e:
|
| 466 |
+
await asyncio.sleep(e.seconds + 1)
|
| 467 |
except Exception as e:
|
| 468 |
+
debug_log("Gagal kirim sebagai media, fallback ke text", str(e))
|
| 469 |
|
| 470 |
+
try:
|
| 471 |
+
await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
|
| 472 |
+
except FloodWaitError as e:
|
| 473 |
+
await asyncio.sleep(e.seconds + 1)
|
| 474 |
+
await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
|
| 475 |
|
|
|
|
|
|
|
|
|
|
| 476 |
|
| 477 |
+
# ========= Keyword extraction ($ticker-aware) =========
|
| 478 |
+
TICKER_CLEAN_RE = re.compile(r"\$[A-Za-z0-9]{2,12}")
|
| 479 |
+
TICKER_NOISY_RE = re.compile(r"\$[A-Za-z0-9](?:[^A-Za-z0-9]+[A-Za-z0-9]){1,11}")
|
| 480 |
+
|
| 481 |
+
def _extract_tickers(text_norm: str) -> List[str]:
|
| 482 |
+
"""
|
| 483 |
+
Ambil $TICKER dengan dua cara:
|
| 484 |
+
- Bersih: $ABC, $JBCOIN
|
| 485 |
+
- Noisy: $J*BCOIN -> dinormalisasi jadi $JBCOIN untuk *keyword* saja.
|
| 486 |
+
(Teks asli tetap dikirim apa adanya.)
|
| 487 |
+
"""
|
| 488 |
found = []
|
| 489 |
+
|
| 490 |
+
# bersih
|
| 491 |
+
for m in TICKER_CLEAN_RE.finditer(text_norm):
|
| 492 |
+
found.append(m.group(0).lower())
|
| 493 |
+
|
| 494 |
+
# noisy -> normalisasi internal
|
| 495 |
+
for m in TICKER_NOISY_RE.finditer(text_norm):
|
| 496 |
+
raw = m.group(0)
|
| 497 |
+
norm = "$" + re.sub(r"[^A-Za-z0-9]+", "", raw[1:])
|
| 498 |
+
if 3 <= len(norm) <= 13: # termasuk '$'
|
| 499 |
+
found.append(norm.lower())
|
| 500 |
+
|
| 501 |
+
# unik & pertahankan urutan
|
| 502 |
seen = set()
|
| 503 |
uniq = []
|
| 504 |
+
for x in found:
|
| 505 |
+
if x not in seen:
|
| 506 |
+
uniq.append(x)
|
| 507 |
+
seen.add(x)
|
| 508 |
+
return uniq
|
| 509 |
+
|
| 510 |
+
def _extract_all_keywords(text_norm: str) -> List[str]:
|
| 511 |
+
"""
|
| 512 |
+
Deteksi SEMUA keyword dari THEME_KEYWORDS + $ticker.
|
| 513 |
+
Tidak menghapus simbol '$' (sesuai permintaan).
|
| 514 |
+
"""
|
| 515 |
+
# toleran untuk pencarian keyword tema (seperti semula)
|
| 516 |
+
t = re.sub(r"\$([a-z0-9]+)", r"\1", text_norm, flags=re.I)
|
| 517 |
+
|
| 518 |
+
found = []
|
| 519 |
+
for kw in THEME_KEYWORDS:
|
| 520 |
+
if re.search(rf"(^|\W){re.escape(kw)}(\W|$)", t, flags=re.I):
|
| 521 |
+
found.append(kw.lower())
|
| 522 |
+
|
| 523 |
+
# gabungkan hasil $ticker
|
| 524 |
+
tickers = _extract_tickers(text_norm)
|
| 525 |
+
found.extend(tickers)
|
| 526 |
+
|
| 527 |
+
# unik dengan urutan muncul pertama
|
| 528 |
+
uniq = []
|
| 529 |
+
seen = set()
|
| 530 |
for kw in found:
|
| 531 |
if kw not in seen:
|
| 532 |
uniq.append(kw)
|
|
|
|
| 548 |
return chosen
|
| 549 |
|
| 550 |
def _role_of(chat_id: int) -> str:
|
| 551 |
+
# DEFAULT KE SUPPORT agar tidak salah meloloskan chat yang tidak tertag
|
|
|
|
| 552 |
return chat_roles.get(chat_id, "support")
|
| 553 |
|
| 554 |
def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
|
|
|
|
| 559 |
bucket = keyword_group_last_seen.get(keyword, {})
|
| 560 |
core_ids, sup_ids = set(), set()
|
| 561 |
for gk in bucket.keys():
|
| 562 |
+
role = chat_roles.get(int(gk), "support") # default support untuk aman
|
| 563 |
(core_ids if role == "core" else sup_ids).add(gk)
|
| 564 |
return len(core_ids), len(sup_ids)
|
| 565 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
|
| 567 |
+
async def process_message(msg, source_chat_id: int) -> None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
"""
|
| 569 |
Filter, content-dedup, relevansi, multi-kw -> pilih dominan,
|
| 570 |
agregasi tier, gating support (CORE-anchored), filter ajakan, dan POST/EDIT.
|
|
|
|
| 583 |
if ch in recent_content_hashes:
|
| 584 |
debug_log("Content-duplicate (global), dilewati", orig_text)
|
| 585 |
return
|
| 586 |
+
recent_content_hashes.append(ch)
|
| 587 |
+
|
| 588 |
+
# Dedup lama (per pesan/media)
|
| 589 |
+
h = hash_for_dedup(text_norm, msg)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
if h in recent_hashes:
|
| 591 |
+
debug_log("Duplikat (pesan/media), dilewati", orig_text)
|
| 592 |
return
|
| 593 |
recent_hashes.append(h)
|
| 594 |
|
|
|
|
| 612 |
now = datetime.now(timezone.utc)
|
| 613 |
class_label, unique_groups = update_and_classify(main_kw, group_key, now)
|
| 614 |
|
| 615 |
+
# Gating SUPPORT (CORE-anchored)
|
| 616 |
+
if role != "core":
|
| 617 |
core_u, sup_u = _unique_counts_by_role(main_kw)
|
| 618 |
+
# Aturan:
|
| 619 |
+
# - Jika sudah ada minimal 1 sebutan dari CORE untuk keyword ini -> izinkan.
|
| 620 |
+
# - Jika belum ada anchor CORE, SUPPORT harus >= SUPPORT_MIN_UNIQUE.
|
| 621 |
+
if core_u >= 1:
|
| 622 |
+
pass
|
| 623 |
+
elif sup_u < SUPPORT_MIN_UNIQUE:
|
| 624 |
+
debug_log(f"Support ditahan (core_u={core_u}, sup_u={sup_u} < {SUPPORT_MIN_UNIQUE})", orig_text)
|
| 625 |
+
return
|
| 626 |
|
| 627 |
+
# Filter kalimat ajakan (whitelist-aware)
|
| 628 |
+
cleaned_body = filter_invite_sentences(orig_text)
|
| 629 |
+
if not cleaned_body.strip():
|
| 630 |
+
debug_log("Semua kalimat terfilter (kosong), dilewati", orig_text)
|
| 631 |
return
|
| 632 |
|
| 633 |
# Backfill safety: saat startup, hindari pesan yang terlalu lama
|
|
|
|
| 638 |
debug_log("Lama (lewat cutoff backfill safety), dilewati", orig_text)
|
| 639 |
return
|
| 640 |
|
| 641 |
+
await post_or_update(main_kw, cleaned_body, class_label, msg)
|
| 642 |
debug_log(f"Posted/Edited (role={role}, unique_groups={unique_groups}, kw={main_kw}, tier={class_label})", orig_text)
|
| 643 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 644 |
|
| 645 |
async def backfill_history(entity, limit: int) -> None:
|
| 646 |
if limit <= 0:
|
|
|
|
| 652 |
except Exception as e:
|
| 653 |
debug_log("Error saat memproses backfill", str(e))
|
| 654 |
|
| 655 |
+
|
| 656 |
+
# ========= Event handlers =========
|
| 657 |
+
@client.on(events.NewMessage(chats=SOURCE_CHATS))
|
| 658 |
+
async def on_new_message(event):
|
| 659 |
+
try:
|
| 660 |
+
await process_message(event.message, source_chat_id=event.chat_id)
|
| 661 |
+
except Exception as e:
|
| 662 |
+
print("Process error:", e)
|
| 663 |
+
|
| 664 |
+
|
| 665 |
+
# ========= Entry points =========
|
| 666 |
async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
|
| 667 |
resolved = []
|
| 668 |
for src in raw_list:
|
|
|
|
| 671 |
resolved.append(ent)
|
| 672 |
chat_roles[int(ent.id)] = role_label
|
| 673 |
except Exception as e:
|
| 674 |
+
print(f"Gagal resolve sumber {src}: {e}")
|
| 675 |
return resolved
|
| 676 |
|
| 677 |
+
async def start_bot_background() -> None:
|
| 678 |
+
await client.start()
|
| 679 |
+
_init_db()
|
| 680 |
+
|
| 681 |
+
# Load persisted state
|
| 682 |
+
global last_posted, keyword_group_last_seen
|
| 683 |
+
last_posted, keyword_group_last_seen = db_load_state()
|
| 684 |
+
|
| 685 |
+
resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
|
| 686 |
+
resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
|
| 687 |
+
resolved_sources = resolved_core + resolved_support
|
| 688 |
+
|
| 689 |
+
for ent in resolved_sources:
|
| 690 |
+
try:
|
| 691 |
+
await backfill_history(ent, INITIAL_BACKFILL)
|
| 692 |
+
except Exception as e:
|
| 693 |
+
print(f"Backfill gagal untuk {ent}: {e}")
|
| 694 |
+
|
| 695 |
+
print("Kurator berjalan (background task). Menunggu pesan baru...")
|
| 696 |
+
asyncio.create_task(client.run_until_disconnected())
|
| 697 |
+
|
| 698 |
+
async def app_main() -> None:
|
| 699 |
await client.start()
|
| 700 |
_init_db()
|
|
|
|
| 701 |
|
| 702 |
+
global last_posted, keyword_group_last_seen
|
| 703 |
+
last_posted, keyword_group_last_seen = db_load_state()
|
| 704 |
+
|
| 705 |
resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
|
| 706 |
resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
|
| 707 |
resolved_sources = resolved_core + resolved_support
|
|
|
|
| 712 |
print("Kurator berjalan. Menunggu pesan baru... (Stop dengan interrupt).")
|
| 713 |
await client.run_until_disconnected()
|
| 714 |
|
| 715 |
+
|
| 716 |
if __name__ == "__main__":
|
| 717 |
asyncio.run(app_main())
|