Spaces:
Sleeping
Sleeping
Update botsignal.py
Browse files- botsignal.py +207 -373
botsignal.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import os
|
| 3 |
import re
|
|
@@ -14,42 +15,40 @@ from telethon import TelegramClient, events
|
|
| 14 |
from telethon.sessions import StringSession, MemorySession
|
| 15 |
from telethon.errors.rpcerrorlist import FloodWaitError
|
| 16 |
|
| 17 |
-
|
| 18 |
-
# ========= Configuration via Environment =========
|
| 19 |
API_ID = int(os.environ.get("API_ID", "0"))
|
| 20 |
API_HASH = os.environ.get("API_HASH", "")
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
#
|
| 24 |
-
CORE_CHATS = [
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
"https://t.me/ChinaPumpCommunity",
|
| 28 |
-
"https://t.me/SephirothGemCalls1",
|
| 29 |
-
"https://t.me/GM_Degencalls",
|
| 30 |
-
"https://t.me/Enthucalls",
|
| 31 |
-
"https://t.me/kobecalls",
|
| 32 |
-
"https://t.me/Kulture_Kall",
|
| 33 |
-
]
|
| 34 |
-
SUPPORT_CHATS = [
|
| 35 |
-
"https://t.me/TheDonALPHAJournal",
|
| 36 |
-
"https://t.me/savascalls",
|
| 37 |
-
"https://t.me/Tanjirocall",
|
| 38 |
-
"https://t.me/ChapoInsider",
|
| 39 |
-
"https://t.me/millionsgems",
|
| 40 |
-
"https://t.me/Milagrosdegencalls",
|
| 41 |
-
"https://t.me/kariusgemscalls",
|
| 42 |
-
"https://t.me/Dwen_Exchange",
|
| 43 |
-
"https://t.me/bat_gamble",
|
| 44 |
-
"https://t.me/BatmanGamble",
|
| 45 |
-
"https://t.me/hulkgemscalls_real",
|
| 46 |
-
"https://t.me/MineGems",
|
| 47 |
-
]
|
| 48 |
SOURCE_CHATS = CORE_CHATS + SUPPORT_CHATS
|
| 49 |
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
-
# Kata kunci topik + simbol '$' tetap dipakai
|
| 53 |
THEME_KEYWORDS = [
|
| 54 |
"call", "signal", "entry", "buy", "sell", "tp", "sl",
|
| 55 |
"pump", "spot", "futures", "setup",
|
|
@@ -73,6 +72,7 @@ DEDUP_BUFFER_SIZE = int(os.environ.get("DEDUP_BUFFER_SIZE", "800"))
|
|
| 73 |
|
| 74 |
CLASS_WINDOW_MINUTES = int(os.environ.get("CLASS_WINDOW_MINUTES", "10"))
|
| 75 |
|
|
|
|
| 76 |
SUPPORT_MIN_UNIQUE = int(os.environ.get("SUPPORT_MIN_UNIQUE", "2"))
|
| 77 |
|
| 78 |
# New: DRY RUN (tidak kirim apa pun ke TARGET_CHAT)
|
|
@@ -81,30 +81,34 @@ DRY_RUN = os.environ.get("DRY_RUN", "0") == "1"
|
|
| 81 |
# Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
|
| 82 |
BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
|
| 83 |
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
# ======
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
print(">> Using MemorySession (login tiap run).")
|
| 91 |
-
return TelegramClient(MemorySession(), API_ID, API_HASH)
|
| 92 |
|
| 93 |
-
client =
|
| 94 |
-
recent_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE)
|
| 95 |
-
recent_content_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE) # NEW: content-only dedup
|
| 96 |
|
| 97 |
-
#
|
| 98 |
-
chat_roles: Dict[int, str] = {} # diisi saat startup setelah resolve entity
|
| 99 |
startup_time_utc = datetime.now(timezone.utc)
|
| 100 |
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
#
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
def _db():
|
| 106 |
conn = sqlite3.connect(DB_PATH)
|
| 107 |
conn.execute("PRAGMA journal_mode=WAL;")
|
|
|
|
| 108 |
return conn
|
| 109 |
|
| 110 |
def _init_db():
|
|
@@ -137,83 +141,29 @@ def db_load_state():
|
|
| 137 |
conn.close()
|
| 138 |
return last, kw_map
|
| 139 |
|
| 140 |
-
def
|
| 141 |
-
conn = _db()
|
| 142 |
-
conn.execute("INSERT INTO last_posted(keyword, msg_id, tier) VALUES(?,?,?) "
|
| 143 |
-
"ON CONFLICT(keyword) DO UPDATE SET msg_id=excluded.msg_id, tier=excluded.tier",
|
| 144 |
-
(keyword, msg_id, tier))
|
| 145 |
-
conn.commit()
|
| 146 |
-
conn.close()
|
| 147 |
-
|
| 148 |
-
def db_upsert_kw_seen(keyword: str, group_key: str, ts: datetime):
|
| 149 |
conn = _db()
|
| 150 |
-
conn.execute(
|
| 151 |
-
|
| 152 |
-
|
|
|
|
| 153 |
conn.commit()
|
| 154 |
conn.close()
|
| 155 |
|
| 156 |
-
def
|
| 157 |
conn = _db()
|
| 158 |
-
conn.execute(
|
|
|
|
|
|
|
|
|
|
| 159 |
conn.commit()
|
| 160 |
conn.close()
|
| 161 |
|
| 162 |
-
|
| 163 |
-
#
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
def normalize_for_filter(text: str) -> str:
|
| 169 |
-
if not text:
|
| 170 |
-
return ""
|
| 171 |
-
s = re.sub(r"(?m)^>.*", "", text)
|
| 172 |
-
s = re.sub(r"\s+", " ", s).strip()
|
| 173 |
-
return s
|
| 174 |
-
|
| 175 |
-
def _tokenize_words(s: str) -> List[str]:
|
| 176 |
-
return re.findall(r"[a-z0-9\$\#]{1,64}", s.lower())
|
| 177 |
-
|
| 178 |
-
def _windows(tokens: List[str], size: int = 20):
|
| 179 |
-
for i in range(0, len(tokens), size):
|
| 180 |
-
yield " ".join(tokens[i:i+size])
|
| 181 |
-
|
| 182 |
-
# --- Tambahan: bersihkan URL/CA untuk kepentingan SCORING relevansi ---
|
| 183 |
-
CA_SOL_RE = re.compile(r"\b[1-9A-HJ-NP-Za-km-z]{32,48}\b") # Solana base58 (perkiraan)
|
| 184 |
-
CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b") # EVM address
|
| 185 |
-
CA_LABEL_RE = re.compile(r"\bCA\s*[:=]\s*\S+", re.IGNORECASE) # "CA: ..." potong tokennya
|
| 186 |
-
|
| 187 |
-
def _strip_urls_and_mentions(s: str) -> str:
|
| 188 |
-
s = re.sub(r"https?://\S+", "", s)
|
| 189 |
-
s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
|
| 190 |
-
s = re.sub(r"@[A-Za-z0-9_]+", "", s)
|
| 191 |
-
return re.sub(r"\s+", " ", s).strip()
|
| 192 |
-
|
| 193 |
-
def strip_contracts_for_scoring(s: str) -> str:
|
| 194 |
-
"""
|
| 195 |
-
Hilangkan URL/mention, alamat kontrak, dan token setelah 'CA:'
|
| 196 |
-
agar kata 'pump' pada CA/URL (mis. pump.fun) tidak memengaruhi skor.
|
| 197 |
-
"""
|
| 198 |
-
s0 = _strip_urls_and_mentions(s)
|
| 199 |
-
s1 = CA_LABEL_RE.sub(" ", s0)
|
| 200 |
-
s2 = CA_EVM_RE.sub(" ", s1)
|
| 201 |
-
s3 = CA_SOL_RE.sub(" ", s2)
|
| 202 |
-
return re.sub(r"\s+", " ", s3).strip()
|
| 203 |
-
|
| 204 |
-
def score_relevance(text: str, keywords: List[str]) -> float:
|
| 205 |
-
"""Skor: exact keyword + fuzzy windowed (top-3 rata-rata) agar adil untuk teks panjang."""
|
| 206 |
-
if not text:
|
| 207 |
-
return 0.0
|
| 208 |
-
|
| 209 |
-
# Gunakan versi yang TIDAK mengandung URL/CA agar 'pump' di CA tidak ikut dihitung
|
| 210 |
-
t = strip_contracts_for_scoring(text).lower()
|
| 211 |
-
|
| 212 |
-
# exact hits (unik)
|
| 213 |
-
exact_hits = 0
|
| 214 |
-
for kw in set(keywords):
|
| 215 |
-
if kw in t or re.search(rf"\b{re.escape(kw)}\b", t):
|
| 216 |
-
exact_hits += 1
|
| 217 |
exact_score = exact_hits * KEYWORD_WEIGHT
|
| 218 |
|
| 219 |
# fuzzy windowed: ambil top-3 skor di antara jendela 20 token
|
|
@@ -245,89 +195,53 @@ def hash_for_dedup(text: str, msg) -> str:
|
|
| 245 |
if doc and getattr(doc, "id", None) is not None:
|
| 246 |
parts.append(f"doc:{doc.id}")
|
| 247 |
if getattr(msg, "photo", None) is not None:
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
parts.append(f"photo:{ph_id}")
|
| 252 |
-
raw = "|".join(parts).encode("utf-8", errors="ignore")
|
| 253 |
-
return hashlib.sha1(raw).hexdigest()
|
| 254 |
|
| 255 |
def content_only_hash(text: str) -> str:
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
def is_image_message(msg) -> bool:
|
| 261 |
if getattr(msg, "photo", None) is not None:
|
| 262 |
return True
|
| 263 |
doc = getattr(msg, "document", None)
|
| 264 |
-
if
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
if mt in ("video/mp4", "image/gif"):
|
| 273 |
-
return True
|
| 274 |
-
return False
|
| 275 |
|
| 276 |
def media_too_big(msg) -> bool:
|
| 277 |
doc = getattr(msg, "document", None)
|
| 278 |
-
if
|
| 279 |
-
return
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
keyword_group_last_seen: defaultdict[str, dict[str, datetime]] = defaultdict(dict)
|
| 285 |
-
|
| 286 |
-
def _prune_expired(now: datetime) -> None:
|
| 287 |
-
window = timedelta(minutes=CLASS_WINDOW_MINUTES)
|
| 288 |
-
cutoff = now - window
|
| 289 |
-
# in-memory prune
|
| 290 |
-
for kw, m in list(keyword_group_last_seen.items()):
|
| 291 |
-
for gk, ts in list(m.items()):
|
| 292 |
-
if ts < cutoff:
|
| 293 |
-
del m[gk]
|
| 294 |
-
if not m:
|
| 295 |
-
del keyword_group_last_seen[kw]
|
| 296 |
-
# db prune
|
| 297 |
-
db_prune_expired(cutoff)
|
| 298 |
-
|
| 299 |
-
def update_and_classify(keyword: str, group_key: str, now: Optional[datetime] = None) -> Tuple[str, int]:
|
| 300 |
-
if not now:
|
| 301 |
-
now = datetime.now(timezone.utc)
|
| 302 |
-
_prune_expired(now)
|
| 303 |
-
|
| 304 |
-
bucket = keyword_group_last_seen[keyword]
|
| 305 |
-
bucket[group_key] = now
|
| 306 |
-
db_upsert_kw_seen(keyword, group_key, now)
|
| 307 |
-
|
| 308 |
-
unique_groups = len(bucket)
|
| 309 |
-
if unique_groups >= 4:
|
| 310 |
-
return "kuat", unique_groups
|
| 311 |
-
elif unique_groups >= 2:
|
| 312 |
-
return "sedang", unique_groups
|
| 313 |
-
else:
|
| 314 |
-
return "rendah", unique_groups
|
| 315 |
-
|
| 316 |
|
| 317 |
-
# ======
|
| 318 |
INVITE_PATTERNS = [
|
| 319 |
-
r"\bjoin\b", r"\
|
| 320 |
-
r"\
|
| 321 |
-
r"\
|
| 322 |
-
r"\bvip\b", r"\bpremium\b", r"\bberbayar\b", r"\bpaid\b", r"\bexclusive\b",
|
| 323 |
-
r"\bwhitelist\b", r"\bprivate( group| channel)?\b", r"\bmembership?\b",
|
| 324 |
-
r"\bsubscribe\b", r"\blangganan\b",
|
| 325 |
-
r"(t\.me\/joinchat|t\.me\/\+|telegram\.me\/|discord\.gg\/|wa\.me\/|whatsapp\.com\/)",
|
| 326 |
-
r"(bit\.ly|tinyurl\.com|linktr\.ee)",
|
| 327 |
-
r"From TG channel:",
|
| 328 |
r"\bpromo\b", r"\bpromosi\b", r"\biklan\b",
|
| 329 |
-
r"\badvert\b", r"\badvertise\b", r"\badvertisement\b"
|
| 330 |
-
|
| 331 |
]
|
| 332 |
INVITE_REGEXES = [re.compile(p, re.IGNORECASE) for p in INVITE_PATTERNS]
|
| 333 |
|
|
@@ -343,7 +257,7 @@ def _is_invite_sentence(s: str) -> bool:
|
|
| 343 |
t = s.strip()
|
| 344 |
if not t:
|
| 345 |
return False
|
| 346 |
-
# Jika kalimat memuat sinyal kuat, jangan dibuang walau ada kata invite
|
| 347 |
if any(r.search(t) for r in WHITELIST_REGEXES):
|
| 348 |
return False
|
| 349 |
# Jika ada 1+ pola ajakan, buang
|
|
@@ -358,77 +272,35 @@ def filter_invite_sentences(text: str) -> str:
|
|
| 358 |
cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
|
| 359 |
return cleaned
|
| 360 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
|
| 362 |
-
# ========= Post-on-threshold with EDIT (persisted) =========
|
| 363 |
-
TIER_ORDER = {"rendah": 0, "sedang": 1, "kuat": 2}
|
| 364 |
-
last_posted: Dict[str, Dict[str, object]] = {}
|
| 365 |
-
|
| 366 |
-
async def _send_initial(msg, text: str) -> int:
|
| 367 |
if DRY_RUN:
|
| 368 |
-
print("[DRY_RUN]
|
| 369 |
-
return -1
|
| 370 |
-
# kirim media bila ada & allowed
|
| 371 |
-
if INCLUDE_MEDIA and is_image_message(msg) and not media_too_big(msg):
|
| 372 |
-
try:
|
| 373 |
-
if getattr(msg, "photo", None):
|
| 374 |
-
m = await client.send_file(TARGET_CHAT, msg.photo, caption=text, caption_entities=None, force_document=False)
|
| 375 |
-
return m.id
|
| 376 |
-
doc = getattr(msg, "document", None)
|
| 377 |
-
if doc:
|
| 378 |
-
data = await client.download_media(msg, file=bytes)
|
| 379 |
-
if data:
|
| 380 |
-
bio = io.BytesIO(data)
|
| 381 |
-
ext = ".jpg"
|
| 382 |
-
mt = (getattr(doc, "mime_type", "") or "").lower()
|
| 383 |
-
if mt:
|
| 384 |
-
ext_guess = guess_extension(mt) or ".jpg"
|
| 385 |
-
if ext_guess == ".jpe":
|
| 386 |
-
ext_guess = ".jpg"
|
| 387 |
-
ext = ext_guess
|
| 388 |
-
bio.name = f"media{ext}"
|
| 389 |
-
m = await client.send_file(TARGET_CHAT, bio, caption=text, caption_entities=None, force_document=False)
|
| 390 |
-
return m.id
|
| 391 |
-
except FloodWaitError as e:
|
| 392 |
-
await asyncio.sleep(e.seconds + 1)
|
| 393 |
-
return await _send_initial(msg, text)
|
| 394 |
-
except Exception as e:
|
| 395 |
-
debug_log("Gagal kirim media awal, fallback text", str(e))
|
| 396 |
-
try:
|
| 397 |
-
m = await client.send_message(TARGET_CHAT, text, link_preview=True)
|
| 398 |
-
return m.id
|
| 399 |
-
except FloodWaitError as e:
|
| 400 |
-
await asyncio.sleep(e.seconds + 1)
|
| 401 |
-
return await _send_initial(msg, text)
|
| 402 |
-
|
| 403 |
-
async def post_or_update(keyword: str, body: str, new_tier: str, src_msg) -> None:
|
| 404 |
-
prefix = f"[{new_tier.upper()}] "
|
| 405 |
-
text = prefix + body
|
| 406 |
-
prev = last_posted.get(keyword)
|
| 407 |
-
if not prev:
|
| 408 |
-
msg_id = await _send_initial(src_msg, text)
|
| 409 |
-
last_posted[keyword] = {"msg_id": msg_id, "tier": new_tier}
|
| 410 |
-
if msg_id != -1:
|
| 411 |
-
db_save_last_posted(keyword, msg_id, new_tier)
|
| 412 |
return
|
| 413 |
|
| 414 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
try:
|
| 416 |
-
await client.edit_message(TARGET_CHAT,
|
| 417 |
-
|
| 418 |
-
if prev["msg_id"] != -1:
|
| 419 |
-
db_save_last_posted(keyword, prev["msg_id"], new_tier)
|
| 420 |
-
except FloodWaitError as e:
|
| 421 |
-
await asyncio.sleep(e.seconds + 1)
|
| 422 |
-
await post_or_update(keyword, body, new_tier, src_msg)
|
| 423 |
except Exception as e:
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
db_save_last_posted(keyword, msg_id, new_tier)
|
| 429 |
-
else:
|
| 430 |
-
pass # no-op
|
| 431 |
-
|
| 432 |
|
| 433 |
# ========= Core actions (fallback kept) =========
|
| 434 |
async def send_as_is(msg, text_override: Optional[str] = None) -> None:
|
|
@@ -456,78 +328,32 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
|
|
| 456 |
ext = ".jpg"
|
| 457 |
mt = (getattr(doc, "mime_type", "") or "").lower()
|
| 458 |
if mt:
|
| 459 |
-
|
| 460 |
-
if
|
| 461 |
-
|
| 462 |
-
ext = ext_guess
|
| 463 |
bio.name = f"media{ext}"
|
| 464 |
await client.send_file(TARGET_CHAT, bio, caption=orig_text, caption_entities=entities, force_document=False)
|
| 465 |
return
|
| 466 |
-
except FloodWaitError as
|
| 467 |
-
|
| 468 |
except Exception as e:
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
try:
|
| 472 |
-
await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
|
| 473 |
-
except FloodWaitError as e:
|
| 474 |
-
await asyncio.sleep(e.seconds + 1)
|
| 475 |
-
await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
|
| 476 |
|
|
|
|
| 477 |
|
| 478 |
-
# ======
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
def _extract_tickers(text_norm: str) -> List[str]:
|
| 483 |
-
"""
|
| 484 |
-
Ambil $TICKER dengan dua cara:
|
| 485 |
-
- Bersih: $ABC, $JBCOIN
|
| 486 |
-
- Noisy: $J*BCOIN -> dinormalisasi jadi $JBCOIN untuk *keyword* saja.
|
| 487 |
-
(Teks asli tetap dikirim apa adanya.)
|
| 488 |
-
"""
|
| 489 |
-
found = []
|
| 490 |
-
|
| 491 |
-
# bersih
|
| 492 |
-
for m in TICKER_CLEAN_RE.finditer(text_norm):
|
| 493 |
-
found.append(m.group(0).lower())
|
| 494 |
-
|
| 495 |
-
# noisy -> normalisasi internal
|
| 496 |
-
for m in TICKER_NOISY_RE.finditer(text_norm):
|
| 497 |
-
raw = m.group(0)
|
| 498 |
-
norm = "$" + re.sub(r"[^A-Za-z0-9]+", "", raw[1:])
|
| 499 |
-
if 3 <= len(norm) <= 13: # termasuk '$'
|
| 500 |
-
found.append(norm.lower())
|
| 501 |
-
|
| 502 |
-
# unik & pertahankan urutan
|
| 503 |
-
seen = set()
|
| 504 |
-
uniq = []
|
| 505 |
-
for x in found:
|
| 506 |
-
if x not in seen:
|
| 507 |
-
uniq.append(x)
|
| 508 |
-
seen.add(x)
|
| 509 |
-
return uniq
|
| 510 |
|
| 511 |
def _extract_all_keywords(text_norm: str) -> List[str]:
|
| 512 |
-
"""
|
| 513 |
-
Deteksi SEMUA keyword dari THEME_KEYWORDS + $ticker.
|
| 514 |
-
Tidak menghapus simbol '$' (sesuai permintaan).
|
| 515 |
-
"""
|
| 516 |
-
# toleran untuk pencarian keyword tema (seperti semula)
|
| 517 |
-
t = re.sub(r"\$([a-z0-9]+)", r"\1", text_norm, flags=re.I)
|
| 518 |
-
|
| 519 |
found = []
|
| 520 |
-
for
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
#
|
| 525 |
-
tickers = _extract_tickers(text_norm)
|
| 526 |
-
found.extend(tickers)
|
| 527 |
-
|
| 528 |
-
# unik dengan urutan muncul pertama
|
| 529 |
-
uniq = []
|
| 530 |
seen = set()
|
|
|
|
| 531 |
for kw in found:
|
| 532 |
if kw not in seen:
|
| 533 |
uniq.append(kw)
|
|
@@ -549,7 +375,9 @@ def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
|
|
| 549 |
return chosen
|
| 550 |
|
| 551 |
def _role_of(chat_id: int) -> str:
|
| 552 |
-
|
|
|
|
|
|
|
| 553 |
|
| 554 |
def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
|
| 555 |
"""
|
|
@@ -559,12 +387,38 @@ def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
|
|
| 559 |
bucket = keyword_group_last_seen.get(keyword, {})
|
| 560 |
core_ids, sup_ids = set(), set()
|
| 561 |
for gk in bucket.keys():
|
| 562 |
-
role = chat_roles.get(int(gk), "
|
| 563 |
(core_ids if role == "core" else sup_ids).add(gk)
|
| 564 |
return len(core_ids), len(sup_ids)
|
| 565 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
|
| 567 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
"""
|
| 569 |
Filter, content-dedup, relevansi, multi-kw -> pilih dominan,
|
| 570 |
agregasi tier, gating support (CORE-anchored), filter ajakan, dan POST/EDIT.
|
|
@@ -583,12 +437,16 @@ async def process_message(msg, source_chat_id: int) -> None:
|
|
| 583 |
if ch in recent_content_hashes:
|
| 584 |
debug_log("Content-duplicate (global), dilewati", orig_text)
|
| 585 |
return
|
| 586 |
-
recent_content_hashes.
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
if h in recent_hashes:
|
| 591 |
-
debug_log("
|
| 592 |
return
|
| 593 |
recent_hashes.append(h)
|
| 594 |
|
|
@@ -612,22 +470,19 @@ async def process_message(msg, source_chat_id: int) -> None:
|
|
| 612 |
now = datetime.now(timezone.utc)
|
| 613 |
class_label, unique_groups = update_and_classify(main_kw, group_key, now)
|
| 614 |
|
| 615 |
-
# Gating SUPPORT (
|
| 616 |
if role == "support":
|
| 617 |
core_u, sup_u = _unique_counts_by_role(main_kw)
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
elif sup_u < SUPPORT_MIN_UNIQUE:
|
| 624 |
-
debug_log(f"Support ditahan (core_u={core_u}, sup_u={sup_u} < {SUPPORT_MIN_UNIQUE})", orig_text)
|
| 625 |
-
return
|
| 626 |
|
| 627 |
-
# Filter
|
| 628 |
-
cleaned_body = filter_invite_sentences(orig_text)
|
| 629 |
-
if not cleaned_body
|
| 630 |
-
debug_log("
|
| 631 |
return
|
| 632 |
|
| 633 |
# Backfill safety: saat startup, hindari pesan yang terlalu lama
|
|
@@ -638,9 +493,21 @@ async def process_message(msg, source_chat_id: int) -> None:
|
|
| 638 |
debug_log("Lama (lewat cutoff backfill safety), dilewati", orig_text)
|
| 639 |
return
|
| 640 |
|
| 641 |
-
|
| 642 |
debug_log(f"Posted/Edited (role={role}, unique_groups={unique_groups}, kw={main_kw}, tier={class_label})", orig_text)
|
| 643 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 644 |
|
| 645 |
async def backfill_history(entity, limit: int) -> None:
|
| 646 |
if limit <= 0:
|
|
@@ -652,17 +519,6 @@ async def backfill_history(entity, limit: int) -> None:
|
|
| 652 |
except Exception as e:
|
| 653 |
debug_log("Error saat memproses backfill", str(e))
|
| 654 |
|
| 655 |
-
|
| 656 |
-
# ========= Event handlers =========
|
| 657 |
-
@client.on(events.NewMessage(chats=SOURCE_CHATS))
|
| 658 |
-
async def on_new_message(event):
|
| 659 |
-
try:
|
| 660 |
-
await process_message(event.message, source_chat_id=event.chat_id)
|
| 661 |
-
except Exception as e:
|
| 662 |
-
print("Process error:", e)
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
# ========= Entry points =========
|
| 666 |
async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
|
| 667 |
resolved = []
|
| 668 |
for src in raw_list:
|
|
@@ -671,37 +527,16 @@ async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
|
|
| 671 |
resolved.append(ent)
|
| 672 |
chat_roles[int(ent.id)] = role_label
|
| 673 |
except Exception as e:
|
| 674 |
-
print(f"Gagal resolve
|
| 675 |
return resolved
|
| 676 |
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
_init_db()
|
| 680 |
-
|
| 681 |
-
# Load persisted state
|
| 682 |
-
global last_posted, keyword_group_last_seen
|
| 683 |
-
last_posted, keyword_group_last_seen = db_load_state()
|
| 684 |
-
|
| 685 |
-
resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
|
| 686 |
-
resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
|
| 687 |
-
resolved_sources = resolved_core + resolved_support
|
| 688 |
-
|
| 689 |
-
for ent in resolved_sources:
|
| 690 |
-
try:
|
| 691 |
-
await backfill_history(ent, INITIAL_BACKFILL)
|
| 692 |
-
except Exception as e:
|
| 693 |
-
print(f"Backfill gagal untuk {ent}: {e}")
|
| 694 |
-
|
| 695 |
-
print("Kurator berjalan (background task). Menunggu pesan baru...")
|
| 696 |
-
asyncio.create_task(client.run_until_disconnected())
|
| 697 |
-
|
| 698 |
-
async def app_main() -> None:
|
| 699 |
await client.start()
|
| 700 |
_init_db()
|
|
|
|
| 701 |
|
| 702 |
-
|
| 703 |
-
last_posted, keyword_group_last_seen = db_load_state()
|
| 704 |
-
|
| 705 |
resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
|
| 706 |
resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
|
| 707 |
resolved_sources = resolved_core + resolved_support
|
|
@@ -712,6 +547,5 @@ async def app_main() -> None:
|
|
| 712 |
print("Kurator berjalan. Menunggu pesan baru... (Stop dengan interrupt).")
|
| 713 |
await client.run_until_disconnected()
|
| 714 |
|
| 715 |
-
|
| 716 |
if __name__ == "__main__":
|
| 717 |
asyncio.run(app_main())
|
|
|
|
| 1 |
+
# botsignal.py (patched)
|
| 2 |
import asyncio
|
| 3 |
import os
|
| 4 |
import re
|
|
|
|
| 15 |
from telethon.sessions import StringSession, MemorySession
|
| 16 |
from telethon.errors.rpcerrorlist import FloodWaitError
|
| 17 |
|
|
|
|
|
|
|
| 18 |
API_ID = int(os.environ.get("API_ID", "0"))
|
| 19 |
API_HASH = os.environ.get("API_HASH", "")
|
| 20 |
+
SESSION = os.environ.get("SESSION", "")
|
| 21 |
+
TARGET_CHAT = os.environ.get("TARGET_CHAT", "") # @username atau chat id
|
| 22 |
+
# Sumber
|
| 23 |
+
CORE_CHATS = [s.strip() for s in os.environ.get("CORE_CHATS", "").split(",") if s.strip()]
|
| 24 |
+
SUPPORT_CHATS = [s.strip() for s in os.environ.get("SUPPORT_CHATS", "").split(",") if s.strip()]
|
| 25 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
SOURCE_CHATS = CORE_CHATS + SUPPORT_CHATS
|
| 27 |
|
| 28 |
+
DB_PATH = os.environ.get("DB_PATH", "botsignal.db")
|
| 29 |
+
|
| 30 |
+
# ====== Tokenisasi / relevansi ======
|
| 31 |
+
def _tokenize_words(s: str) -> List[str]:
|
| 32 |
+
return re.findall(r"[a-zA-Z0-9\$\#]{2,}", s or "")
|
| 33 |
+
|
| 34 |
+
def normalize_for_filter(s: str) -> str:
|
| 35 |
+
if not s:
|
| 36 |
+
return ""
|
| 37 |
+
t = s
|
| 38 |
+
# netralkan alamat kontrak (sol/eth panjang)
|
| 39 |
+
t = re.sub(r"\b[1-9A-HJ-NP-Za-km-z]{25,}\b", "CA", t)
|
| 40 |
+
# hapus url/mention
|
| 41 |
+
t = _strip_urls_and_mentions(t)
|
| 42 |
+
return t
|
| 43 |
+
|
| 44 |
+
URL_REGEX = re.compile(r"(https?:\/\/\S+)", re.IGNORECASE)
|
| 45 |
+
MENTION_REGEX = re.compile(r"@\w+", re.IGNORECASE)
|
| 46 |
+
|
| 47 |
+
def _strip_urls_and_mentions(s: str) -> str:
|
| 48 |
+
s = URL_REGEX.sub("", s)
|
| 49 |
+
s = MENTION_REGEX.sub("", s)
|
| 50 |
+
return s
|
| 51 |
|
|
|
|
| 52 |
THEME_KEYWORDS = [
|
| 53 |
"call", "signal", "entry", "buy", "sell", "tp", "sl",
|
| 54 |
"pump", "spot", "futures", "setup",
|
|
|
|
| 72 |
|
| 73 |
CLASS_WINDOW_MINUTES = int(os.environ.get("CLASS_WINDOW_MINUTES", "10"))
|
| 74 |
|
| 75 |
+
# Dulu dipakai untuk bypass support-only; sekarang hanya aktif bila ALLOW_SUPPORT_SOLO=1
|
| 76 |
SUPPORT_MIN_UNIQUE = int(os.environ.get("SUPPORT_MIN_UNIQUE", "2"))
|
| 77 |
|
| 78 |
# New: DRY RUN (tidak kirim apa pun ke TARGET_CHAT)
|
|
|
|
| 81 |
# Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
|
| 82 |
BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
|
| 83 |
|
| 84 |
+
# >>> Tambahan ENV untuk mode support-only (default OFF, strict core-anchored) <<<
|
| 85 |
+
ALLOW_SUPPORT_SOLO = os.environ.get("ALLOW_SUPPORT_SOLO", "0") == "1"
|
| 86 |
+
SUPPORT_SOLO_MIN_UNIQUE = int(os.environ.get("SUPPORT_SOLO_MIN_UNIQUE", "99"))
|
| 87 |
|
| 88 |
+
# ====== Client ======
|
| 89 |
+
if SESSION:
|
| 90 |
+
session = StringSession(SESSION)
|
| 91 |
+
else:
|
| 92 |
+
session = MemorySession()
|
|
|
|
|
|
|
| 93 |
|
| 94 |
+
client = TelegramClient(session, API_ID, API_HASH)
|
|
|
|
|
|
|
| 95 |
|
| 96 |
+
# ====== State ======
|
|
|
|
| 97 |
startup_time_utc = datetime.now(timezone.utc)
|
| 98 |
|
| 99 |
+
recent_hashes = deque(maxlen=DEDUP_BUFFER_SIZE)
|
| 100 |
+
recent_content_hashes = set()
|
| 101 |
|
| 102 |
+
# chat_roles: chat_id -> "core"/"support"
|
| 103 |
+
chat_roles: Dict[int, str] = {}
|
| 104 |
+
|
| 105 |
+
# agregasi keyword -> group_id -> last_seen_utc
|
| 106 |
+
keyword_group_last_seen: defaultdict[str, dict[str, datetime]] = defaultdict(dict)
|
| 107 |
|
| 108 |
def _db():
|
| 109 |
conn = sqlite3.connect(DB_PATH)
|
| 110 |
conn.execute("PRAGMA journal_mode=WAL;")
|
| 111 |
+
conn.execute("PRAGMA synchronous=NORMAL;")
|
| 112 |
return conn
|
| 113 |
|
| 114 |
def _init_db():
|
|
|
|
| 141 |
conn.close()
|
| 142 |
return last, kw_map
|
| 143 |
|
| 144 |
+
def db_save_group_seen(keyword: str, group_key: str, ts: datetime):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
conn = _db()
|
| 146 |
+
conn.execute(
|
| 147 |
+
"INSERT OR REPLACE INTO kw_group_seen(keyword, group_key, last_ts) VALUES (?, ?, ?)",
|
| 148 |
+
(keyword, group_key, int(ts.timestamp())),
|
| 149 |
+
)
|
| 150 |
conn.commit()
|
| 151 |
conn.close()
|
| 152 |
|
| 153 |
+
def db_save_last_posted(keyword: str, msg_id: int, tier: str):
|
| 154 |
conn = _db()
|
| 155 |
+
conn.execute(
|
| 156 |
+
"INSERT OR REPLACE INTO last_posted(keyword, msg_id, tier) VALUES (?, ?, ?)",
|
| 157 |
+
(keyword, msg_id, tier),
|
| 158 |
+
)
|
| 159 |
conn.commit()
|
| 160 |
conn.close()
|
| 161 |
|
| 162 |
+
def score_relevance(t: str, keywords: List[str]) -> float:
|
| 163 |
+
# exact: jumlah kemunculan kw
|
| 164 |
+
exact_hits = sum(
|
| 165 |
+
len(re.findall(rf"(^|\W){re.escape(kw)}(\W|$)", t, flags=re.I)) for kw in keywords
|
| 166 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
exact_score = exact_hits * KEYWORD_WEIGHT
|
| 168 |
|
| 169 |
# fuzzy windowed: ambil top-3 skor di antara jendela 20 token
|
|
|
|
| 195 |
if doc and getattr(doc, "id", None) is not None:
|
| 196 |
parts.append(f"doc:{doc.id}")
|
| 197 |
if getattr(msg, "photo", None) is not None:
|
| 198 |
+
parts.append("has_photo")
|
| 199 |
+
h = hashlib.sha256("|".join(parts).encode("utf-8")).hexdigest()
|
| 200 |
+
return h
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
def content_only_hash(text: str) -> str:
|
| 203 |
+
t = text or ""
|
| 204 |
+
# netralkan angka/CA/url/mention agar “teks ajakan” sama ke-dedup
|
| 205 |
+
t = re.sub(r"\b\d{2,}\b", "NUM", t)
|
| 206 |
+
t = re.sub(r"\b[1-9A-HJ-NP-Za-km-z]{25,}\b", "CA", t)
|
| 207 |
+
t = _strip_urls_and_mentions(t)
|
| 208 |
+
return hashlib.sha256(t.encode("utf-8")).hexdigest()
|
| 209 |
+
|
| 210 |
+
def _windows(tokens: List[str], k: int):
|
| 211 |
+
for i in range(0, len(tokens), max(1, k // 2)):
|
| 212 |
+
yield " ".join(tokens[i:i + k])
|
| 213 |
+
|
| 214 |
+
# ====== Media helpers ======
|
| 215 |
def is_image_message(msg) -> bool:
|
| 216 |
if getattr(msg, "photo", None) is not None:
|
| 217 |
return True
|
| 218 |
doc = getattr(msg, "document", None)
|
| 219 |
+
if not doc:
|
| 220 |
+
return False
|
| 221 |
+
mt = (getattr(doc, "mime_type", "") or "").lower()
|
| 222 |
+
if mt.startswith("image/"):
|
| 223 |
+
return True
|
| 224 |
+
if not ALLOW_GIFS_VIDEOS and (mt.startswith("video/") or mt == "image/gif"):
|
| 225 |
+
return False
|
| 226 |
+
return True
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
def media_too_big(msg) -> bool:
|
| 229 |
doc = getattr(msg, "document", None)
|
| 230 |
+
if not doc:
|
| 231 |
+
return False
|
| 232 |
+
size = getattr(doc, "size", None)
|
| 233 |
+
if size is None:
|
| 234 |
+
return False
|
| 235 |
+
return (size / (1024 * 1024)) > MAX_MEDIA_MB
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
+
# ====== Ajakan filter ======
|
| 238 |
INVITE_PATTERNS = [
|
| 239 |
+
r"\bjoin\b", r"\bdm\b", r"\binbox\b", r"\bpm\b", r"\bvip\b",
|
| 240 |
+
r"\bcontact\b", r"\bpromo\b", r"\bpaid\b", r"@",
|
| 241 |
+
r"t\.me\/", r"telegram\.me\/", r"\blink\b", r"\bklik\b", r"\bhubungi\b",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
r"\bpromo\b", r"\bpromosi\b", r"\biklan\b",
|
| 243 |
+
r"\badvert\b", r"\badvertise\b", r"\badvertisement\b"
|
| 244 |
+
|
| 245 |
]
|
| 246 |
INVITE_REGEXES = [re.compile(p, re.IGNORECASE) for p in INVITE_PATTERNS]
|
| 247 |
|
|
|
|
| 257 |
t = s.strip()
|
| 258 |
if not t:
|
| 259 |
return False
|
| 260 |
+
# (ASAL) Jika kalimat memuat sinyal kuat, jangan dibuang walau ada kata invite
|
| 261 |
if any(r.search(t) for r in WHITELIST_REGEXES):
|
| 262 |
return False
|
| 263 |
# Jika ada 1+ pola ajakan, buang
|
|
|
|
| 272 |
cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
|
| 273 |
return cleaned
|
| 274 |
|
| 275 |
+
# ====== Posting ======
|
| 276 |
+
async def post_or_update(keyword: str, body: str, class_label: str, msg):
|
| 277 |
+
"""
|
| 278 |
+
Post baru jika belum ada, jika sudah ada posting untuk keyword tsb → edit.
|
| 279 |
+
"""
|
| 280 |
+
# muat last posted dari DB
|
| 281 |
+
last_posted, _ = db_load_state()
|
| 282 |
+
last = last_posted.get(keyword)
|
| 283 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
if DRY_RUN:
|
| 285 |
+
print(f"[DRY_RUN] ({class_label}) [{keyword}] {body[:160]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
return
|
| 287 |
|
| 288 |
+
if not last:
|
| 289 |
+
# post baru
|
| 290 |
+
sent = await client.send_message(TARGET_CHAT, body)
|
| 291 |
+
db_save_last_posted(keyword, sent.id, class_label)
|
| 292 |
+
return
|
| 293 |
+
else:
|
| 294 |
+
# edit posting lama
|
| 295 |
+
last_msg_id = last["msg_id"]
|
| 296 |
try:
|
| 297 |
+
await client.edit_message(TARGET_CHAT, last_msg_id, body)
|
| 298 |
+
db_save_last_posted(keyword, last_msg_id, class_label)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
except Exception as e:
|
| 300 |
+
print("Edit failed, posting baru:", e)
|
| 301 |
+
sent = await client.send_message(TARGET_CHAT, body)
|
| 302 |
+
db_save_last_posted(keyword, sent.id, class_label)
|
| 303 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
|
| 305 |
# ========= Core actions (fallback kept) =========
|
| 306 |
async def send_as_is(msg, text_override: Optional[str] = None) -> None:
|
|
|
|
| 328 |
ext = ".jpg"
|
| 329 |
mt = (getattr(doc, "mime_type", "") or "").lower()
|
| 330 |
if mt:
|
| 331 |
+
guessed = guess_extension(mt) or ""
|
| 332 |
+
if guessed:
|
| 333 |
+
ext = guessed
|
|
|
|
| 334 |
bio.name = f"media{ext}"
|
| 335 |
await client.send_file(TARGET_CHAT, bio, caption=orig_text, caption_entities=entities, force_document=False)
|
| 336 |
return
|
| 337 |
+
except FloodWaitError as fw:
|
| 338 |
+
print("FloodWait:", fw)
|
| 339 |
except Exception as e:
|
| 340 |
+
print("send_file err:", e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
|
| 342 |
+
await client.send_message(TARGET_CHAT, orig_text)
|
| 343 |
|
| 344 |
+
# ====== Keyword extraction / aggregation ======
|
| 345 |
+
# $TICKER mentions: a '$' followed by 2-10 alphanumerics, matched case-insensitively.
TICKER_RE = re.compile(r"\$[a-z0-9]{2,10}", re.IGNORECASE)
# Bare coin/trading words that also count as keywords when they appear as whole words.
COINWORDS = ["btc","eth","sol","bnb","pepe","doge","meme","spot","futures","pump","entry","buy","sell","tp","sl","setup"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
|
| 348 |
def _extract_all_keywords(text_norm: str) -> List[str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
found = []
|
| 350 |
+
found += [m.group(0).lower() for m in TICKER_RE.finditer(text_norm)]
|
| 351 |
+
for w in COINWORDS:
|
| 352 |
+
if re.search(rf"(^|\W){re.escape(w)}(\W|$)", text_norm, flags=re.I):
|
| 353 |
+
found.append(w)
|
| 354 |
+
# unik, preserve order
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
seen = set()
|
| 356 |
+
uniq = []
|
| 357 |
for kw in found:
|
| 358 |
if kw not in seen:
|
| 359 |
uniq.append(kw)
|
|
|
|
| 375 |
return chosen
|
| 376 |
|
| 377 |
def _role_of(chat_id: int) -> str:
    """Return the curation role ("core" or "support") recorded for *chat_id*.

    Unknown chats are treated as "support". (Previously the default was
    "core", which let messages through when chat resolution failed.)
    """
    return chat_roles.get(chat_id, "support")
|
| 381 |
|
| 382 |
def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
|
| 383 |
"""
|
|
|
|
| 387 |
bucket = keyword_group_last_seen.get(keyword, {})
|
| 388 |
core_ids, sup_ids = set(), set()
|
| 389 |
for gk in bucket.keys():
|
| 390 |
+
role = chat_roles.get(int(gk), "support")
|
| 391 |
(core_ids if role == "core" else sup_ids).add(gk)
|
| 392 |
return len(core_ids), len(sup_ids)
|
| 393 |
|
| 394 |
+
def update_and_classify(keyword: str, group_key: str, ts: datetime) -> Tuple[str, int]:
    """Record a sighting of *keyword* in *group_key* at *ts* and classify it.

    Maintains a sliding window of CLASS_WINDOW_MINUTES minutes per keyword and
    returns a (label, unique_group_count) pair, where the label reflects how
    many distinct groups mentioned the keyword inside the window.
    """
    # Evict sightings that have aged out of the classification window.
    horizon = ts - timedelta(minutes=CLASS_WINDOW_MINUTES)
    bucket = keyword_group_last_seen[keyword]
    for stale in [g for g, seen_at in bucket.items() if seen_at < horizon]:
        del bucket[stale]

    # Record the fresh sighting both in memory and in the DB.
    bucket[group_key] = ts
    db_save_group_seen(keyword, group_key, ts)

    unique_groups = len(bucket)

    # Tier the keyword by how many distinct groups mention it concurrently.
    if unique_groups >= 4:
        tier = "🔥 HOT"
    elif unique_groups >= 2:
        tier = "☀️ SEDANG"
    else:
        tier = "🌱 BARU"
    return tier, unique_groups
|
| 420 |
+
|
| 421 |
+
def process_message(msg, source_chat_id: int) -> None:
|
| 422 |
"""
|
| 423 |
Filter, content-dedup, relevansi, multi-kw -> pilih dominan,
|
| 424 |
agregasi tier, gating support (CORE-anchored), filter ajakan, dan POST/EDIT.
|
|
|
|
| 437 |
if ch in recent_content_hashes:
|
| 438 |
debug_log("Content-duplicate (global), dilewati", orig_text)
|
| 439 |
return
|
| 440 |
+
recent_content_hashes.add(ch)
|
| 441 |
+
if len(recent_content_hashes) > DEDUP_BUFFER_SIZE * 2:
|
| 442 |
+
# jaga ukuran set
|
| 443 |
+
recent_content_hashes.clear()
|
| 444 |
+
recent_content_hashes.add(ch)
|
| 445 |
+
|
| 446 |
+
# Hash per pesan (teks+media) untuk guard cepat
|
| 447 |
+
h = hash_for_dedup(orig_text, msg)
|
| 448 |
if h in recent_hashes:
|
| 449 |
+
debug_log("Duplicate (hash recent), dilewati", orig_text)
|
| 450 |
return
|
| 451 |
recent_hashes.append(h)
|
| 452 |
|
|
|
|
| 470 |
now = datetime.now(timezone.utc)
|
| 471 |
class_label, unique_groups = update_and_classify(main_kw, group_key, now)
|
| 472 |
|
| 473 |
+
# Gating SUPPORT (strict core-anchored unless env allows 'solo')
|
| 474 |
if role == "support":
|
| 475 |
core_u, sup_u = _unique_counts_by_role(main_kw)
|
| 476 |
+
if core_u < 1:
|
| 477 |
+
# if solo support not allowed or not strong enough, hold
|
| 478 |
+
if (not ALLOW_SUPPORT_SOLO) or (sup_u < SUPPORT_SOLO_MIN_UNIQUE):
|
| 479 |
+
debug_log(f"Support ditahan (butuh anchor core; core_u={core_u}, sup_u={sup_u})", orig_text)
|
| 480 |
+
return
|
|
|
|
|
|
|
|
|
|
| 481 |
|
| 482 |
+
# Filter ajakan per-kalimat
|
| 483 |
+
cleaned_body = filter_invite_sentences(orig_text).strip()
|
| 484 |
+
if not cleaned_body:
|
| 485 |
+
debug_log("Habis difilter, kosong → dilewati", orig_text)
|
| 486 |
return
|
| 487 |
|
| 488 |
# Backfill safety: saat startup, hindari pesan yang terlalu lama
|
|
|
|
| 493 |
debug_log("Lama (lewat cutoff backfill safety), dilewati", orig_text)
|
| 494 |
return
|
| 495 |
|
| 496 |
+
asyncio.create_task(post_or_update(main_kw, cleaned_body, class_label, msg))
|
| 497 |
debug_log(f"Posted/Edited (role={role}, unique_groups={unique_groups}, kw={main_kw}, tier={class_label})", orig_text)
|
| 498 |
|
| 499 |
+
# ========= Event handlers =========
|
| 500 |
+
@client.on(events.NewMessage(chats=SOURCE_CHATS))
async def on_new_message(event):
    """Route every new message from a source chat through the curation pipeline.

    Bug fix: ``process_message`` is declared as a plain (non-async) function,
    but it was awaited here — ``await None`` raises a TypeError on every
    message, which the broad except then silently logged as "Process error",
    so no message was ever curated. It is now called synchronously; if a
    future refactor turns it back into a coroutine, the result is awaited.
    """
    try:
        result = process_message(event.message, source_chat_id=event.chat_id)
        if asyncio.iscoroutine(result):
            # Tolerate process_message being made async again.
            await result
    except Exception as e:
        # Keep the handler alive: one bad message must not kill the event loop.
        print("Process error:", e)
|
| 506 |
+
|
| 507 |
+
# ========= Utilities =========
|
| 508 |
+
def debug_log(tag: str, body: str):
    """Print a single timestamped debug line, flattening newlines in *body*.

    Only the first 200 characters of *body* are shown; embedded newlines are
    replaced with " / " so the entry stays on one console line.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    snippet = body[:200].replace("\n", " / ")
    print(f"[{stamp}] {tag}: {snippet}")
|
| 511 |
|
| 512 |
async def backfill_history(entity, limit: int) -> None:
|
| 513 |
if limit <= 0:
|
|
|
|
| 519 |
except Exception as e:
|
| 520 |
debug_log("Error saat memproses backfill", str(e))
|
| 521 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
|
| 523 |
resolved = []
|
| 524 |
for src in raw_list:
|
|
|
|
| 527 |
resolved.append(ent)
|
| 528 |
chat_roles[int(ent.id)] = role_label
|
| 529 |
except Exception as e:
|
| 530 |
+
print(f"Gagal resolve {src}: {e}")
|
| 531 |
return resolved
|
| 532 |
|
| 533 |
+
# ========= Entry points =========
|
| 534 |
+
async def app_main():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
await client.start()
|
| 536 |
_init_db()
|
| 537 |
+
_ = db_load_state() # pre-load caches
|
| 538 |
|
| 539 |
+
# resolve semua chats dan tag role
|
|
|
|
|
|
|
| 540 |
resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
|
| 541 |
resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
|
| 542 |
resolved_sources = resolved_core + resolved_support
|
|
|
|
| 547 |
print("Kurator berjalan. Menunggu pesan baru... (Stop dengan interrupt).")
|
| 548 |
await client.run_until_disconnected()
|
| 549 |
|
|
|
|
| 550 |
# Script entry point: start the Telegram curator and block until disconnect.
if __name__ == "__main__":
    asyncio.run(app_main())
|