Spaces:
Sleeping
Sleeping
Update botsignal.py
Browse files- botsignal.py +107 -49
botsignal.py
CHANGED
|
@@ -35,6 +35,7 @@ SUPPORT_CHATS = [
|
|
| 35 |
"https://t.me/TheDonALPHAJournal",
|
| 36 |
"https://t.me/savascalls",
|
| 37 |
"https://t.me/Tanjirocall",
|
|
|
|
| 38 |
"https://t.me/ChapoInsider",
|
| 39 |
"https://t.me/millionsgems",
|
| 40 |
"https://t.me/Milagrosdegencalls",
|
|
@@ -93,6 +94,8 @@ def build_client() -> TelegramClient:
|
|
| 93 |
client = build_client()
|
| 94 |
recent_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE)
|
| 95 |
recent_content_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE) # content-only dedup
|
|
|
|
|
|
|
| 96 |
|
| 97 |
# Peta id_chat -> "core" / "support"
|
| 98 |
chat_roles: Dict[int, str] = {} # diisi saat startup setelah resolve entity
|
|
@@ -109,7 +112,8 @@ def _db():
|
|
| 109 |
|
| 110 |
def _init_db():
|
| 111 |
conn = _db()
|
| 112 |
-
conn.executescript(
|
|
|
|
| 113 |
CREATE TABLE IF NOT EXISTS last_posted (
|
| 114 |
keyword TEXT PRIMARY KEY,
|
| 115 |
msg_id INTEGER NOT NULL,
|
|
@@ -121,7 +125,8 @@ def _init_db():
|
|
| 121 |
last_ts INTEGER NOT NULL,
|
| 122 |
PRIMARY KEY (keyword, group_key)
|
| 123 |
);
|
| 124 |
-
"""
|
|
|
|
| 125 |
conn.commit()
|
| 126 |
conn.close()
|
| 127 |
|
|
@@ -139,17 +144,21 @@ def db_load_state():
|
|
| 139 |
|
| 140 |
def db_save_last_posted(keyword: str, msg_id: int, tier: str):
|
| 141 |
conn = _db()
|
| 142 |
-
conn.execute(
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
| 145 |
conn.commit()
|
| 146 |
conn.close()
|
| 147 |
|
| 148 |
def db_upsert_kw_seen(keyword: str, group_key: str, ts: datetime):
|
| 149 |
conn = _db()
|
| 150 |
-
conn.execute(
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
| 153 |
conn.commit()
|
| 154 |
conn.close()
|
| 155 |
|
|
@@ -177,19 +186,21 @@ def _tokenize_words(s: str) -> List[str]:
|
|
| 177 |
|
| 178 |
def _windows(tokens: List[str], size: int = 20):
|
| 179 |
for i in range(0, len(tokens), size):
|
| 180 |
-
yield " ".join(tokens[i:i+size])
|
| 181 |
|
| 182 |
# --- Tambahan: bersihkan URL/CA untuk kepentingan SCORING relevansi ---
|
| 183 |
CA_SOL_RE = re.compile(r"\b[1-9A-HJ-NP-Za-km-z]{32,48}\b") # Solana base58 (perkiraan)
|
| 184 |
-
CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b")
|
| 185 |
CA_LABEL_RE = re.compile(r"\bCA\s*[:=]\s*\S+", re.IGNORECASE) # "CA: ..." potong tokennya
|
| 186 |
|
|
|
|
| 187 |
def _strip_urls_and_mentions(s: str) -> str:
|
| 188 |
s = re.sub(r"https?://\S+", "", s)
|
| 189 |
s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
|
| 190 |
s = re.sub(r"@[A-Za-z0-9_]+", "", s)
|
| 191 |
return re.sub(r"\s+", " ", s).strip()
|
| 192 |
|
|
|
|
| 193 |
def strip_contracts_for_scoring(s: str) -> str:
|
| 194 |
"""
|
| 195 |
Hilangkan URL/mention, alamat kontrak, dan token setelah 'CA:'
|
|
@@ -201,6 +212,7 @@ def strip_contracts_for_scoring(s: str) -> str:
|
|
| 201 |
s3 = CA_SOL_RE.sub(" ", s2)
|
| 202 |
return re.sub(r"\s+", " ", s3).strip()
|
| 203 |
|
|
|
|
| 204 |
def score_relevance(text: str, keywords: List[str]) -> float:
|
| 205 |
"""Skor: exact keyword + fuzzy windowed (top-3 rata-rata) agar adil untuk teks panjang."""
|
| 206 |
if not text:
|
|
@@ -233,6 +245,7 @@ def score_relevance(text: str, keywords: List[str]) -> float:
|
|
| 233 |
|
| 234 |
return exact_score + fuzzy_score
|
| 235 |
|
|
|
|
| 236 |
def hash_for_dedup(text: str, msg) -> str:
|
| 237 |
"""Hash campuran (lama) – menahan duplikat per pesan+media."""
|
| 238 |
parts = [text or ""]
|
|
@@ -249,37 +262,17 @@ def hash_for_dedup(text: str, msg) -> str:
|
|
| 249 |
raw = "|".join(parts).encode("utf-8", errors="ignore")
|
| 250 |
return hashlib.sha1(raw).hexdigest()
|
| 251 |
|
|
|
|
| 252 |
def content_only_hash(text: str) -> str:
|
| 253 |
"""Hash berbasis isi saja (untuk lintas-grup crosspost)."""
|
| 254 |
norm = _strip_urls_and_mentions(normalize_for_filter(text))
|
| 255 |
return hashlib.sha1(norm.encode("utf-8", errors="ignore")).hexdigest()
|
| 256 |
|
| 257 |
-
def is_image_message(msg) -> bool:
|
| 258 |
-
if getattr(msg, "photo", None) is not None:
|
| 259 |
-
return True
|
| 260 |
-
doc = getattr(msg, "document", None)
|
| 261 |
-
if doc and getattr(doc, "mime_type", None):
|
| 262 |
-
mt = (doc.mime_type or "").lower()
|
| 263 |
-
if mt.startswith("image/"):
|
| 264 |
-
if SKIP_STICKERS and ("webp" in mt or "sticker" in mt):
|
| 265 |
-
return False
|
| 266 |
-
return True
|
| 267 |
-
if not ALLOW_GIFS_VIDEOS:
|
| 268 |
-
return False
|
| 269 |
-
if mt in ("video/mp4", "image/gif"):
|
| 270 |
-
return True
|
| 271 |
-
return False
|
| 272 |
-
|
| 273 |
-
def media_too_big(msg) -> bool:
|
| 274 |
-
doc = getattr(msg, "document", None)
|
| 275 |
-
if doc and getattr(doc, "size", None):
|
| 276 |
-
return (doc.size or 0) > MAX_MEDIA_MB * 1024 * 1024
|
| 277 |
-
return False
|
| 278 |
-
|
| 279 |
|
| 280 |
# ========= Class aggregator (windowed unique groups) =========
|
| 281 |
keyword_group_last_seen: defaultdict[str, dict[str, datetime]] = defaultdict(dict)
|
| 282 |
|
|
|
|
| 283 |
def _prune_expired(now: datetime) -> None:
|
| 284 |
window = timedelta(minutes=CLASS_WINDOW_MINUTES)
|
| 285 |
cutoff = now - window
|
|
@@ -293,6 +286,7 @@ def _prune_expired(now: datetime) -> None:
|
|
| 293 |
# db prune
|
| 294 |
db_prune_expired(cutoff)
|
| 295 |
|
|
|
|
| 296 |
def update_and_classify(keyword: str, group_key: str, now: Optional[datetime] = None) -> Tuple[str, int]:
|
| 297 |
if not now:
|
| 298 |
now = datetime.now(timezone.utc)
|
|
@@ -326,7 +320,7 @@ INVITE_PATTERNS = [
|
|
| 326 |
r"(t\.me\/joinchat|t\.me\/\+|telegram\.me\/|discord\.gg\/|wa\.me\/|whatsapp\.com\/)",
|
| 327 |
r"(bit\.ly|tinyurl\.com|linktr\.ee)",
|
| 328 |
# perluasan: link t.me biasa
|
| 329 |
-
r"t\.me\/[A-Za-z0-9_]+"
|
| 330 |
]
|
| 331 |
INVITE_REGEXES = [re.compile(p, re.IGNORECASE) for p in INVITE_PATTERNS]
|
| 332 |
|
|
@@ -338,6 +332,7 @@ WHITELIST_STRONG_SIGNAL = [
|
|
| 338 |
]
|
| 339 |
WHITELIST_REGEXES = [re.compile(p, re.IGNORECASE) for p in WHITELIST_STRONG_SIGNAL]
|
| 340 |
|
|
|
|
| 341 |
def _is_invite_sentence(s: str) -> bool:
|
| 342 |
t = s.strip()
|
| 343 |
if not t:
|
|
@@ -348,10 +343,11 @@ def _is_invite_sentence(s: str) -> bool:
|
|
| 348 |
# Jika ada 1+ pola ajakan, buang
|
| 349 |
return any(r.search(t) for r in INVITE_REGEXES)
|
| 350 |
|
|
|
|
| 351 |
def filter_invite_sentences(text: str) -> str:
|
| 352 |
if not text:
|
| 353 |
return text
|
| 354 |
-
parts = re.split(r'(?<=[
|
| 355 |
kept = [p.strip() for p in parts if p and not _is_invite_sentence(p)]
|
| 356 |
cleaned = "\n".join(kept).strip()
|
| 357 |
cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
|
|
@@ -362,6 +358,7 @@ def filter_invite_sentences(text: str) -> str:
|
|
| 362 |
TIER_ORDER = {"rendah": 0, "sedang": 1, "kuat": 2}
|
| 363 |
last_posted: Dict[str, Dict[str, object]] = {} # keyword -> {"msg_id": int, "tier": str}
|
| 364 |
|
|
|
|
| 365 |
async def _send_initial(msg, text: str) -> int:
|
| 366 |
if DRY_RUN:
|
| 367 |
print("[DRY_RUN] send_initial:", text[:140])
|
|
@@ -370,7 +367,9 @@ async def _send_initial(msg, text: str) -> int:
|
|
| 370 |
if INCLUDE_MEDIA and is_image_message(msg) and not media_too_big(msg):
|
| 371 |
try:
|
| 372 |
if getattr(msg, "photo", None):
|
| 373 |
-
m = await client.send_file(
|
|
|
|
|
|
|
| 374 |
return m.id
|
| 375 |
doc = getattr(msg, "document", None)
|
| 376 |
if doc:
|
|
@@ -385,7 +384,9 @@ async def _send_initial(msg, text: str) -> int:
|
|
| 385 |
ext_guess = ".jpg"
|
| 386 |
ext = ext_guess
|
| 387 |
bio.name = f"media{ext}"
|
| 388 |
-
m = await client.send_file(
|
|
|
|
|
|
|
| 389 |
return m.id
|
| 390 |
except FloodWaitError as e:
|
| 391 |
await asyncio.sleep(e.seconds + 1)
|
|
@@ -399,6 +400,7 @@ async def _send_initial(msg, text: str) -> int:
|
|
| 399 |
await asyncio.sleep(e.seconds + 1)
|
| 400 |
return await _send_initial(msg, text)
|
| 401 |
|
|
|
|
| 402 |
async def post_or_update(keyword: str, body: str, new_tier: str, src_msg) -> None:
|
| 403 |
prefix = f"[{new_tier.upper()}] "
|
| 404 |
text = prefix + body
|
|
@@ -445,7 +447,9 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
|
|
| 445 |
if INCLUDE_MEDIA and is_image_message(msg) and not media_too_big(msg):
|
| 446 |
try:
|
| 447 |
if getattr(msg, "photo", None):
|
| 448 |
-
await client.send_file(
|
|
|
|
|
|
|
| 449 |
return
|
| 450 |
doc = getattr(msg, "document", None)
|
| 451 |
if doc:
|
|
@@ -460,7 +464,9 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
|
|
| 460 |
ext_guess = ".jpg"
|
| 461 |
ext = ext_guess
|
| 462 |
bio.name = f"media{ext}"
|
| 463 |
-
await client.send_file(
|
|
|
|
|
|
|
| 464 |
return
|
| 465 |
except FloodWaitError as e:
|
| 466 |
await asyncio.sleep(e.seconds + 1)
|
|
@@ -478,6 +484,7 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
|
|
| 478 |
TICKER_CLEAN_RE = re.compile(r"\$[A-Za-z0-9]{2,12}")
|
| 479 |
TICKER_NOISY_RE = re.compile(r"\$[A-Za-z0-9](?:[^A-Za-z0-9]+[A-Za-z0-9]){1,11}")
|
| 480 |
|
|
|
|
| 481 |
def _extract_tickers(text_norm: str) -> List[str]:
|
| 482 |
"""
|
| 483 |
Ambil $TICKER dengan dua cara:
|
|
@@ -507,6 +514,7 @@ def _extract_tickers(text_norm: str) -> List[str]:
|
|
| 507 |
seen.add(x)
|
| 508 |
return uniq
|
| 509 |
|
|
|
|
| 510 |
def _extract_all_keywords(text_norm: str) -> List[str]:
|
| 511 |
"""
|
| 512 |
Deteksi SEMUA keyword dari THEME_KEYWORDS + $ticker.
|
|
@@ -533,6 +541,7 @@ def _extract_all_keywords(text_norm: str) -> List[str]:
|
|
| 533 |
seen.add(kw)
|
| 534 |
return uniq
|
| 535 |
|
|
|
|
| 536 |
def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
|
| 537 |
if not kws:
|
| 538 |
return None
|
|
@@ -547,10 +556,12 @@ def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
|
|
| 547 |
chosen = sorted(score.items(), key=lambda x: (x[1][0], x[1][1], x[1][2]), reverse=True)[0][0]
|
| 548 |
return chosen
|
| 549 |
|
|
|
|
| 550 |
def _role_of(chat_id: int) -> str:
|
| 551 |
# DEFAULT KE SUPPORT agar tidak salah meloloskan chat yang tidak tertag
|
| 552 |
return chat_roles.get(chat_id, "support")
|
| 553 |
|
|
|
|
| 554 |
def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
|
| 555 |
"""
|
| 556 |
Hitung jumlah grup unik yang menyebut 'keyword' dalam window aktif,
|
|
@@ -564,6 +575,30 @@ def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
|
|
| 564 |
return len(core_ids), len(sup_ids)
|
| 565 |
|
| 566 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
async def process_message(msg, source_chat_id: int) -> None:
|
| 568 |
"""
|
| 569 |
Filter, content-dedup, relevansi, multi-kw -> pilih dominan,
|
|
@@ -578,6 +613,12 @@ async def process_message(msg, source_chat_id: int) -> None:
|
|
| 578 |
debug_log("Dilewati karena EXCLUDE_PHRASES", orig_text)
|
| 579 |
return
|
| 580 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
# Content-only dedup (lintas grup)
|
| 582 |
ch = content_only_hash(orig_text)
|
| 583 |
if ch in recent_content_hashes:
|
|
@@ -600,28 +641,34 @@ async def process_message(msg, source_chat_id: int) -> None:
|
|
| 600 |
|
| 601 |
role = _role_of(source_chat_id) # 'core' / 'support'
|
| 602 |
|
| 603 |
-
# Multi-kw -> pilih satu dominan untuk agregasi
|
| 604 |
all_kws = _extract_all_keywords(text_norm)
|
| 605 |
main_kw = _choose_dominant_keyword(text_norm, all_kws)
|
| 606 |
-
|
| 607 |
-
|
|
|
|
|
|
|
|
|
|
| 608 |
return
|
| 609 |
|
| 610 |
-
# Agregasi & kelas
|
| 611 |
group_key = str(source_chat_id)
|
| 612 |
now = datetime.now(timezone.utc)
|
| 613 |
-
class_label, unique_groups = update_and_classify(
|
| 614 |
|
| 615 |
# Gating SUPPORT (CORE-anchored)
|
| 616 |
if role != "core":
|
| 617 |
-
core_u, sup_u = _unique_counts_by_role(
|
| 618 |
# Aturan:
|
| 619 |
-
# - Jika sudah ada minimal 1 sebutan dari CORE untuk
|
| 620 |
# - Jika belum ada anchor CORE, SUPPORT harus >= SUPPORT_MIN_UNIQUE.
|
| 621 |
if core_u >= 1:
|
| 622 |
pass
|
| 623 |
elif sup_u < SUPPORT_MIN_UNIQUE:
|
| 624 |
-
debug_log(
|
|
|
|
|
|
|
|
|
|
| 625 |
return
|
| 626 |
|
| 627 |
# Filter kalimat ajakan (whitelist-aware)
|
|
@@ -631,15 +678,24 @@ async def process_message(msg, source_chat_id: int) -> None:
|
|
| 631 |
return
|
| 632 |
|
| 633 |
# Backfill safety: saat startup, hindari pesan yang terlalu lama
|
| 634 |
-
cutoff = startup_time_utc - timedelta(
|
|
|
|
|
|
|
| 635 |
if getattr(msg, "date", None):
|
| 636 |
msg_dt = msg.date
|
| 637 |
if isinstance(msg_dt, datetime) and msg_dt.replace(tzinfo=timezone.utc) < cutoff:
|
| 638 |
debug_log("Lama (lewat cutoff backfill safety), dilewati", orig_text)
|
| 639 |
return
|
| 640 |
|
| 641 |
-
|
| 642 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 643 |
|
| 644 |
|
| 645 |
async def backfill_history(entity, limit: int) -> None:
|
|
@@ -674,6 +730,7 @@ async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
|
|
| 674 |
print(f"Gagal resolve sumber {src}: {e}")
|
| 675 |
return resolved
|
| 676 |
|
|
|
|
| 677 |
async def start_bot_background() -> None:
|
| 678 |
await client.start()
|
| 679 |
_init_db()
|
|
@@ -695,6 +752,7 @@ async def start_bot_background() -> None:
|
|
| 695 |
print("Kurator berjalan (background task). Menunggu pesan baru...")
|
| 696 |
asyncio.create_task(client.run_until_disconnected())
|
| 697 |
|
|
|
|
| 698 |
async def app_main() -> None:
|
| 699 |
await client.start()
|
| 700 |
_init_db()
|
|
|
|
| 35 |
"https://t.me/TheDonALPHAJournal",
|
| 36 |
"https://t.me/savascalls",
|
| 37 |
"https://t.me/Tanjirocall",
|
| 38 |
+
"https://t.me/Zen_call",
|
| 39 |
"https://t.me/ChapoInsider",
|
| 40 |
"https://t.me/millionsgems",
|
| 41 |
"https://t.me/Milagrosdegencalls",
|
|
|
|
| 94 |
client = build_client()
|
| 95 |
recent_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE)
|
| 96 |
recent_content_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE) # content-only dedup
|
| 97 |
+
# New: entity-based dedup (CA/$ticker)
|
| 98 |
+
recent_entity_keys: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE)
|
| 99 |
|
| 100 |
# Peta id_chat -> "core" / "support"
|
| 101 |
chat_roles: Dict[int, str] = {} # diisi saat startup setelah resolve entity
|
|
|
|
| 112 |
|
| 113 |
def _init_db():
|
| 114 |
conn = _db()
|
| 115 |
+
conn.executescript(
|
| 116 |
+
"""
|
| 117 |
CREATE TABLE IF NOT EXISTS last_posted (
|
| 118 |
keyword TEXT PRIMARY KEY,
|
| 119 |
msg_id INTEGER NOT NULL,
|
|
|
|
| 125 |
last_ts INTEGER NOT NULL,
|
| 126 |
PRIMARY KEY (keyword, group_key)
|
| 127 |
);
|
| 128 |
+
"""
|
| 129 |
+
)
|
| 130 |
conn.commit()
|
| 131 |
conn.close()
|
| 132 |
|
|
|
|
| 144 |
|
| 145 |
def db_save_last_posted(keyword: str, msg_id: int, tier: str):
|
| 146 |
conn = _db()
|
| 147 |
+
conn.execute(
|
| 148 |
+
"INSERT INTO last_posted(keyword, msg_id, tier) VALUES(?,?,?) "
|
| 149 |
+
"ON CONFLICT(keyword) DO UPDATE SET msg_id=excluded.msg_id, tier=excluded.tier",
|
| 150 |
+
(keyword, msg_id, tier),
|
| 151 |
+
)
|
| 152 |
conn.commit()
|
| 153 |
conn.close()
|
| 154 |
|
| 155 |
def db_upsert_kw_seen(keyword: str, group_key: str, ts: datetime):
|
| 156 |
conn = _db()
|
| 157 |
+
conn.execute(
|
| 158 |
+
"INSERT INTO kw_group_seen(keyword, group_key, last_ts) VALUES(?,?,?) "
|
| 159 |
+
"ON CONFLICT(keyword, group_key) DO UPDATE SET last_ts=excluded.last_ts",
|
| 160 |
+
(keyword, group_key, int(ts.timestamp())),
|
| 161 |
+
)
|
| 162 |
conn.commit()
|
| 163 |
conn.close()
|
| 164 |
|
|
|
|
| 186 |
|
| 187 |
def _windows(tokens: List[str], size: int = 20):
|
| 188 |
for i in range(0, len(tokens), size):
|
| 189 |
+
yield " ".join(tokens[i : i + size])
|
| 190 |
|
| 191 |
# --- Tambahan: bersihkan URL/CA untuk kepentingan SCORING relevansi ---
|
| 192 |
CA_SOL_RE = re.compile(r"\b[1-9A-HJ-NP-Za-km-z]{32,48}\b") # Solana base58 (perkiraan)
|
| 193 |
+
CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b") # EVM address
|
| 194 |
CA_LABEL_RE = re.compile(r"\bCA\s*[:=]\s*\S+", re.IGNORECASE) # "CA: ..." potong tokennya
|
| 195 |
|
| 196 |
+
|
| 197 |
def _strip_urls_and_mentions(s: str) -> str:
|
| 198 |
s = re.sub(r"https?://\S+", "", s)
|
| 199 |
s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
|
| 200 |
s = re.sub(r"@[A-Za-z0-9_]+", "", s)
|
| 201 |
return re.sub(r"\s+", " ", s).strip()
|
| 202 |
|
| 203 |
+
|
| 204 |
def strip_contracts_for_scoring(s: str) -> str:
|
| 205 |
"""
|
| 206 |
Hilangkan URL/mention, alamat kontrak, dan token setelah 'CA:'
|
|
|
|
| 212 |
s3 = CA_SOL_RE.sub(" ", s2)
|
| 213 |
return re.sub(r"\s+", " ", s3).strip()
|
| 214 |
|
| 215 |
+
|
| 216 |
def score_relevance(text: str, keywords: List[str]) -> float:
|
| 217 |
"""Skor: exact keyword + fuzzy windowed (top-3 rata-rata) agar adil untuk teks panjang."""
|
| 218 |
if not text:
|
|
|
|
| 245 |
|
| 246 |
return exact_score + fuzzy_score
|
| 247 |
|
| 248 |
+
|
| 249 |
def hash_for_dedup(text: str, msg) -> str:
|
| 250 |
"""Hash campuran (lama) – menahan duplikat per pesan+media."""
|
| 251 |
parts = [text or ""]
|
|
|
|
| 262 |
raw = "|".join(parts).encode("utf-8", errors="ignore")
|
| 263 |
return hashlib.sha1(raw).hexdigest()
|
| 264 |
|
| 265 |
+
|
| 266 |
def content_only_hash(text: str) -> str:
|
| 267 |
"""Hash berbasis isi saja (untuk lintas-grup crosspost)."""
|
| 268 |
norm = _strip_urls_and_mentions(normalize_for_filter(text))
|
| 269 |
return hashlib.sha1(norm.encode("utf-8", errors="ignore")).hexdigest()
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
# ========= Class aggregator (windowed unique groups) =========
|
| 273 |
keyword_group_last_seen: defaultdict[str, dict[str, datetime]] = defaultdict(dict)
|
| 274 |
|
| 275 |
+
|
| 276 |
def _prune_expired(now: datetime) -> None:
|
| 277 |
window = timedelta(minutes=CLASS_WINDOW_MINUTES)
|
| 278 |
cutoff = now - window
|
|
|
|
| 286 |
# db prune
|
| 287 |
db_prune_expired(cutoff)
|
| 288 |
|
| 289 |
+
|
| 290 |
def update_and_classify(keyword: str, group_key: str, now: Optional[datetime] = None) -> Tuple[str, int]:
|
| 291 |
if not now:
|
| 292 |
now = datetime.now(timezone.utc)
|
|
|
|
| 320 |
r"(t\.me\/joinchat|t\.me\/\+|telegram\.me\/|discord\.gg\/|wa\.me\/|whatsapp\.com\/)",
|
| 321 |
r"(bit\.ly|tinyurl\.com|linktr\.ee)",
|
| 322 |
# perluasan: link t.me biasa
|
| 323 |
+
r"t\.me\/[A-Za-z0-9_]+",
|
| 324 |
]
|
| 325 |
INVITE_REGEXES = [re.compile(p, re.IGNORECASE) for p in INVITE_PATTERNS]
|
| 326 |
|
|
|
|
| 332 |
]
|
| 333 |
WHITELIST_REGEXES = [re.compile(p, re.IGNORECASE) for p in WHITELIST_STRONG_SIGNAL]
|
| 334 |
|
| 335 |
+
|
| 336 |
def _is_invite_sentence(s: str) -> bool:
|
| 337 |
t = s.strip()
|
| 338 |
if not t:
|
|
|
|
| 343 |
# Jika ada 1+ pola ajakan, buang
|
| 344 |
return any(r.search(t) for r in INVITE_REGEXES)
|
| 345 |
|
| 346 |
+
|
| 347 |
def filter_invite_sentences(text: str) -> str:
|
| 348 |
if not text:
|
| 349 |
return text
|
| 350 |
+
parts = re.split(r'(?<=[\.!\?])\s+|\n+', text, flags=re.UNICODE)
|
| 351 |
kept = [p.strip() for p in parts if p and not _is_invite_sentence(p)]
|
| 352 |
cleaned = "\n".join(kept).strip()
|
| 353 |
cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
|
|
|
|
| 358 |
TIER_ORDER = {"rendah": 0, "sedang": 1, "kuat": 2}
|
| 359 |
last_posted: Dict[str, Dict[str, object]] = {} # keyword -> {"msg_id": int, "tier": str}
|
| 360 |
|
| 361 |
+
|
| 362 |
async def _send_initial(msg, text: str) -> int:
|
| 363 |
if DRY_RUN:
|
| 364 |
print("[DRY_RUN] send_initial:", text[:140])
|
|
|
|
| 367 |
if INCLUDE_MEDIA and is_image_message(msg) and not media_too_big(msg):
|
| 368 |
try:
|
| 369 |
if getattr(msg, "photo", None):
|
| 370 |
+
m = await client.send_file(
|
| 371 |
+
TARGET_CHAT, msg.photo, caption=text, caption_entities=None, force_document=False
|
| 372 |
+
)
|
| 373 |
return m.id
|
| 374 |
doc = getattr(msg, "document", None)
|
| 375 |
if doc:
|
|
|
|
| 384 |
ext_guess = ".jpg"
|
| 385 |
ext = ext_guess
|
| 386 |
bio.name = f"media{ext}"
|
| 387 |
+
m = await client.send_file(
|
| 388 |
+
TARGET_CHAT, bio, caption=text, caption_entities=None, force_document=False
|
| 389 |
+
)
|
| 390 |
return m.id
|
| 391 |
except FloodWaitError as e:
|
| 392 |
await asyncio.sleep(e.seconds + 1)
|
|
|
|
| 400 |
await asyncio.sleep(e.seconds + 1)
|
| 401 |
return await _send_initial(msg, text)
|
| 402 |
|
| 403 |
+
|
| 404 |
async def post_or_update(keyword: str, body: str, new_tier: str, src_msg) -> None:
|
| 405 |
prefix = f"[{new_tier.upper()}] "
|
| 406 |
text = prefix + body
|
|
|
|
| 447 |
if INCLUDE_MEDIA and is_image_message(msg) and not media_too_big(msg):
|
| 448 |
try:
|
| 449 |
if getattr(msg, "photo", None):
|
| 450 |
+
await client.send_file(
|
| 451 |
+
TARGET_CHAT, msg.photo, caption=orig_text, caption_entities=entities, force_document=False
|
| 452 |
+
)
|
| 453 |
return
|
| 454 |
doc = getattr(msg, "document", None)
|
| 455 |
if doc:
|
|
|
|
| 464 |
ext_guess = ".jpg"
|
| 465 |
ext = ext_guess
|
| 466 |
bio.name = f"media{ext}"
|
| 467 |
+
await client.send_file(
|
| 468 |
+
TARGET_CHAT, bio, caption=orig_text, caption_entities=entities, force_document=False
|
| 469 |
+
)
|
| 470 |
return
|
| 471 |
except FloodWaitError as e:
|
| 472 |
await asyncio.sleep(e.seconds + 1)
|
|
|
|
| 484 |
TICKER_CLEAN_RE = re.compile(r"\$[A-Za-z0-9]{2,12}")
|
| 485 |
TICKER_NOISY_RE = re.compile(r"\$[A-Za-z0-9](?:[^A-Za-z0-9]+[A-Za-z0-9]){1,11}")
|
| 486 |
|
| 487 |
+
|
| 488 |
def _extract_tickers(text_norm: str) -> List[str]:
|
| 489 |
"""
|
| 490 |
Ambil $TICKER dengan dua cara:
|
|
|
|
| 514 |
seen.add(x)
|
| 515 |
return uniq
|
| 516 |
|
| 517 |
+
|
| 518 |
def _extract_all_keywords(text_norm: str) -> List[str]:
|
| 519 |
"""
|
| 520 |
Deteksi SEMUA keyword dari THEME_KEYWORDS + $ticker.
|
|
|
|
| 541 |
seen.add(kw)
|
| 542 |
return uniq
|
| 543 |
|
| 544 |
+
|
| 545 |
def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
|
| 546 |
if not kws:
|
| 547 |
return None
|
|
|
|
| 556 |
chosen = sorted(score.items(), key=lambda x: (x[1][0], x[1][1], x[1][2]), reverse=True)[0][0]
|
| 557 |
return chosen
|
| 558 |
|
| 559 |
+
|
| 560 |
def _role_of(chat_id: int) -> str:
|
| 561 |
# DEFAULT KE SUPPORT agar tidak salah meloloskan chat yang tidak tertag
|
| 562 |
return chat_roles.get(chat_id, "support")
|
| 563 |
|
| 564 |
+
|
| 565 |
def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
|
| 566 |
"""
|
| 567 |
Hitung jumlah grup unik yang menyebut 'keyword' dalam window aktif,
|
|
|
|
| 575 |
return len(core_ids), len(sup_ids)
|
| 576 |
|
| 577 |
|
| 578 |
+
# ========= NEW: Entity-key extraction (CA > $ticker) =========
|
| 579 |
+
def extract_entity_key(text: str) -> Optional[str]:
|
| 580 |
+
"""Kembalikan kunci entitas kanonik untuk penentuan 'kesamaan':
|
| 581 |
+
- Jika ada CA -> 'ca:evm:<0x...>' atau 'ca:sol:<base58>'
|
| 582 |
+
- Else jika ada $ticker -> 'ticker:<lowercase>'
|
| 583 |
+
- Else None
|
| 584 |
+
"""
|
| 585 |
+
t = normalize_for_filter(text)
|
| 586 |
+
|
| 587 |
+
# Prefer CA lebih dulu
|
| 588 |
+
m = CA_EVM_RE.search(t) or CA_SOL_RE.search(t)
|
| 589 |
+
if m:
|
| 590 |
+
addr = m.group(0)
|
| 591 |
+
kind = "evm" if addr.lower().startswith("0x") else "sol"
|
| 592 |
+
return f"ca:{kind}:{addr.lower()}"
|
| 593 |
+
|
| 594 |
+
# Fall back ke $ticker (pakai deteksi yang sudah ada)
|
| 595 |
+
tickers = _extract_tickers(t.lower())
|
| 596 |
+
if tickers:
|
| 597 |
+
return f"ticker:{tickers[0][1:].lower()}"
|
| 598 |
+
|
| 599 |
+
return None
|
| 600 |
+
|
| 601 |
+
|
| 602 |
async def process_message(msg, source_chat_id: int) -> None:
|
| 603 |
"""
|
| 604 |
Filter, content-dedup, relevansi, multi-kw -> pilih dominan,
|
|
|
|
| 613 |
debug_log("Dilewati karena EXCLUDE_PHRASES", orig_text)
|
| 614 |
return
|
| 615 |
|
| 616 |
+
# === NEW: entity-based dedup (CA/$ticker) lebih awal ===
|
| 617 |
+
entity_key = extract_entity_key(orig_text)
|
| 618 |
+
if entity_key and entity_key in recent_entity_keys:
|
| 619 |
+
debug_log("Entity-duplicate (CA/TICKER), dilewati", orig_text)
|
| 620 |
+
return
|
| 621 |
+
|
| 622 |
# Content-only dedup (lintas grup)
|
| 623 |
ch = content_only_hash(orig_text)
|
| 624 |
if ch in recent_content_hashes:
|
|
|
|
| 641 |
|
| 642 |
role = _role_of(source_chat_id) # 'core' / 'support'
|
| 643 |
|
| 644 |
+
# Multi-kw -> pilih satu dominan untuk agregasi (fallback jika tak ada entity)
|
| 645 |
all_kws = _extract_all_keywords(text_norm)
|
| 646 |
main_kw = _choose_dominant_keyword(text_norm, all_kws)
|
| 647 |
+
|
| 648 |
+
# === NEW: topic key = entity_key (CA/$ticker) jika ada, else main_kw ===
|
| 649 |
+
topic_key = entity_key or main_kw
|
| 650 |
+
if not topic_key:
|
| 651 |
+
debug_log("Tak ada keyword/entitas cocok, dilewati", orig_text)
|
| 652 |
return
|
| 653 |
|
| 654 |
+
# Agregasi & kelas (berdasar topic_key)
|
| 655 |
group_key = str(source_chat_id)
|
| 656 |
now = datetime.now(timezone.utc)
|
| 657 |
+
class_label, unique_groups = update_and_classify(topic_key, group_key, now)
|
| 658 |
|
| 659 |
# Gating SUPPORT (CORE-anchored)
|
| 660 |
if role != "core":
|
| 661 |
+
core_u, sup_u = _unique_counts_by_role(topic_key)
|
| 662 |
# Aturan:
|
| 663 |
+
# - Jika sudah ada minimal 1 sebutan dari CORE untuk key ini -> izinkan.
|
| 664 |
# - Jika belum ada anchor CORE, SUPPORT harus >= SUPPORT_MIN_UNIQUE.
|
| 665 |
if core_u >= 1:
|
| 666 |
pass
|
| 667 |
elif sup_u < SUPPORT_MIN_UNIQUE:
|
| 668 |
+
debug_log(
|
| 669 |
+
f"Support ditahan (core_u={core_u}, sup_u={sup_u} < {SUPPORT_MIN_UNIQUE})",
|
| 670 |
+
orig_text,
|
| 671 |
+
)
|
| 672 |
return
|
| 673 |
|
| 674 |
# Filter kalimat ajakan (whitelist-aware)
|
|
|
|
| 678 |
return
|
| 679 |
|
| 680 |
# Backfill safety: saat startup, hindari pesan yang terlalu lama
|
| 681 |
+
cutoff = startup_time_utc - timedelta(
|
| 682 |
+
minutes=CLASS_WINDOW_MINUTES + BACKFILL_BUFFER_MINUTES
|
| 683 |
+
)
|
| 684 |
if getattr(msg, "date", None):
|
| 685 |
msg_dt = msg.date
|
| 686 |
if isinstance(msg_dt, datetime) and msg_dt.replace(tzinfo=timezone.utc) < cutoff:
|
| 687 |
debug_log("Lama (lewat cutoff backfill safety), dilewati", orig_text)
|
| 688 |
return
|
| 689 |
|
| 690 |
+
# === NEW: simpan entity-key setelah sukses (untuk dedup) ===
|
| 691 |
+
if entity_key:
|
| 692 |
+
recent_entity_keys.append(entity_key)
|
| 693 |
+
|
| 694 |
+
await post_or_update(topic_key, cleaned_body, class_label, msg)
|
| 695 |
+
debug_log(
|
| 696 |
+
f"Posted/Edited (role={role}, unique_groups={unique_groups}, key={topic_key}, tier={class_label})",
|
| 697 |
+
orig_text,
|
| 698 |
+
)
|
| 699 |
|
| 700 |
|
| 701 |
async def backfill_history(entity, limit: int) -> None:
|
|
|
|
| 730 |
print(f"Gagal resolve sumber {src}: {e}")
|
| 731 |
return resolved
|
| 732 |
|
| 733 |
+
|
| 734 |
async def start_bot_background() -> None:
|
| 735 |
await client.start()
|
| 736 |
_init_db()
|
|
|
|
| 752 |
print("Kurator berjalan (background task). Menunggu pesan baru...")
|
| 753 |
asyncio.create_task(client.run_until_disconnected())
|
| 754 |
|
| 755 |
+
|
| 756 |
async def app_main() -> None:
|
| 757 |
await client.start()
|
| 758 |
_init_db()
|