agus1111 commited on
Commit
b1fcbd1
·
verified ·
1 Parent(s): 8d523f6

Update botsignal.py

Browse files
Files changed (1) hide show
  1. botsignal.py +376 -210
botsignal.py CHANGED
@@ -1,4 +1,3 @@
1
- # botsignal.py (patched)
2
  import asyncio
3
  import os
4
  import re
@@ -15,40 +14,42 @@ from telethon import TelegramClient, events
15
  from telethon.sessions import StringSession, MemorySession
16
  from telethon.errors.rpcerrorlist import FloodWaitError
17
 
 
 
18
  API_ID = int(os.environ.get("API_ID", "0"))
19
  API_HASH = os.environ.get("API_HASH", "")
20
- SESSION = os.environ.get("SESSION", "")
21
- TARGET_CHAT = os.environ.get("TARGET_CHAT", "") # @username atau chat id
22
- # Sumber
23
- CORE_CHATS = [s.strip() for s in os.environ.get("CORE_CHATS", "").split(",") if s.strip()]
24
- SUPPORT_CHATS = [s.strip() for s in os.environ.get("SUPPORT_CHATS", "").split(",") if s.strip()]
25
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  SOURCE_CHATS = CORE_CHATS + SUPPORT_CHATS
27
 
28
- DB_PATH = os.environ.get("DB_PATH", "botsignal.db")
29
-
30
- # ====== Tokenisasi / relevansi ======
31
- def _tokenize_words(s: str) -> List[str]:
32
- return re.findall(r"[a-zA-Z0-9\$\#]{2,}", s or "")
33
-
34
- def normalize_for_filter(s: str) -> str:
35
- if not s:
36
- return ""
37
- t = s
38
- # netralkan alamat kontrak (sol/eth panjang)
39
- t = re.sub(r"\b[1-9A-HJ-NP-Za-km-z]{25,}\b", "CA", t)
40
- # hapus url/mention
41
- t = _strip_urls_and_mentions(t)
42
- return t
43
-
44
- URL_REGEX = re.compile(r"(https?:\/\/\S+)", re.IGNORECASE)
45
- MENTION_REGEX = re.compile(r"@\w+", re.IGNORECASE)
46
-
47
- def _strip_urls_and_mentions(s: str) -> str:
48
- s = URL_REGEX.sub("", s)
49
- s = MENTION_REGEX.sub("", s)
50
- return s
51
 
 
52
  THEME_KEYWORDS = [
53
  "call", "signal", "entry", "buy", "sell", "tp", "sl",
54
  "pump", "spot", "futures", "setup",
@@ -72,7 +73,6 @@ DEDUP_BUFFER_SIZE = int(os.environ.get("DEDUP_BUFFER_SIZE", "800"))
72
 
73
  CLASS_WINDOW_MINUTES = int(os.environ.get("CLASS_WINDOW_MINUTES", "10"))
74
 
75
- # Dulu dipakai untuk bypass support-only; sekarang hanya aktif bila ALLOW_SUPPORT_SOLO=1
76
  SUPPORT_MIN_UNIQUE = int(os.environ.get("SUPPORT_MIN_UNIQUE", "2"))
77
 
78
  # New: DRY RUN (tidak kirim apa pun ke TARGET_CHAT)
@@ -81,34 +81,30 @@ DRY_RUN = os.environ.get("DRY_RUN", "0") == "1"
81
  # Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
82
  BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
83
 
84
- # >>> Tambahan ENV untuk mode support-only (default OFF, strict core-anchored) <<<
85
- ALLOW_SUPPORT_SOLO = os.environ.get("ALLOW_SUPPORT_SOLO", "0") == "1"
86
- SUPPORT_SOLO_MIN_UNIQUE = int(os.environ.get("SUPPORT_SOLO_MIN_UNIQUE", "99"))
87
 
88
- # ====== Client ======
89
- if SESSION:
90
- session = StringSession(SESSION)
91
- else:
92
- session = MemorySession()
 
 
93
 
94
- client = TelegramClient(session, API_ID, API_HASH)
 
 
95
 
96
- # ====== State ======
 
97
  startup_time_utc = datetime.now(timezone.utc)
98
 
99
- recent_hashes = deque(maxlen=DEDUP_BUFFER_SIZE)
100
- recent_content_hashes = set()
101
 
102
- # chat_roles: chat_id -> "core"/"support"
103
- chat_roles: Dict[int, str] = {}
104
-
105
- # agregasi keyword -> group_id -> last_seen_utc
106
- keyword_group_last_seen: defaultdict[str, dict[str, datetime]] = defaultdict(dict)
107
 
108
  def _db():
109
  conn = sqlite3.connect(DB_PATH)
110
  conn.execute("PRAGMA journal_mode=WAL;")
111
- conn.execute("PRAGMA synchronous=NORMAL;")
112
  return conn
113
 
114
  def _init_db():
@@ -141,29 +137,83 @@ def db_load_state():
141
  conn.close()
142
  return last, kw_map
143
 
144
- def db_save_group_seen(keyword: str, group_key: str, ts: datetime):
145
  conn = _db()
146
- conn.execute(
147
- "INSERT OR REPLACE INTO kw_group_seen(keyword, group_key, last_ts) VALUES (?, ?, ?)",
148
- (keyword, group_key, int(ts.timestamp())),
149
- )
150
  conn.commit()
151
  conn.close()
152
 
153
- def db_save_last_posted(keyword: str, msg_id: int, tier: str):
154
  conn = _db()
155
- conn.execute(
156
- "INSERT OR REPLACE INTO last_posted(keyword, msg_id, tier) VALUES (?, ?, ?)",
157
- (keyword, msg_id, tier),
158
- )
159
  conn.commit()
160
  conn.close()
161
 
162
- def score_relevance(t: str, keywords: List[str]) -> float:
163
- # exact: jumlah kemunculan kw
164
- exact_hits = sum(
165
- len(re.findall(rf"(^|\W){re.escape(kw)}(\W|$)", t, flags=re.I)) for kw in keywords
166
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  exact_score = exact_hits * KEYWORD_WEIGHT
168
 
169
  # fuzzy windowed: ambil top-3 skor di antara jendela 20 token
@@ -183,9 +233,6 @@ def score_relevance(t: str, keywords: List[str]) -> float:
183
 
184
  return exact_score + fuzzy_score
185
 
186
- def _strip_urls_mentions_only(s: str) -> str:
187
- return _strip_urls_and_mentions(s)
188
-
189
  def hash_for_dedup(text: str, msg) -> str:
190
  """Hash campuran (lama) – menahan duplikat per pesan+media."""
191
  parts = [text or ""]
@@ -195,53 +242,91 @@ def hash_for_dedup(text: str, msg) -> str:
195
  if doc and getattr(doc, "id", None) is not None:
196
  parts.append(f"doc:{doc.id}")
197
  if getattr(msg, "photo", None) is not None:
198
- parts.append("has_photo")
199
- h = hashlib.sha256("|".join(parts).encode("utf-8")).hexdigest()
200
- return h
 
 
 
201
 
202
  def content_only_hash(text: str) -> str:
203
- t = text or ""
204
- # netralkan angka/CA/url/mention agar “teks ajakan” sama ke-dedup
205
- t = re.sub(r"\b\d{2,}\b", "NUM", t)
206
- t = re.sub(r"\b[1-9A-HJ-NP-Za-km-z]{25,}\b", "CA", t)
207
- t = _strip_urls_and_mentions(t)
208
- return hashlib.sha256(t.encode("utf-8")).hexdigest()
209
-
210
- def _windows(tokens: List[str], k: int):
211
- for i in range(0, len(tokens), max(1, k // 2)):
212
- yield " ".join(tokens[i:i + k])
213
-
214
- # ====== Media helpers ======
215
  def is_image_message(msg) -> bool:
216
  if getattr(msg, "photo", None) is not None:
217
  return True
218
  doc = getattr(msg, "document", None)
219
- if not doc:
220
- return False
221
- mt = (getattr(doc, "mime_type", "") or "").lower()
222
- if mt.startswith("image/"):
223
- return True
224
- if not ALLOW_GIFS_VIDEOS and (mt.startswith("video/") or mt == "image/gif"):
225
- return False
226
- return True
 
 
 
227
 
228
  def media_too_big(msg) -> bool:
229
  doc = getattr(msg, "document", None)
230
- if not doc:
231
- return False
232
- size = getattr(doc, "size", None)
233
- if size is None:
234
- return False
235
- return (size / (1024 * 1024)) > MAX_MEDIA_MB
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- # ====== Ajakan filter ======
 
238
  INVITE_PATTERNS = [
239
- r"\bjoin\b", r"\bdm\b", r"\binbox\b", r"\bpm\b", r"\bvip\b",
240
- r"\bcontact\b", r"\bpromo\b", r"\bpaid\b", r"@",
241
- r"t\.me\/", r"telegram\.me\/", r"\blink\b", r"\bklik\b", r"\bhubungi\b",
 
 
 
 
242
  r"\bpromo\b", r"\bpromosi\b", r"\biklan\b",
243
- r"\badvert\b", r"\badvertise\b", r"\badvertisement\b"
244
-
 
 
 
 
245
  ]
246
  INVITE_REGEXES = [re.compile(p, re.IGNORECASE) for p in INVITE_PATTERNS]
247
 
@@ -257,7 +342,7 @@ def _is_invite_sentence(s: str) -> bool:
257
  t = s.strip()
258
  if not t:
259
  return False
260
- # (ASAL) Jika kalimat memuat sinyal kuat, jangan dibuang walau ada kata invite
261
  if any(r.search(t) for r in WHITELIST_REGEXES):
262
  return False
263
  # Jika ada 1+ pola ajakan, buang
@@ -272,35 +357,77 @@ def filter_invite_sentences(text: str) -> str:
272
  cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
273
  return cleaned
274
 
275
- # ====== Posting ======
276
- async def post_or_update(keyword: str, body: str, class_label: str, msg):
277
- """
278
- Post baru jika belum ada, jika sudah ada posting untuk keyword tsb → edit.
279
- """
280
- # muat last posted dari DB
281
- last_posted, _ = db_load_state()
282
- last = last_posted.get(keyword)
283
 
 
 
 
 
 
284
  if DRY_RUN:
285
- print(f"[DRY_RUN] ({class_label}) [{keyword}] {body[:160]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  return
287
 
288
- if not last:
289
- # post baru
290
- sent = await client.send_message(TARGET_CHAT, body)
291
- db_save_last_posted(keyword, sent.id, class_label)
292
- return
293
- else:
294
- # edit posting lama
295
- last_msg_id = last["msg_id"]
296
  try:
297
- await client.edit_message(TARGET_CHAT, last_msg_id, body)
298
- db_save_last_posted(keyword, last_msg_id, class_label)
 
 
 
 
 
299
  except Exception as e:
300
- print("Edit failed, posting baru:", e)
301
- sent = await client.send_message(TARGET_CHAT, body)
302
- db_save_last_posted(keyword, sent.id, class_label)
303
- return
 
 
 
 
304
 
305
  # ========= Core actions (fallback kept) =========
306
  async def send_as_is(msg, text_override: Optional[str] = None) -> None:
@@ -328,32 +455,78 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
328
  ext = ".jpg"
329
  mt = (getattr(doc, "mime_type", "") or "").lower()
330
  if mt:
331
- guessed = guess_extension(mt) or ""
332
- if guessed:
333
- ext = guessed
 
334
  bio.name = f"media{ext}"
335
  await client.send_file(TARGET_CHAT, bio, caption=orig_text, caption_entities=entities, force_document=False)
336
  return
337
- except FloodWaitError as fw:
338
- print("FloodWait:", fw)
339
  except Exception as e:
340
- print("send_file err:", e)
341
 
342
- await client.send_message(TARGET_CHAT, orig_text)
 
 
 
 
343
 
344
- # ====== Keyword extraction / aggregation ======
345
- TICKER_RE = re.compile(r"\$[a-z0-9]{2,10}", re.IGNORECASE)
346
- COINWORDS = ["btc","eth","sol","bnb","pepe","doge","meme","spot","futures","pump","entry","buy","sell","tp","sl","setup"]
347
 
348
- def _extract_all_keywords(text_norm: str) -> List[str]:
 
 
 
 
 
 
 
 
 
 
349
  found = []
350
- found += [m.group(0).lower() for m in TICKER_RE.finditer(text_norm)]
351
- for w in COINWORDS:
352
- if re.search(rf"(^|\W){re.escape(w)}(\W|$)", text_norm, flags=re.I):
353
- found.append(w)
354
- # unik, preserve order
 
 
 
 
 
 
 
 
355
  seen = set()
356
  uniq = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
  for kw in found:
358
  if kw not in seen:
359
  uniq.append(kw)
@@ -375,8 +548,7 @@ def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
375
  return chosen
376
 
377
  def _role_of(chat_id: int) -> str:
378
- # DULU: default ke "core" menyebabkan tembus saat resolve gagal.
379
- # BARU: default ke "support" (atau treat unknown as support)
380
  return chat_roles.get(chat_id, "support")
381
 
382
  def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
@@ -387,38 +559,12 @@ def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
387
  bucket = keyword_group_last_seen.get(keyword, {})
388
  core_ids, sup_ids = set(), set()
389
  for gk in bucket.keys():
390
- role = chat_roles.get(int(gk), "support")
391
  (core_ids if role == "core" else sup_ids).add(gk)
392
  return len(core_ids), len(sup_ids)
393
 
394
- def update_and_classify(keyword: str, group_key: str, ts: datetime) -> Tuple[str, int]:
395
- """
396
- Simpan last_seen dan kembalikan label kelas (HOT/SEDANG/BARU) + jumlah grup unik.
397
- """
398
- # purge window kadaluarsa
399
- cutoff = ts - timedelta(minutes=CLASS_WINDOW_MINUTES)
400
- bucket = keyword_group_last_seen[keyword]
401
- expired = [g for g, t in bucket.items() if t < cutoff]
402
- for g in expired:
403
- del bucket[g]
404
-
405
- # update
406
- bucket[group_key] = ts
407
- db_save_group_seen(keyword, group_key, ts)
408
-
409
- # hitung grup unik
410
- unique_groups = len(bucket)
411
 
412
- # simple classification
413
- if unique_groups >= 4:
414
- label = "🔥 HOT"
415
- elif unique_groups >= 2:
416
- label = "☀️ SEDANG"
417
- else:
418
- label = "🌱 BARU"
419
- return label, unique_groups
420
-
421
- def process_message(msg, source_chat_id: int) -> None:
422
  """
423
  Filter, content-dedup, relevansi, multi-kw -> pilih dominan,
424
  agregasi tier, gating support (CORE-anchored), filter ajakan, dan POST/EDIT.
@@ -437,16 +583,12 @@ def process_message(msg, source_chat_id: int) -> None:
437
  if ch in recent_content_hashes:
438
  debug_log("Content-duplicate (global), dilewati", orig_text)
439
  return
440
- recent_content_hashes.add(ch)
441
- if len(recent_content_hashes) > DEDUP_BUFFER_SIZE * 2:
442
- # jaga ukuran set
443
- recent_content_hashes.clear()
444
- recent_content_hashes.add(ch)
445
-
446
- # Hash per pesan (teks+media) untuk guard cepat
447
- h = hash_for_dedup(orig_text, msg)
448
  if h in recent_hashes:
449
- debug_log("Duplicate (hash recent), dilewati", orig_text)
450
  return
451
  recent_hashes.append(h)
452
 
@@ -470,19 +612,22 @@ def process_message(msg, source_chat_id: int) -> None:
470
  now = datetime.now(timezone.utc)
471
  class_label, unique_groups = update_and_classify(main_kw, group_key, now)
472
 
473
- # Gating SUPPORT (strict core-anchored unless env allows 'solo')
474
- if role == "support":
475
  core_u, sup_u = _unique_counts_by_role(main_kw)
476
- if core_u < 1:
477
- # if solo support not allowed or not strong enough, hold
478
- if (not ALLOW_SUPPORT_SOLO) or (sup_u < SUPPORT_SOLO_MIN_UNIQUE):
479
- debug_log(f"Support ditahan (butuh anchor core; core_u={core_u}, sup_u={sup_u})", orig_text)
480
- return
 
 
 
481
 
482
- # Filter ajakan per-kalimat
483
- cleaned_body = filter_invite_sentences(orig_text).strip()
484
- if not cleaned_body:
485
- debug_log("Habis difilter, kosong dilewati", orig_text)
486
  return
487
 
488
  # Backfill safety: saat startup, hindari pesan yang terlalu lama
@@ -493,21 +638,9 @@ def process_message(msg, source_chat_id: int) -> None:
493
  debug_log("Lama (lewat cutoff backfill safety), dilewati", orig_text)
494
  return
495
 
496
- asyncio.create_task(post_or_update(main_kw, cleaned_body, class_label, msg))
497
  debug_log(f"Posted/Edited (role={role}, unique_groups={unique_groups}, kw={main_kw}, tier={class_label})", orig_text)
498
 
499
- # ========= Event handlers =========
500
- @client.on(events.NewMessage(chats=SOURCE_CHATS))
501
- async def on_new_message(event):
502
- try:
503
- await process_message(event.message, source_chat_id=event.chat_id)
504
- except Exception as e:
505
- print("Process error:", e)
506
-
507
- # ========= Utilities =========
508
- def debug_log(tag: str, body: str):
509
- ts = datetime.now().strftime("%H:%M:%S")
510
- print(f"[{ts}] {tag}: {body[:200].replace(chr(10),' / ')}")
511
 
512
  async def backfill_history(entity, limit: int) -> None:
513
  if limit <= 0:
@@ -519,6 +652,17 @@ async def backfill_history(entity, limit: int) -> None:
519
  except Exception as e:
520
  debug_log("Error saat memproses backfill", str(e))
521
 
 
 
 
 
 
 
 
 
 
 
 
522
  async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
523
  resolved = []
524
  for src in raw_list:
@@ -527,16 +671,37 @@ async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
527
  resolved.append(ent)
528
  chat_roles[int(ent.id)] = role_label
529
  except Exception as e:
530
- print(f"Gagal resolve {src}: {e}")
531
  return resolved
532
 
533
- # ========= Entry points =========
534
- async def app_main():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  await client.start()
536
  _init_db()
537
- _ = db_load_state() # pre-load caches
538
 
539
- # resolve semua chats dan tag role
 
 
540
  resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
541
  resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
542
  resolved_sources = resolved_core + resolved_support
@@ -547,5 +712,6 @@ async def app_main():
547
  print("Kurator berjalan. Menunggu pesan baru... (Stop dengan interrupt).")
548
  await client.run_until_disconnected()
549
 
 
550
  if __name__ == "__main__":
551
  asyncio.run(app_main())
 
 
1
  import asyncio
2
  import os
3
  import re
 
14
  from telethon.sessions import StringSession, MemorySession
15
  from telethon.errors.rpcerrorlist import FloodWaitError
16
 
17
+
18
+ # ========= Configuration via Environment =========
19
  API_ID = int(os.environ.get("API_ID", "0"))
20
  API_HASH = os.environ.get("API_HASH", "")
21
+ STRING_SESSION = os.environ.get("STRING_SESSION", "")
22
+
23
+ # --- Definisikan sumber sebagai CORE vs SUPPORT (pakai data milikmu) ---
24
+ CORE_CHATS = [
25
+ "https://t.me/PEPE_Calls28",
26
+ "https://t.me/HenryGems",
27
+ "https://t.me/ChinaPumpCommunity",
28
+ "https://t.me/SephirothGemCalls1",
29
+ "https://t.me/GM_Degencalls",
30
+ "https://t.me/Enthucalls",
31
+ "https://t.me/kobecalls",
32
+ "https://t.me/Kulture_Kall",
33
+ ]
34
+ SUPPORT_CHATS = [
35
+ "https://t.me/TheDonALPHAJournal",
36
+ "https://t.me/savascalls",
37
+ "https://t.me/Tanjirocall",
38
+ "https://t.me/ChapoInsider",
39
+ "https://t.me/millionsgems",
40
+ "https://t.me/Milagrosdegencalls",
41
+ "https://t.me/kariusgemscalls",
42
+ "https://t.me/Dwen_Exchange",
43
+ "https://t.me/bat_gamble",
44
+ "https://t.me/BatmanGamble",
45
+ "https://t.me/hulkgemscalls_real",
46
+ "https://t.me/MineGems",
47
+ ]
48
  SOURCE_CHATS = CORE_CHATS + SUPPORT_CHATS
49
 
50
+ TARGET_CHAT = os.environ.get("TARGET_CHAT", "https://t.me/MidasTouchsignalll")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ # Kata kunci topik + simbol '$' tetap dipakai
53
  THEME_KEYWORDS = [
54
  "call", "signal", "entry", "buy", "sell", "tp", "sl",
55
  "pump", "spot", "futures", "setup",
 
73
 
74
  CLASS_WINDOW_MINUTES = int(os.environ.get("CLASS_WINDOW_MINUTES", "10"))
75
 
 
76
  SUPPORT_MIN_UNIQUE = int(os.environ.get("SUPPORT_MIN_UNIQUE", "2"))
77
 
78
  # New: DRY RUN (tidak kirim apa pun ke TARGET_CHAT)
 
81
  # Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
82
  BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
83
 
 
 
 
84
 
85
+ # ========= Client bootstrap =========
86
+ def build_client() -> TelegramClient:
87
+ if STRING_SESSION:
88
+ print(">> Using StringSession (persistent).")
89
+ return TelegramClient(StringSession(STRING_SESSION), API_ID, API_HASH)
90
+ print(">> Using MemorySession (login tiap run).")
91
+ return TelegramClient(MemorySession(), API_ID, API_HASH)
92
 
93
+ client = build_client()
94
+ recent_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE)
95
+ recent_content_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE) # content-only dedup
96
 
97
+ # Peta id_chat -> "core" / "support"
98
+ chat_roles: Dict[int, str] = {} # diisi saat startup setelah resolve entity
99
  startup_time_utc = datetime.now(timezone.utc)
100
 
 
 
101
 
102
+ # ========= Persistence (SQLite) =========
103
+ DB_PATH = os.environ.get("BOTSIGNAL_DB", "/tmp/botsignal.db")
 
 
 
104
 
105
  def _db():
106
  conn = sqlite3.connect(DB_PATH)
107
  conn.execute("PRAGMA journal_mode=WAL;")
 
108
  return conn
109
 
110
  def _init_db():
 
137
  conn.close()
138
  return last, kw_map
139
 
140
+ def db_save_last_posted(keyword: str, msg_id: int, tier: str):
141
  conn = _db()
142
+ conn.execute("INSERT INTO last_posted(keyword, msg_id, tier) VALUES(?,?,?) "
143
+ "ON CONFLICT(keyword) DO UPDATE SET msg_id=excluded.msg_id, tier=excluded.tier",
144
+ (keyword, msg_id, tier))
 
145
  conn.commit()
146
  conn.close()
147
 
148
+ def db_upsert_kw_seen(keyword: str, group_key: str, ts: datetime):
149
  conn = _db()
150
+ conn.execute("INSERT INTO kw_group_seen(keyword, group_key, last_ts) VALUES(?,?,?) "
151
+ "ON CONFLICT(keyword, group_key) DO UPDATE SET last_ts=excluded.last_ts",
152
+ (keyword, group_key, int(ts.timestamp())))
 
153
  conn.commit()
154
  conn.close()
155
 
156
+ def db_prune_expired(cutoff: datetime):
157
+ conn = _db()
158
+ conn.execute("DELETE FROM kw_group_seen WHERE last_ts < ?", (int(cutoff.timestamp()),))
159
+ conn.commit()
160
+ conn.close()
161
+
162
+
163
+ # ========= Utilities =========
164
+ def debug_log(reason: str, content: str = "") -> None:
165
+ short = (content or "").replace("\n", " ")[:160]
166
+ print(f"[DEBUG] {reason}: {short}")
167
+
168
+ def normalize_for_filter(text: str) -> str:
169
+ if not text:
170
+ return ""
171
+ s = re.sub(r"(?m)^>.*", "", text)
172
+ s = re.sub(r"\s+", " ", s).strip()
173
+ return s
174
+
175
+ def _tokenize_words(s: str) -> List[str]:
176
+ return re.findall(r"[a-z0-9\$\#]{1,64}", s.lower())
177
+
178
+ def _windows(tokens: List[str], size: int = 20):
179
+ for i in range(0, len(tokens), size):
180
+ yield " ".join(tokens[i:i+size])
181
+
182
+ # --- Tambahan: bersihkan URL/CA untuk kepentingan SCORING relevansi ---
183
+ CA_SOL_RE = re.compile(r"\b[1-9A-HJ-NP-Za-km-z]{32,48}\b") # Solana base58 (perkiraan)
184
+ CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b") # EVM address
185
+ CA_LABEL_RE = re.compile(r"\bCA\s*[:=]\s*\S+", re.IGNORECASE) # "CA: ..." potong tokennya
186
+
187
+ def _strip_urls_and_mentions(s: str) -> str:
188
+ s = re.sub(r"https?://\S+", "", s)
189
+ s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
190
+ s = re.sub(r"@[A-Za-z0-9_]+", "", s)
191
+ return re.sub(r"\s+", " ", s).strip()
192
+
193
+ def strip_contracts_for_scoring(s: str) -> str:
194
+ """
195
+ Hilangkan URL/mention, alamat kontrak, dan token setelah 'CA:'
196
+ agar kata 'pump' pada CA/URL (mis. pump.fun) tidak memengaruhi skor.
197
+ """
198
+ s0 = _strip_urls_and_mentions(s)
199
+ s1 = CA_LABEL_RE.sub(" ", s0)
200
+ s2 = CA_EVM_RE.sub(" ", s1)
201
+ s3 = CA_SOL_RE.sub(" ", s2)
202
+ return re.sub(r"\s+", " ", s3).strip()
203
+
204
+ def score_relevance(text: str, keywords: List[str]) -> float:
205
+ """Skor: exact keyword + fuzzy windowed (top-3 rata-rata) agar adil untuk teks panjang."""
206
+ if not text:
207
+ return 0.0
208
+
209
+ # Gunakan versi yang TIDAK mengandung URL/CA agar 'pump' di CA tidak ikut dihitung
210
+ t = strip_contracts_for_scoring(text).lower()
211
+
212
+ # exact hits (unik)
213
+ exact_hits = 0
214
+ for kw in set(keywords):
215
+ if kw in t or re.search(rf"\b{re.escape(kw)}\b", t):
216
+ exact_hits += 1
217
  exact_score = exact_hits * KEYWORD_WEIGHT
218
 
219
  # fuzzy windowed: ambil top-3 skor di antara jendela 20 token
 
233
 
234
  return exact_score + fuzzy_score
235
 
 
 
 
236
  def hash_for_dedup(text: str, msg) -> str:
237
  """Hash campuran (lama) – menahan duplikat per pesan+media."""
238
  parts = [text or ""]
 
242
  if doc and getattr(doc, "id", None) is not None:
243
  parts.append(f"doc:{doc.id}")
244
  if getattr(msg, "photo", None) is not None:
245
+ ph = msg.photo
246
+ ph_id = getattr(ph, "id", None)
247
+ if ph_id is not None:
248
+ parts.append(f"photo:{ph_id}")
249
+ raw = "|".join(parts).encode("utf-8", errors="ignore")
250
+ return hashlib.sha1(raw).hexdigest()
251
 
252
  def content_only_hash(text: str) -> str:
253
+ """Hash berbasis isi saja (untuk lintas-grup crosspost)."""
254
+ norm = _strip_urls_and_mentions(normalize_for_filter(text))
255
+ return hashlib.sha1(norm.encode("utf-8", errors="ignore")).hexdigest()
256
+
 
 
 
 
 
 
 
 
257
  def is_image_message(msg) -> bool:
258
  if getattr(msg, "photo", None) is not None:
259
  return True
260
  doc = getattr(msg, "document", None)
261
+ if doc and getattr(doc, "mime_type", None):
262
+ mt = (doc.mime_type or "").lower()
263
+ if mt.startswith("image/"):
264
+ if SKIP_STICKERS and ("webp" in mt or "sticker" in mt):
265
+ return False
266
+ return True
267
+ if not ALLOW_GIFS_VIDEOS:
268
+ return False
269
+ if mt in ("video/mp4", "image/gif"):
270
+ return True
271
+ return False
272
 
273
  def media_too_big(msg) -> bool:
274
  doc = getattr(msg, "document", None)
275
+ if doc and getattr(doc, "size", None):
276
+ return (doc.size or 0) > MAX_MEDIA_MB * 1024 * 1024
277
+ return False
278
+
279
+
280
+ # ========= Class aggregator (windowed unique groups) =========
281
+ keyword_group_last_seen: defaultdict[str, dict[str, datetime]] = defaultdict(dict)
282
+
283
+ def _prune_expired(now: datetime) -> None:
284
+ window = timedelta(minutes=CLASS_WINDOW_MINUTES)
285
+ cutoff = now - window
286
+ # in-memory prune
287
+ for kw, m in list(keyword_group_last_seen.items()):
288
+ for gk, ts in list(m.items()):
289
+ if ts < cutoff:
290
+ del m[gk]
291
+ if not m:
292
+ del keyword_group_last_seen[kw]
293
+ # db prune
294
+ db_prune_expired(cutoff)
295
+
296
+ def update_and_classify(keyword: str, group_key: str, now: Optional[datetime] = None) -> Tuple[str, int]:
297
+ if not now:
298
+ now = datetime.now(timezone.utc)
299
+ _prune_expired(now)
300
+
301
+ bucket = keyword_group_last_seen[keyword]
302
+ bucket[group_key] = now
303
+ db_upsert_kw_seen(keyword, group_key, now)
304
+
305
+ unique_groups = len(bucket)
306
+ if unique_groups >= 4:
307
+ return "kuat", unique_groups
308
+ elif unique_groups >= 2:
309
+ return "sedang", unique_groups
310
+ else:
311
+ return "rendah", unique_groups
312
 
313
+
314
+ # ========= Sentence-level invite filter (smarter) =========
315
  INVITE_PATTERNS = [
316
+ r"\bjoin\b", r"\bjoin (us|our|channel|group)\b",
317
+ r"\bdm\b", r"\bdm (me|gw|gue|gua|saya|admin)\b",
318
+ r"\bpm\b", r"\binbox\b", r"\bcontact\b", r"\bkontak\b", r"\bhubungi\b",
319
+ r"\bvip\b", r"\bpremium\b", r"\bberbayar\b", r"\bpaid\b", r"\bexclusive\b",
320
+ r"\bwhitelist\b", r"\bprivate( group| channel)?\b", r"\bmembership?\b",
321
+ r"\bsubscribe\b", r"\blangganan\b",
322
+ # kata kunci promo/iklan
323
  r"\bpromo\b", r"\bpromosi\b", r"\biklan\b",
324
+ r"\badvert\b", r"\badvertise\b", r"\badvertisement\b",
325
+ # tautan undangan/shortener
326
+ r"(t\.me\/joinchat|t\.me\/\+|telegram\.me\/|discord\.gg\/|wa\.me\/|whatsapp\.com\/)",
327
+ r"(bit\.ly|tinyurl\.com|linktr\.ee)",
328
+ # perluasan: link t.me biasa
329
+ r"t\.me\/[A-Za-z0-9_]+"
330
  ]
331
  INVITE_REGEXES = [re.compile(p, re.IGNORECASE) for p in INVITE_PATTERNS]
332
 
 
342
  t = s.strip()
343
  if not t:
344
  return False
345
+ # Jika kalimat memuat sinyal kuat, jangan dibuang walau ada kata invite
346
  if any(r.search(t) for r in WHITELIST_REGEXES):
347
  return False
348
  # Jika ada 1+ pola ajakan, buang
 
357
  cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
358
  return cleaned
359
 
 
 
 
 
 
 
 
 
360
 
361
+ # ========= Post-on-threshold with EDIT (persisted) =========
362
+ TIER_ORDER = {"rendah": 0, "sedang": 1, "kuat": 2}
363
+ last_posted: Dict[str, Dict[str, object]] = {} # keyword -> {"msg_id": int, "tier": str}
364
+
365
+ async def _send_initial(msg, text: str) -> int:
366
  if DRY_RUN:
367
+ print("[DRY_RUN] send_initial:", text[:140])
368
+ return -1
369
+ # kirim media bila ada & allowed
370
+ if INCLUDE_MEDIA and is_image_message(msg) and not media_too_big(msg):
371
+ try:
372
+ if getattr(msg, "photo", None):
373
+ m = await client.send_file(TARGET_CHAT, msg.photo, caption=text, caption_entities=None, force_document=False)
374
+ return m.id
375
+ doc = getattr(msg, "document", None)
376
+ if doc:
377
+ data = await client.download_media(msg, file=bytes)
378
+ if data:
379
+ bio = io.BytesIO(data)
380
+ ext = ".jpg"
381
+ mt = (getattr(doc, "mime_type", "") or "").lower()
382
+ if mt:
383
+ ext_guess = guess_extension(mt) or ".jpg"
384
+ if ext_guess == ".jpe":
385
+ ext_guess = ".jpg"
386
+ ext = ext_guess
387
+ bio.name = f"media{ext}"
388
+ m = await client.send_file(TARGET_CHAT, bio, caption=text, caption_entities=None, force_document=False)
389
+ return m.id
390
+ except FloodWaitError as e:
391
+ await asyncio.sleep(e.seconds + 1)
392
+ return await _send_initial(msg, text)
393
+ except Exception as e:
394
+ debug_log("Gagal kirim media awal, fallback text", str(e))
395
+ try:
396
+ m = await client.send_message(TARGET_CHAT, text, link_preview=True)
397
+ return m.id
398
+ except FloodWaitError as e:
399
+ await asyncio.sleep(e.seconds + 1)
400
+ return await _send_initial(msg, text)
401
+
402
+ async def post_or_update(keyword: str, body: str, new_tier: str, src_msg) -> None:
403
+ prefix = f"[{new_tier.upper()}] "
404
+ text = prefix + body
405
+ prev = last_posted.get(keyword)
406
+ if not prev:
407
+ msg_id = await _send_initial(src_msg, text)
408
+ last_posted[keyword] = {"msg_id": msg_id, "tier": new_tier}
409
+ if msg_id != -1:
410
+ db_save_last_posted(keyword, msg_id, new_tier)
411
  return
412
 
413
+ if TIER_ORDER.get(new_tier, 0) > TIER_ORDER.get(prev["tier"], 0):
 
 
 
 
 
 
 
414
  try:
415
+ await client.edit_message(TARGET_CHAT, prev["msg_id"], text)
416
+ prev["tier"] = new_tier
417
+ if prev["msg_id"] != -1:
418
+ db_save_last_posted(keyword, prev["msg_id"], new_tier)
419
+ except FloodWaitError as e:
420
+ await asyncio.sleep(e.seconds + 1)
421
+ await post_or_update(keyword, body, new_tier, src_msg)
422
  except Exception as e:
423
+ debug_log("Edit gagal, fallback kirim baru", str(e))
424
+ msg_id = await _send_initial(src_msg, text)
425
+ last_posted[keyword] = {"msg_id": msg_id, "tier": new_tier}
426
+ if msg_id != -1:
427
+ db_save_last_posted(keyword, msg_id, new_tier)
428
+ else:
429
+ pass # no-op
430
+
431
 
432
  # ========= Core actions (fallback kept) =========
433
  async def send_as_is(msg, text_override: Optional[str] = None) -> None:
 
455
  ext = ".jpg"
456
  mt = (getattr(doc, "mime_type", "") or "").lower()
457
  if mt:
458
+ ext_guess = guess_extension(mt) or ".jpg"
459
+ if ext_guess == ".jpe":
460
+ ext_guess = ".jpg"
461
+ ext = ext_guess
462
  bio.name = f"media{ext}"
463
  await client.send_file(TARGET_CHAT, bio, caption=orig_text, caption_entities=entities, force_document=False)
464
  return
465
+ except FloodWaitError as e:
466
+ await asyncio.sleep(e.seconds + 1)
467
  except Exception as e:
468
+ debug_log("Gagal kirim sebagai media, fallback ke text", str(e))
469
 
470
+ try:
471
+ await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
472
+ except FloodWaitError as e:
473
+ await asyncio.sleep(e.seconds + 1)
474
+ await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
475
 
 
 
 
476
 
477
+ # ========= Keyword extraction ($ticker-aware) =========
478
+ TICKER_CLEAN_RE = re.compile(r"\$[A-Za-z0-9]{2,12}")
479
+ TICKER_NOISY_RE = re.compile(r"\$[A-Za-z0-9](?:[^A-Za-z0-9]+[A-Za-z0-9]){1,11}")
480
+
481
+ def _extract_tickers(text_norm: str) -> List[str]:
482
+ """
483
+ Ambil $TICKER dengan dua cara:
484
+ - Bersih: $ABC, $JBCOIN
485
+ - Noisy: $J*BCOIN -> dinormalisasi jadi $JBCOIN untuk *keyword* saja.
486
+ (Teks asli tetap dikirim apa adanya.)
487
+ """
488
  found = []
489
+
490
+ # bersih
491
+ for m in TICKER_CLEAN_RE.finditer(text_norm):
492
+ found.append(m.group(0).lower())
493
+
494
+ # noisy -> normalisasi internal
495
+ for m in TICKER_NOISY_RE.finditer(text_norm):
496
+ raw = m.group(0)
497
+ norm = "$" + re.sub(r"[^A-Za-z0-9]+", "", raw[1:])
498
+ if 3 <= len(norm) <= 13: # termasuk '$'
499
+ found.append(norm.lower())
500
+
501
+ # unik & pertahankan urutan
502
  seen = set()
503
  uniq = []
504
+ for x in found:
505
+ if x not in seen:
506
+ uniq.append(x)
507
+ seen.add(x)
508
+ return uniq
509
+
510
+ def _extract_all_keywords(text_norm: str) -> List[str]:
511
+ """
512
+ Deteksi SEMUA keyword dari THEME_KEYWORDS + $ticker.
513
+ Tidak menghapus simbol '$' (sesuai permintaan).
514
+ """
515
+ # toleran untuk pencarian keyword tema (seperti semula)
516
+ t = re.sub(r"\$([a-z0-9]+)", r"\1", text_norm, flags=re.I)
517
+
518
+ found = []
519
+ for kw in THEME_KEYWORDS:
520
+ if re.search(rf"(^|\W){re.escape(kw)}(\W|$)", t, flags=re.I):
521
+ found.append(kw.lower())
522
+
523
+ # gabungkan hasil $ticker
524
+ tickers = _extract_tickers(text_norm)
525
+ found.extend(tickers)
526
+
527
+ # unik dengan urutan muncul pertama
528
+ uniq = []
529
+ seen = set()
530
  for kw in found:
531
  if kw not in seen:
532
  uniq.append(kw)
 
548
  return chosen
549
 
550
  def _role_of(chat_id: int) -> str:
551
+ # DEFAULT KE SUPPORT agar tidak salah meloloskan chat yang tidak tertag
 
552
  return chat_roles.get(chat_id, "support")
553
 
554
  def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
 
559
  bucket = keyword_group_last_seen.get(keyword, {})
560
  core_ids, sup_ids = set(), set()
561
  for gk in bucket.keys():
562
+ role = chat_roles.get(int(gk), "support") # default support untuk aman
563
  (core_ids if role == "core" else sup_ids).add(gk)
564
  return len(core_ids), len(sup_ids)
565
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566
 
567
+ async def process_message(msg, source_chat_id: int) -> None:
 
 
 
 
 
 
 
 
 
568
  """
569
  Filter, content-dedup, relevansi, multi-kw -> pilih dominan,
570
  agregasi tier, gating support (CORE-anchored), filter ajakan, dan POST/EDIT.
 
583
  if ch in recent_content_hashes:
584
  debug_log("Content-duplicate (global), dilewati", orig_text)
585
  return
586
+ recent_content_hashes.append(ch)
587
+
588
+ # Dedup lama (per pesan/media)
589
+ h = hash_for_dedup(text_norm, msg)
 
 
 
 
590
  if h in recent_hashes:
591
+ debug_log("Duplikat (pesan/media), dilewati", orig_text)
592
  return
593
  recent_hashes.append(h)
594
 
 
612
  now = datetime.now(timezone.utc)
613
  class_label, unique_groups = update_and_classify(main_kw, group_key, now)
614
 
615
+ # Gating SUPPORT (CORE-anchored)
616
+ if role != "core":
617
  core_u, sup_u = _unique_counts_by_role(main_kw)
618
+ # Aturan:
619
+ # - Jika sudah ada minimal 1 sebutan dari CORE untuk keyword ini -> izinkan.
620
+ # - Jika belum ada anchor CORE, SUPPORT harus >= SUPPORT_MIN_UNIQUE.
621
+ if core_u >= 1:
622
+ pass
623
+ elif sup_u < SUPPORT_MIN_UNIQUE:
624
+ debug_log(f"Support ditahan (core_u={core_u}, sup_u={sup_u} < {SUPPORT_MIN_UNIQUE})", orig_text)
625
+ return
626
 
627
+ # Filter kalimat ajakan (whitelist-aware)
628
+ cleaned_body = filter_invite_sentences(orig_text)
629
+ if not cleaned_body.strip():
630
+ debug_log("Semua kalimat terfilter (kosong), dilewati", orig_text)
631
  return
632
 
633
  # Backfill safety: saat startup, hindari pesan yang terlalu lama
 
638
  debug_log("Lama (lewat cutoff backfill safety), dilewati", orig_text)
639
  return
640
 
641
+ await post_or_update(main_kw, cleaned_body, class_label, msg)
642
  debug_log(f"Posted/Edited (role={role}, unique_groups={unique_groups}, kw={main_kw}, tier={class_label})", orig_text)
643
 
 
 
 
 
 
 
 
 
 
 
 
 
644
 
645
  async def backfill_history(entity, limit: int) -> None:
646
  if limit <= 0:
 
652
  except Exception as e:
653
  debug_log("Error saat memproses backfill", str(e))
654
 
655
+
656
# ========= Event handlers =========
@client.on(events.NewMessage(chats=SOURCE_CHATS))
async def on_new_message(event):
    # Funnel every new message from a source chat through the curation
    # pipeline. Errors are only printed so one bad message can never
    # kill the event loop.
    try:
        await process_message(event.message, source_chat_id=event.chat_id)
    except Exception as e:
        print("Process error:", e)
663
+
664
+
665
+ # ========= Entry points =========
666
  async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
667
  resolved = []
668
  for src in raw_list:
 
671
  resolved.append(ent)
672
  chat_roles[int(ent.id)] = role_label
673
  except Exception as e:
674
+ print(f"Gagal resolve sumber {src}: {e}")
675
  return resolved
676
 
677
async def start_bot_background() -> None:
    """Start the curator as a background task (non-blocking variant of app_main).

    Connects the client, initializes the DB, restores persisted state,
    resolves & role-tags source chats, backfills recent history, then
    schedules ``client.run_until_disconnected()`` as an asyncio task.
    """
    global last_posted, keyword_group_last_seen, _runner_task

    await client.start()
    _init_db()

    # Load persisted state
    last_posted, keyword_group_last_seen = db_load_state()

    resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
    resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
    resolved_sources = resolved_core + resolved_support

    for ent in resolved_sources:
        try:
            await backfill_history(ent, INITIAL_BACKFILL)
        except Exception as e:
            # Best-effort: one chat's failed backfill must not block startup.
            print(f"Backfill gagal untuk {ent}: {e}")

    print("Kurator berjalan (background task). Menunggu pesan baru...")
    # Keep a strong reference to the task: the event loop only holds weak
    # references to scheduled tasks, so a fire-and-forget create_task()
    # result may be garbage-collected mid-flight (see asyncio docs).
    _runner_task = asyncio.create_task(client.run_until_disconnected())
697
+
698
+ async def app_main() -> None:
699
  await client.start()
700
  _init_db()
 
701
 
702
+ global last_posted, keyword_group_last_seen
703
+ last_posted, keyword_group_last_seen = db_load_state()
704
+
705
  resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
706
  resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
707
  resolved_sources = resolved_core + resolved_support
 
712
  print("Kurator berjalan. Menunggu pesan baru... (Stop dengan interrupt).")
713
  await client.run_until_disconnected()
714
 
715
+
716
if __name__ == "__main__":
    # Blocking entry point: run the curator until the client disconnects.
    asyncio.run(app_main())