agus1111 commited on
Commit
8d523f6
·
verified ·
1 Parent(s): d46bd90

Update botsignal.py

Browse files
Files changed (1) hide show
  1. botsignal.py +207 -373
botsignal.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import asyncio
2
  import os
3
  import re
@@ -14,42 +15,40 @@ from telethon import TelegramClient, events
14
  from telethon.sessions import StringSession, MemorySession
15
  from telethon.errors.rpcerrorlist import FloodWaitError
16
 
17
-
18
- # ========= Configuration via Environment =========
19
  API_ID = int(os.environ.get("API_ID", "0"))
20
  API_HASH = os.environ.get("API_HASH", "")
21
- STRING_SESSION = os.environ.get("STRING_SESSION", "")
22
-
23
- # --- Definisikan sumber sebagai CORE vs SUPPORT (pakai data milikmu) ---
24
- CORE_CHATS = [
25
- "https://t.me/PEPE_Calls28",
26
- "https://t.me/HenryGems",
27
- "https://t.me/ChinaPumpCommunity",
28
- "https://t.me/SephirothGemCalls1",
29
- "https://t.me/GM_Degencalls",
30
- "https://t.me/Enthucalls",
31
- "https://t.me/kobecalls",
32
- "https://t.me/Kulture_Kall",
33
- ]
34
- SUPPORT_CHATS = [
35
- "https://t.me/TheDonALPHAJournal",
36
- "https://t.me/savascalls",
37
- "https://t.me/Tanjirocall",
38
- "https://t.me/ChapoInsider",
39
- "https://t.me/millionsgems",
40
- "https://t.me/Milagrosdegencalls",
41
- "https://t.me/kariusgemscalls",
42
- "https://t.me/Dwen_Exchange",
43
- "https://t.me/bat_gamble",
44
- "https://t.me/BatmanGamble",
45
- "https://t.me/hulkgemscalls_real",
46
- "https://t.me/MineGems",
47
- ]
48
  SOURCE_CHATS = CORE_CHATS + SUPPORT_CHATS
49
 
50
- TARGET_CHAT = os.environ.get("TARGET_CHAT", "https://t.me/MidasTouchsignalll")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- # Kata kunci topik + simbol '$' tetap dipakai
53
  THEME_KEYWORDS = [
54
  "call", "signal", "entry", "buy", "sell", "tp", "sl",
55
  "pump", "spot", "futures", "setup",
@@ -73,6 +72,7 @@ DEDUP_BUFFER_SIZE = int(os.environ.get("DEDUP_BUFFER_SIZE", "800"))
73
 
74
  CLASS_WINDOW_MINUTES = int(os.environ.get("CLASS_WINDOW_MINUTES", "10"))
75
 
 
76
  SUPPORT_MIN_UNIQUE = int(os.environ.get("SUPPORT_MIN_UNIQUE", "2"))
77
 
78
  # New: DRY RUN (tidak kirim apa pun ke TARGET_CHAT)
@@ -81,30 +81,34 @@ DRY_RUN = os.environ.get("DRY_RUN", "0") == "1"
81
  # Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
82
  BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
83
 
 
 
 
84
 
85
- # ========= Client bootstrap =========
86
- def build_client() -> TelegramClient:
87
- if STRING_SESSION:
88
- print(">> Using StringSession (persistent).")
89
- return TelegramClient(StringSession(STRING_SESSION), API_ID, API_HASH)
90
- print(">> Using MemorySession (login tiap run).")
91
- return TelegramClient(MemorySession(), API_ID, API_HASH)
92
 
93
- client = build_client()
94
- recent_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE)
95
- recent_content_hashes: deque[str] = deque(maxlen=DEDUP_BUFFER_SIZE) # NEW: content-only dedup
96
 
97
- # Peta id_chat -> "core" / "support"
98
- chat_roles: Dict[int, str] = {} # diisi saat startup setelah resolve entity
99
  startup_time_utc = datetime.now(timezone.utc)
100
 
 
 
101
 
102
- # ========= Persistence (SQLite) =========
103
- DB_PATH = os.environ.get("BOTSIGNAL_DB", "/tmp/botsignal.db")
 
 
 
104
 
105
  def _db():
106
  conn = sqlite3.connect(DB_PATH)
107
  conn.execute("PRAGMA journal_mode=WAL;")
 
108
  return conn
109
 
110
  def _init_db():
@@ -137,83 +141,29 @@ def db_load_state():
137
  conn.close()
138
  return last, kw_map
139
 
140
- def db_save_last_posted(keyword: str, msg_id: int, tier: str):
141
- conn = _db()
142
- conn.execute("INSERT INTO last_posted(keyword, msg_id, tier) VALUES(?,?,?) "
143
- "ON CONFLICT(keyword) DO UPDATE SET msg_id=excluded.msg_id, tier=excluded.tier",
144
- (keyword, msg_id, tier))
145
- conn.commit()
146
- conn.close()
147
-
148
- def db_upsert_kw_seen(keyword: str, group_key: str, ts: datetime):
149
  conn = _db()
150
- conn.execute("INSERT INTO kw_group_seen(keyword, group_key, last_ts) VALUES(?,?,?) "
151
- "ON CONFLICT(keyword, group_key) DO UPDATE SET last_ts=excluded.last_ts",
152
- (keyword, group_key, int(ts.timestamp())))
 
153
  conn.commit()
154
  conn.close()
155
 
156
- def db_prune_expired(cutoff: datetime):
157
  conn = _db()
158
- conn.execute("DELETE FROM kw_group_seen WHERE last_ts < ?", (int(cutoff.timestamp()),))
 
 
 
159
  conn.commit()
160
  conn.close()
161
 
162
-
163
- # ========= Utilities =========
164
- def debug_log(reason: str, content: str = "") -> None:
165
- short = (content or "").replace("\n", " ")[:160]
166
- print(f"[DEBUG] {reason}: {short}")
167
-
168
- def normalize_for_filter(text: str) -> str:
169
- if not text:
170
- return ""
171
- s = re.sub(r"(?m)^>.*", "", text)
172
- s = re.sub(r"\s+", " ", s).strip()
173
- return s
174
-
175
- def _tokenize_words(s: str) -> List[str]:
176
- return re.findall(r"[a-z0-9\$\#]{1,64}", s.lower())
177
-
178
- def _windows(tokens: List[str], size: int = 20):
179
- for i in range(0, len(tokens), size):
180
- yield " ".join(tokens[i:i+size])
181
-
182
- # --- Tambahan: bersihkan URL/CA untuk kepentingan SCORING relevansi ---
183
- CA_SOL_RE = re.compile(r"\b[1-9A-HJ-NP-Za-km-z]{32,48}\b") # Solana base58 (perkiraan)
184
- CA_EVM_RE = re.compile(r"\b0x[a-fA-F0-9]{40}\b") # EVM address
185
- CA_LABEL_RE = re.compile(r"\bCA\s*[:=]\s*\S+", re.IGNORECASE) # "CA: ..." potong tokennya
186
-
187
- def _strip_urls_and_mentions(s: str) -> str:
188
- s = re.sub(r"https?://\S+", "", s)
189
- s = re.sub(r"t\.me/[A-Za-z0-9_]+", "", s)
190
- s = re.sub(r"@[A-Za-z0-9_]+", "", s)
191
- return re.sub(r"\s+", " ", s).strip()
192
-
193
- def strip_contracts_for_scoring(s: str) -> str:
194
- """
195
- Hilangkan URL/mention, alamat kontrak, dan token setelah 'CA:'
196
- agar kata 'pump' pada CA/URL (mis. pump.fun) tidak memengaruhi skor.
197
- """
198
- s0 = _strip_urls_and_mentions(s)
199
- s1 = CA_LABEL_RE.sub(" ", s0)
200
- s2 = CA_EVM_RE.sub(" ", s1)
201
- s3 = CA_SOL_RE.sub(" ", s2)
202
- return re.sub(r"\s+", " ", s3).strip()
203
-
204
- def score_relevance(text: str, keywords: List[str]) -> float:
205
- """Skor: exact keyword + fuzzy windowed (top-3 rata-rata) agar adil untuk teks panjang."""
206
- if not text:
207
- return 0.0
208
-
209
- # Gunakan versi yang TIDAK mengandung URL/CA agar 'pump' di CA tidak ikut dihitung
210
- t = strip_contracts_for_scoring(text).lower()
211
-
212
- # exact hits (unik)
213
- exact_hits = 0
214
- for kw in set(keywords):
215
- if kw in t or re.search(rf"\b{re.escape(kw)}\b", t):
216
- exact_hits += 1
217
  exact_score = exact_hits * KEYWORD_WEIGHT
218
 
219
  # fuzzy windowed: ambil top-3 skor di antara jendela 20 token
@@ -245,89 +195,53 @@ def hash_for_dedup(text: str, msg) -> str:
245
  if doc and getattr(doc, "id", None) is not None:
246
  parts.append(f"doc:{doc.id}")
247
  if getattr(msg, "photo", None) is not None:
248
- ph = msg.photo
249
- ph_id = getattr(ph, "id", None)
250
- if ph_id is not None:
251
- parts.append(f"photo:{ph_id}")
252
- raw = "|".join(parts).encode("utf-8", errors="ignore")
253
- return hashlib.sha1(raw).hexdigest()
254
 
255
  def content_only_hash(text: str) -> str:
256
- """Hash berbasis isi saja (untuk lintas-grup crosspost)."""
257
- norm = _strip_urls_mentions_only(normalize_for_filter(text))
258
- return hashlib.sha1(norm.encode("utf-8", errors="ignore")).hexdigest()
259
-
 
 
 
 
 
 
 
 
260
  def is_image_message(msg) -> bool:
261
  if getattr(msg, "photo", None) is not None:
262
  return True
263
  doc = getattr(msg, "document", None)
264
- if doc and getattr(doc, "mime_type", None):
265
- mt = (doc.mime_type or "").lower()
266
- if mt.startswith("image/"):
267
- if SKIP_STICKERS and ("webp" in mt or "sticker" in mt):
268
- return False
269
- return True
270
- if not ALLOW_GIFS_VIDEOS:
271
- return False
272
- if mt in ("video/mp4", "image/gif"):
273
- return True
274
- return False
275
 
276
  def media_too_big(msg) -> bool:
277
  doc = getattr(msg, "document", None)
278
- if doc and getattr(doc, "size", None):
279
- return (doc.size or 0) > MAX_MEDIA_MB * 1024 * 1024
280
- return False
281
-
282
-
283
- # ========= Class aggregator (windowed unique groups) =========
284
- keyword_group_last_seen: defaultdict[str, dict[str, datetime]] = defaultdict(dict)
285
-
286
- def _prune_expired(now: datetime) -> None:
287
- window = timedelta(minutes=CLASS_WINDOW_MINUTES)
288
- cutoff = now - window
289
- # in-memory prune
290
- for kw, m in list(keyword_group_last_seen.items()):
291
- for gk, ts in list(m.items()):
292
- if ts < cutoff:
293
- del m[gk]
294
- if not m:
295
- del keyword_group_last_seen[kw]
296
- # db prune
297
- db_prune_expired(cutoff)
298
-
299
- def update_and_classify(keyword: str, group_key: str, now: Optional[datetime] = None) -> Tuple[str, int]:
300
- if not now:
301
- now = datetime.now(timezone.utc)
302
- _prune_expired(now)
303
-
304
- bucket = keyword_group_last_seen[keyword]
305
- bucket[group_key] = now
306
- db_upsert_kw_seen(keyword, group_key, now)
307
-
308
- unique_groups = len(bucket)
309
- if unique_groups >= 4:
310
- return "kuat", unique_groups
311
- elif unique_groups >= 2:
312
- return "sedang", unique_groups
313
- else:
314
- return "rendah", unique_groups
315
-
316
 
317
- # ========= Sentence-level invite filter (smarter) =========
318
  INVITE_PATTERNS = [
319
- r"\bjoin\b", r"\bjoin (us|our|channel|group)\b",
320
- r"\bdm\b", r"\bdm (me|gw|gue|gua|saya|admin)\b",
321
- r"\bpm\b", r"\binbox\b", r"\bcontact\b", r"\bkontak\b", r"\bhubungi\b",
322
- r"\bvip\b", r"\bpremium\b", r"\bberbayar\b", r"\bpaid\b", r"\bexclusive\b",
323
- r"\bwhitelist\b", r"\bprivate( group| channel)?\b", r"\bmembership?\b",
324
- r"\bsubscribe\b", r"\blangganan\b",
325
- r"(t\.me\/joinchat|t\.me\/\+|telegram\.me\/|discord\.gg\/|wa\.me\/|whatsapp\.com\/)",
326
- r"(bit\.ly|tinyurl\.com|linktr\.ee)",
327
- r"From TG channel:",
328
  r"\bpromo\b", r"\bpromosi\b", r"\biklan\b",
329
- r"\badvert\b", r"\badvertise\b", r"\badvertisement\b",
330
- r"t\.me\/[A-Za-z0-9_]+"
331
  ]
332
  INVITE_REGEXES = [re.compile(p, re.IGNORECASE) for p in INVITE_PATTERNS]
333
 
@@ -343,7 +257,7 @@ def _is_invite_sentence(s: str) -> bool:
343
  t = s.strip()
344
  if not t:
345
  return False
346
- # Jika kalimat memuat sinyal kuat, jangan dibuang walau ada kata invite
347
  if any(r.search(t) for r in WHITELIST_REGEXES):
348
  return False
349
  # Jika ada 1+ pola ajakan, buang
@@ -358,77 +272,35 @@ def filter_invite_sentences(text: str) -> str:
358
  cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
359
  return cleaned
360
 
 
 
 
 
 
 
 
 
361
 
362
- # ========= Post-on-threshold with EDIT (persisted) =========
363
- TIER_ORDER = {"rendah": 0, "sedang": 1, "kuat": 2}
364
- last_posted: Dict[str, Dict[str, object]] = {}
365
-
366
- async def _send_initial(msg, text: str) -> int:
367
  if DRY_RUN:
368
- print("[DRY_RUN] send_initial:", text[:140])
369
- return -1
370
- # kirim media bila ada & allowed
371
- if INCLUDE_MEDIA and is_image_message(msg) and not media_too_big(msg):
372
- try:
373
- if getattr(msg, "photo", None):
374
- m = await client.send_file(TARGET_CHAT, msg.photo, caption=text, caption_entities=None, force_document=False)
375
- return m.id
376
- doc = getattr(msg, "document", None)
377
- if doc:
378
- data = await client.download_media(msg, file=bytes)
379
- if data:
380
- bio = io.BytesIO(data)
381
- ext = ".jpg"
382
- mt = (getattr(doc, "mime_type", "") or "").lower()
383
- if mt:
384
- ext_guess = guess_extension(mt) or ".jpg"
385
- if ext_guess == ".jpe":
386
- ext_guess = ".jpg"
387
- ext = ext_guess
388
- bio.name = f"media{ext}"
389
- m = await client.send_file(TARGET_CHAT, bio, caption=text, caption_entities=None, force_document=False)
390
- return m.id
391
- except FloodWaitError as e:
392
- await asyncio.sleep(e.seconds + 1)
393
- return await _send_initial(msg, text)
394
- except Exception as e:
395
- debug_log("Gagal kirim media awal, fallback text", str(e))
396
- try:
397
- m = await client.send_message(TARGET_CHAT, text, link_preview=True)
398
- return m.id
399
- except FloodWaitError as e:
400
- await asyncio.sleep(e.seconds + 1)
401
- return await _send_initial(msg, text)
402
-
403
- async def post_or_update(keyword: str, body: str, new_tier: str, src_msg) -> None:
404
- prefix = f"[{new_tier.upper()}] "
405
- text = prefix + body
406
- prev = last_posted.get(keyword)
407
- if not prev:
408
- msg_id = await _send_initial(src_msg, text)
409
- last_posted[keyword] = {"msg_id": msg_id, "tier": new_tier}
410
- if msg_id != -1:
411
- db_save_last_posted(keyword, msg_id, new_tier)
412
  return
413
 
414
- if TIER_ORDER.get(new_tier, 0) > TIER_ORDER.get(prev["tier"], 0):
 
 
 
 
 
 
 
415
  try:
416
- await client.edit_message(TARGET_CHAT, prev["msg_id"], text)
417
- prev["tier"] = new_tier
418
- if prev["msg_id"] != -1:
419
- db_save_last_posted(keyword, prev["msg_id"], new_tier)
420
- except FloodWaitError as e:
421
- await asyncio.sleep(e.seconds + 1)
422
- await post_or_update(keyword, body, new_tier, src_msg)
423
  except Exception as e:
424
- debug_log("Edit gagal, fallback kirim baru", str(e))
425
- msg_id = await _send_initial(src_msg, text)
426
- last_posted[keyword] = {"msg_id": msg_id, "tier": new_tier}
427
- if msg_id != -1:
428
- db_save_last_posted(keyword, msg_id, new_tier)
429
- else:
430
- pass # no-op
431
-
432
 
433
  # ========= Core actions (fallback kept) =========
434
  async def send_as_is(msg, text_override: Optional[str] = None) -> None:
@@ -456,78 +328,32 @@ async def send_as_is(msg, text_override: Optional[str] = None) -> None:
456
  ext = ".jpg"
457
  mt = (getattr(doc, "mime_type", "") or "").lower()
458
  if mt:
459
- ext_guess = guess_extension(mt) or ".jpg"
460
- if ext_guess == ".jpe":
461
- ext_guess = ".jpg"
462
- ext = ext_guess
463
  bio.name = f"media{ext}"
464
  await client.send_file(TARGET_CHAT, bio, caption=orig_text, caption_entities=entities, force_document=False)
465
  return
466
- except FloodWaitError as e:
467
- await asyncio.sleep(e.seconds + 1)
468
  except Exception as e:
469
- debug_log("Gagal kirim sebagai media, fallback ke text", str(e))
470
-
471
- try:
472
- await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
473
- except FloodWaitError as e:
474
- await asyncio.sleep(e.seconds + 1)
475
- await client.send_message(TARGET_CHAT, orig_text, formatting_entities=entities, link_preview=True)
476
 
 
477
 
478
- # ========= Keyword extraction ($ticker-aware) =========
479
- TICKER_CLEAN_RE = re.compile(r"\$[A-Za-z0-9]{2,12}")
480
- TICKER_NOISY_RE = re.compile(r"\$[A-Za-z0-9](?:[^A-Za-z0-9]+[A-Za-z0-9]){1,11}")
481
-
482
- def _extract_tickers(text_norm: str) -> List[str]:
483
- """
484
- Ambil $TICKER dengan dua cara:
485
- - Bersih: $ABC, $JBCOIN
486
- - Noisy: $J*BCOIN -> dinormalisasi jadi $JBCOIN untuk *keyword* saja.
487
- (Teks asli tetap dikirim apa adanya.)
488
- """
489
- found = []
490
-
491
- # bersih
492
- for m in TICKER_CLEAN_RE.finditer(text_norm):
493
- found.append(m.group(0).lower())
494
-
495
- # noisy -> normalisasi internal
496
- for m in TICKER_NOISY_RE.finditer(text_norm):
497
- raw = m.group(0)
498
- norm = "$" + re.sub(r"[^A-Za-z0-9]+", "", raw[1:])
499
- if 3 <= len(norm) <= 13: # termasuk '$'
500
- found.append(norm.lower())
501
-
502
- # unik & pertahankan urutan
503
- seen = set()
504
- uniq = []
505
- for x in found:
506
- if x not in seen:
507
- uniq.append(x)
508
- seen.add(x)
509
- return uniq
510
 
511
  def _extract_all_keywords(text_norm: str) -> List[str]:
512
- """
513
- Deteksi SEMUA keyword dari THEME_KEYWORDS + $ticker.
514
- Tidak menghapus simbol '$' (sesuai permintaan).
515
- """
516
- # toleran untuk pencarian keyword tema (seperti semula)
517
- t = re.sub(r"\$([a-z0-9]+)", r"\1", text_norm, flags=re.I)
518
-
519
  found = []
520
- for kw in THEME_KEYWORDS:
521
- if re.search(rf"(^|\W){re.escape(kw)}(\W|$)", t, flags=re.I):
522
- found.append(kw.lower())
523
-
524
- # gabungkan hasil $ticker
525
- tickers = _extract_tickers(text_norm)
526
- found.extend(tickers)
527
-
528
- # unik dengan urutan muncul pertama
529
- uniq = []
530
  seen = set()
 
531
  for kw in found:
532
  if kw not in seen:
533
  uniq.append(kw)
@@ -549,7 +375,9 @@ def _choose_dominant_keyword(text_norm: str, kws: List[str]) -> Optional[str]:
549
  return chosen
550
 
551
  def _role_of(chat_id: int) -> str:
552
- return chat_roles.get(chat_id, "core")
 
 
553
 
554
  def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
555
  """
@@ -559,12 +387,38 @@ def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
559
  bucket = keyword_group_last_seen.get(keyword, {})
560
  core_ids, sup_ids = set(), set()
561
  for gk in bucket.keys():
562
- role = chat_roles.get(int(gk), "core")
563
  (core_ids if role == "core" else sup_ids).add(gk)
564
  return len(core_ids), len(sup_ids)
565
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566
 
567
- async def process_message(msg, source_chat_id: int) -> None:
 
 
 
 
 
 
 
 
 
 
 
 
568
  """
569
  Filter, content-dedup, relevansi, multi-kw -> pilih dominan,
570
  agregasi tier, gating support (CORE-anchored), filter ajakan, dan POST/EDIT.
@@ -583,12 +437,16 @@ async def process_message(msg, source_chat_id: int) -> None:
583
  if ch in recent_content_hashes:
584
  debug_log("Content-duplicate (global), dilewati", orig_text)
585
  return
586
- recent_content_hashes.append(ch)
587
-
588
- # Dedup lama (per pesan/media)
589
- h = hash_for_dedup(text_norm, msg)
 
 
 
 
590
  if h in recent_hashes:
591
- debug_log("Duplikat (pesan/media), dilewati", orig_text)
592
  return
593
  recent_hashes.append(h)
594
 
@@ -612,22 +470,19 @@ async def process_message(msg, source_chat_id: int) -> None:
612
  now = datetime.now(timezone.utc)
613
  class_label, unique_groups = update_and_classify(main_kw, group_key, now)
614
 
615
- # Gating SUPPORT (CORE-anchored)
616
  if role == "support":
617
  core_u, sup_u = _unique_counts_by_role(main_kw)
618
- # Aturan:
619
- # - Jika sudah ada minimal 1 sebutan dari CORE untuk keyword ini -> izinkan.
620
- # - Jika belum ada anchor CORE, SUPPORT harus >= SUPPORT_MIN_UNIQUE.
621
- if core_u >= 1:
622
- pass
623
- elif sup_u < SUPPORT_MIN_UNIQUE:
624
- debug_log(f"Support ditahan (core_u={core_u}, sup_u={sup_u} < {SUPPORT_MIN_UNIQUE})", orig_text)
625
- return
626
 
627
- # Filter kalimat ajakan (whitelist-aware)
628
- cleaned_body = filter_invite_sentences(orig_text)
629
- if not cleaned_body.strip():
630
- debug_log("Semua kalimat terfilter (kosong), dilewati", orig_text)
631
  return
632
 
633
  # Backfill safety: saat startup, hindari pesan yang terlalu lama
@@ -638,9 +493,21 @@ async def process_message(msg, source_chat_id: int) -> None:
638
  debug_log("Lama (lewat cutoff backfill safety), dilewati", orig_text)
639
  return
640
 
641
- await post_or_update(main_kw, cleaned_body, class_label, msg)
642
  debug_log(f"Posted/Edited (role={role}, unique_groups={unique_groups}, kw={main_kw}, tier={class_label})", orig_text)
643
 
 
 
 
 
 
 
 
 
 
 
 
 
644
 
645
  async def backfill_history(entity, limit: int) -> None:
646
  if limit <= 0:
@@ -652,17 +519,6 @@ async def backfill_history(entity, limit: int) -> None:
652
  except Exception as e:
653
  debug_log("Error saat memproses backfill", str(e))
654
 
655
-
656
- # ========= Event handlers =========
657
- @client.on(events.NewMessage(chats=SOURCE_CHATS))
658
- async def on_new_message(event):
659
- try:
660
- await process_message(event.message, source_chat_id=event.chat_id)
661
- except Exception as e:
662
- print("Process error:", e)
663
-
664
-
665
- # ========= Entry points =========
666
  async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
667
  resolved = []
668
  for src in raw_list:
@@ -671,37 +527,16 @@ async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
671
  resolved.append(ent)
672
  chat_roles[int(ent.id)] = role_label
673
  except Exception as e:
674
- print(f"Gagal resolve sumber {src}: {e}")
675
  return resolved
676
 
677
- async def start_bot_background() -> None:
678
- await client.start()
679
- _init_db()
680
-
681
- # Load persisted state
682
- global last_posted, keyword_group_last_seen
683
- last_posted, keyword_group_last_seen = db_load_state()
684
-
685
- resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
686
- resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
687
- resolved_sources = resolved_core + resolved_support
688
-
689
- for ent in resolved_sources:
690
- try:
691
- await backfill_history(ent, INITIAL_BACKFILL)
692
- except Exception as e:
693
- print(f"Backfill gagal untuk {ent}: {e}")
694
-
695
- print("Kurator berjalan (background task). Menunggu pesan baru...")
696
- asyncio.create_task(client.run_until_disconnected())
697
-
698
- async def app_main() -> None:
699
  await client.start()
700
  _init_db()
 
701
 
702
- global last_posted, keyword_group_last_seen
703
- last_posted, keyword_group_last_seen = db_load_state()
704
-
705
  resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
706
  resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
707
  resolved_sources = resolved_core + resolved_support
@@ -712,6 +547,5 @@ async def app_main() -> None:
712
  print("Kurator berjalan. Menunggu pesan baru... (Stop dengan interrupt).")
713
  await client.run_until_disconnected()
714
 
715
-
716
  if __name__ == "__main__":
717
  asyncio.run(app_main())
 
1
+ # botsignal.py (patched)
2
  import asyncio
3
  import os
4
  import re
 
15
  from telethon.sessions import StringSession, MemorySession
16
  from telethon.errors.rpcerrorlist import FloodWaitError
17
 
 
 
18
  API_ID = int(os.environ.get("API_ID", "0"))
19
  API_HASH = os.environ.get("API_HASH", "")
20
+ SESSION = os.environ.get("SESSION", "")
21
+ TARGET_CHAT = os.environ.get("TARGET_CHAT", "") # @username atau chat id
22
+ # Sumber
23
+ CORE_CHATS = [s.strip() for s in os.environ.get("CORE_CHATS", "").split(",") if s.strip()]
24
+ SUPPORT_CHATS = [s.strip() for s in os.environ.get("SUPPORT_CHATS", "").split(",") if s.strip()]
25
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  SOURCE_CHATS = CORE_CHATS + SUPPORT_CHATS
27
 
28
+ DB_PATH = os.environ.get("DB_PATH", "botsignal.db")
29
+
30
+ # ====== Tokenisasi / relevansi ======
31
+ def _tokenize_words(s: str) -> List[str]:
32
+ return re.findall(r"[a-zA-Z0-9\$\#]{2,}", s or "")
33
+
34
+ def normalize_for_filter(s: str) -> str:
35
+ if not s:
36
+ return ""
37
+ t = s
38
+ # netralkan alamat kontrak (sol/eth panjang)
39
+ t = re.sub(r"\b[1-9A-HJ-NP-Za-km-z]{25,}\b", "CA", t)
40
+ # hapus url/mention
41
+ t = _strip_urls_and_mentions(t)
42
+ return t
43
+
44
+ URL_REGEX = re.compile(r"(https?:\/\/\S+)", re.IGNORECASE)
45
+ MENTION_REGEX = re.compile(r"@\w+", re.IGNORECASE)
46
+
47
+ def _strip_urls_and_mentions(s: str) -> str:
48
+ s = URL_REGEX.sub("", s)
49
+ s = MENTION_REGEX.sub("", s)
50
+ return s
51
 
 
52
  THEME_KEYWORDS = [
53
  "call", "signal", "entry", "buy", "sell", "tp", "sl",
54
  "pump", "spot", "futures", "setup",
 
72
 
73
  CLASS_WINDOW_MINUTES = int(os.environ.get("CLASS_WINDOW_MINUTES", "10"))
74
 
75
+ # Dulu dipakai untuk bypass support-only; sekarang hanya aktif bila ALLOW_SUPPORT_SOLO=1
76
  SUPPORT_MIN_UNIQUE = int(os.environ.get("SUPPORT_MIN_UNIQUE", "2"))
77
 
78
  # New: DRY RUN (tidak kirim apa pun ke TARGET_CHAT)
 
81
  # Backfill buffer: abaikan pesan lebih tua dari (startup_time - buffer)
82
  BACKFILL_BUFFER_MINUTES = int(os.environ.get("BACKFILL_BUFFER_MINUTES", "3"))
83
 
84
+ # >>> Tambahan ENV untuk mode support-only (default OFF, strict core-anchored) <<<
85
+ ALLOW_SUPPORT_SOLO = os.environ.get("ALLOW_SUPPORT_SOLO", "0") == "1"
86
+ SUPPORT_SOLO_MIN_UNIQUE = int(os.environ.get("SUPPORT_SOLO_MIN_UNIQUE", "99"))
87
 
88
+ # ====== Client ======
89
+ if SESSION:
90
+ session = StringSession(SESSION)
91
+ else:
92
+ session = MemorySession()
 
 
93
 
94
+ client = TelegramClient(session, API_ID, API_HASH)
 
 
95
 
96
+ # ====== State ======
 
97
  startup_time_utc = datetime.now(timezone.utc)
98
 
99
+ recent_hashes = deque(maxlen=DEDUP_BUFFER_SIZE)
100
+ recent_content_hashes = set()
101
 
102
+ # chat_roles: chat_id -> "core"/"support"
103
+ chat_roles: Dict[int, str] = {}
104
+
105
+ # agregasi keyword -> group_id -> last_seen_utc
106
+ keyword_group_last_seen: defaultdict[str, dict[str, datetime]] = defaultdict(dict)
107
 
108
  def _db():
109
  conn = sqlite3.connect(DB_PATH)
110
  conn.execute("PRAGMA journal_mode=WAL;")
111
+ conn.execute("PRAGMA synchronous=NORMAL;")
112
  return conn
113
 
114
  def _init_db():
 
141
  conn.close()
142
  return last, kw_map
143
 
144
+ def db_save_group_seen(keyword: str, group_key: str, ts: datetime):
 
 
 
 
 
 
 
 
145
  conn = _db()
146
+ conn.execute(
147
+ "INSERT OR REPLACE INTO kw_group_seen(keyword, group_key, last_ts) VALUES (?, ?, ?)",
148
+ (keyword, group_key, int(ts.timestamp())),
149
+ )
150
  conn.commit()
151
  conn.close()
152
 
153
+ def db_save_last_posted(keyword: str, msg_id: int, tier: str):
154
  conn = _db()
155
+ conn.execute(
156
+ "INSERT OR REPLACE INTO last_posted(keyword, msg_id, tier) VALUES (?, ?, ?)",
157
+ (keyword, msg_id, tier),
158
+ )
159
  conn.commit()
160
  conn.close()
161
 
162
+ def score_relevance(t: str, keywords: List[str]) -> float:
163
+ # exact: jumlah kemunculan kw
164
+ exact_hits = sum(
165
+ len(re.findall(rf"(^|\W){re.escape(kw)}(\W|$)", t, flags=re.I)) for kw in keywords
166
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  exact_score = exact_hits * KEYWORD_WEIGHT
168
 
169
  # fuzzy windowed: ambil top-3 skor di antara jendela 20 token
 
195
  if doc and getattr(doc, "id", None) is not None:
196
  parts.append(f"doc:{doc.id}")
197
  if getattr(msg, "photo", None) is not None:
198
+ parts.append("has_photo")
199
+ h = hashlib.sha256("|".join(parts).encode("utf-8")).hexdigest()
200
+ return h
 
 
 
201
 
202
  def content_only_hash(text: str) -> str:
203
+ t = text or ""
204
+ # netralkan angka/CA/url/mention agar “teks ajakan” sama ke-dedup
205
+ t = re.sub(r"\b\d{2,}\b", "NUM", t)
206
+ t = re.sub(r"\b[1-9A-HJ-NP-Za-km-z]{25,}\b", "CA", t)
207
+ t = _strip_urls_and_mentions(t)
208
+ return hashlib.sha256(t.encode("utf-8")).hexdigest()
209
+
210
+ def _windows(tokens: List[str], k: int):
211
+ for i in range(0, len(tokens), max(1, k // 2)):
212
+ yield " ".join(tokens[i:i + k])
213
+
214
+ # ====== Media helpers ======
215
  def is_image_message(msg) -> bool:
216
  if getattr(msg, "photo", None) is not None:
217
  return True
218
  doc = getattr(msg, "document", None)
219
+ if not doc:
220
+ return False
221
+ mt = (getattr(doc, "mime_type", "") or "").lower()
222
+ if mt.startswith("image/"):
223
+ return True
224
+ if not ALLOW_GIFS_VIDEOS and (mt.startswith("video/") or mt == "image/gif"):
225
+ return False
226
+ return True
 
 
 
227
 
228
  def media_too_big(msg) -> bool:
229
  doc = getattr(msg, "document", None)
230
+ if not doc:
231
+ return False
232
+ size = getattr(doc, "size", None)
233
+ if size is None:
234
+ return False
235
+ return (size / (1024 * 1024)) > MAX_MEDIA_MB
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
+ # ====== Ajakan filter ======
238
  INVITE_PATTERNS = [
239
+ r"\bjoin\b", r"\bdm\b", r"\binbox\b", r"\bpm\b", r"\bvip\b",
240
+ r"\bcontact\b", r"\bpromo\b", r"\bpaid\b", r"@",
241
+ r"t\.me\/", r"telegram\.me\/", r"\blink\b", r"\bklik\b", r"\bhubungi\b",
 
 
 
 
 
 
242
  r"\bpromo\b", r"\bpromosi\b", r"\biklan\b",
243
+ r"\badvert\b", r"\badvertise\b", r"\badvertisement\b"
244
+
245
  ]
246
  INVITE_REGEXES = [re.compile(p, re.IGNORECASE) for p in INVITE_PATTERNS]
247
 
 
257
  t = s.strip()
258
  if not t:
259
  return False
260
+ # (ASAL) Jika kalimat memuat sinyal kuat, jangan dibuang walau ada kata invite
261
  if any(r.search(t) for r in WHITELIST_REGEXES):
262
  return False
263
  # Jika ada 1+ pola ajakan, buang
 
272
  cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
273
  return cleaned
274
 
275
+ # ====== Posting ======
276
+ async def post_or_update(keyword: str, body: str, class_label: str, msg):
277
+ """
278
+ Post baru jika belum ada, jika sudah ada posting untuk keyword tsb → edit.
279
+ """
280
+ # muat last posted dari DB
281
+ last_posted, _ = db_load_state()
282
+ last = last_posted.get(keyword)
283
 
 
 
 
 
 
284
  if DRY_RUN:
285
+ print(f"[DRY_RUN] ({class_label}) [{keyword}] {body[:160]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  return
287
 
288
+ if not last:
289
+ # post baru
290
+ sent = await client.send_message(TARGET_CHAT, body)
291
+ db_save_last_posted(keyword, sent.id, class_label)
292
+ return
293
+ else:
294
+ # edit posting lama
295
+ last_msg_id = last["msg_id"]
296
  try:
297
+ await client.edit_message(TARGET_CHAT, last_msg_id, body)
298
+ db_save_last_posted(keyword, last_msg_id, class_label)
 
 
 
 
 
299
  except Exception as e:
300
+ print("Edit failed, posting baru:", e)
301
+ sent = await client.send_message(TARGET_CHAT, body)
302
+ db_save_last_posted(keyword, sent.id, class_label)
303
+ return
 
 
 
 
304
 
305
  # ========= Core actions (fallback kept) =========
306
  async def send_as_is(msg, text_override: Optional[str] = None) -> None:
 
328
  ext = ".jpg"
329
  mt = (getattr(doc, "mime_type", "") or "").lower()
330
  if mt:
331
+ guessed = guess_extension(mt) or ""
332
+ if guessed:
333
+ ext = guessed
 
334
  bio.name = f"media{ext}"
335
  await client.send_file(TARGET_CHAT, bio, caption=orig_text, caption_entities=entities, force_document=False)
336
  return
337
+ except FloodWaitError as fw:
338
+ print("FloodWait:", fw)
339
  except Exception as e:
340
+ print("send_file err:", e)
 
 
 
 
 
 
341
 
342
+ await client.send_message(TARGET_CHAT, orig_text)
343
 
344
+ # ====== Keyword extraction / aggregation ======
345
+ TICKER_RE = re.compile(r"\$[a-z0-9]{2,10}", re.IGNORECASE)
346
+ COINWORDS = ["btc","eth","sol","bnb","pepe","doge","meme","spot","futures","pump","entry","buy","sell","tp","sl","setup"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  def _extract_all_keywords(text_norm: str) -> List[str]:
 
 
 
 
 
 
 
349
  found = []
350
+ found += [m.group(0).lower() for m in TICKER_RE.finditer(text_norm)]
351
+ for w in COINWORDS:
352
+ if re.search(rf"(^|\W){re.escape(w)}(\W|$)", text_norm, flags=re.I):
353
+ found.append(w)
354
+ # unik, preserve order
 
 
 
 
 
355
  seen = set()
356
+ uniq = []
357
  for kw in found:
358
  if kw not in seen:
359
  uniq.append(kw)
 
375
  return chosen
376
 
377
  def _role_of(chat_id: int) -> str:
378
+ # DULU: default ke "core" → menyebabkan tembus saat resolve gagal.
379
+ # BARU: default ke "support" (atau treat unknown as support)
380
+ return chat_roles.get(chat_id, "support")
381
 
382
  def _unique_counts_by_role(keyword: str) -> Tuple[int, int]:
383
  """
 
387
  bucket = keyword_group_last_seen.get(keyword, {})
388
  core_ids, sup_ids = set(), set()
389
  for gk in bucket.keys():
390
+ role = chat_roles.get(int(gk), "support")
391
  (core_ids if role == "core" else sup_ids).add(gk)
392
  return len(core_ids), len(sup_ids)
393
 
394
+ def update_and_classify(keyword: str, group_key: str, ts: datetime) -> Tuple[str, int]:
395
+ """
396
+ Simpan last_seen dan kembalikan label kelas (HOT/SEDANG/BARU) + jumlah grup unik.
397
+ """
398
+ # purge window kadaluarsa
399
+ cutoff = ts - timedelta(minutes=CLASS_WINDOW_MINUTES)
400
+ bucket = keyword_group_last_seen[keyword]
401
+ expired = [g for g, t in bucket.items() if t < cutoff]
402
+ for g in expired:
403
+ del bucket[g]
404
+
405
+ # update
406
+ bucket[group_key] = ts
407
+ db_save_group_seen(keyword, group_key, ts)
408
 
409
+ # hitung grup unik
410
+ unique_groups = len(bucket)
411
+
412
+ # simple classification
413
+ if unique_groups >= 4:
414
+ label = "🔥 HOT"
415
+ elif unique_groups >= 2:
416
+ label = "☀️ SEDANG"
417
+ else:
418
+ label = "🌱 BARU"
419
+ return label, unique_groups
420
+
421
+ def process_message(msg, source_chat_id: int) -> None:
422
  """
423
  Filter, content-dedup, relevansi, multi-kw -> pilih dominan,
424
  agregasi tier, gating support (CORE-anchored), filter ajakan, dan POST/EDIT.
 
437
  if ch in recent_content_hashes:
438
  debug_log("Content-duplicate (global), dilewati", orig_text)
439
  return
440
+ recent_content_hashes.add(ch)
441
+ if len(recent_content_hashes) > DEDUP_BUFFER_SIZE * 2:
442
+ # jaga ukuran set
443
+ recent_content_hashes.clear()
444
+ recent_content_hashes.add(ch)
445
+
446
+ # Hash per pesan (teks+media) untuk guard cepat
447
+ h = hash_for_dedup(orig_text, msg)
448
  if h in recent_hashes:
449
+ debug_log("Duplicate (hash recent), dilewati", orig_text)
450
  return
451
  recent_hashes.append(h)
452
 
 
470
  now = datetime.now(timezone.utc)
471
  class_label, unique_groups = update_and_classify(main_kw, group_key, now)
472
 
473
+ # Gating SUPPORT (strict core-anchored unless env allows 'solo')
474
  if role == "support":
475
  core_u, sup_u = _unique_counts_by_role(main_kw)
476
+ if core_u < 1:
477
+ # if solo support not allowed or not strong enough, hold
478
+ if (not ALLOW_SUPPORT_SOLO) or (sup_u < SUPPORT_SOLO_MIN_UNIQUE):
479
+ debug_log(f"Support ditahan (butuh anchor core; core_u={core_u}, sup_u={sup_u})", orig_text)
480
+ return
 
 
 
481
 
482
+ # Filter ajakan per-kalimat
483
+ cleaned_body = filter_invite_sentences(orig_text).strip()
484
+ if not cleaned_body:
485
+ debug_log("Habis difilter, kosong dilewati", orig_text)
486
  return
487
 
488
  # Backfill safety: saat startup, hindari pesan yang terlalu lama
 
493
  debug_log("Lama (lewat cutoff backfill safety), dilewati", orig_text)
494
  return
495
 
496
+ asyncio.create_task(post_or_update(main_kw, cleaned_body, class_label, msg))
497
  debug_log(f"Posted/Edited (role={role}, unique_groups={unique_groups}, kw={main_kw}, tier={class_label})", orig_text)
498
 
499
+ # ========= Event handlers =========
500
+ @client.on(events.NewMessage(chats=SOURCE_CHATS))
501
+ async def on_new_message(event):
502
+ try:
503
+ await process_message(event.message, source_chat_id=event.chat_id)
504
+ except Exception as e:
505
+ print("Process error:", e)
506
+
507
+ # ========= Utilities =========
508
+ def debug_log(tag: str, body: str):
509
+ ts = datetime.now().strftime("%H:%M:%S")
510
+ print(f"[{ts}] {tag}: {body[:200].replace(chr(10),' / ')}")
511
 
512
  async def backfill_history(entity, limit: int) -> None:
513
  if limit <= 0:
 
519
  except Exception as e:
520
  debug_log("Error saat memproses backfill", str(e))
521
 
 
 
 
 
 
 
 
 
 
 
 
522
  async def _resolve_and_tag_chats(raw_list, role_label: str) -> list:
523
  resolved = []
524
  for src in raw_list:
 
527
  resolved.append(ent)
528
  chat_roles[int(ent.id)] = role_label
529
  except Exception as e:
530
+ print(f"Gagal resolve {src}: {e}")
531
  return resolved
532
 
533
+ # ========= Entry points =========
534
+ async def app_main():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  await client.start()
536
  _init_db()
537
+ _ = db_load_state() # pre-load caches
538
 
539
+ # resolve semua chats dan tag role
 
 
540
  resolved_core = await _resolve_and_tag_chats(CORE_CHATS, "core")
541
  resolved_support = await _resolve_and_tag_chats(SUPPORT_CHATS, "support")
542
  resolved_sources = resolved_core + resolved_support
 
547
  print("Kurator berjalan. Menunggu pesan baru... (Stop dengan interrupt).")
548
  await client.run_until_disconnected()
549
 
 
550
  if __name__ == "__main__":
551
  asyncio.run(app_main())