zirobtc commited on
Commit
7189688
·
1 Parent(s): 3759011

Upload folder using huggingface_hub

Browse files
QUALITY_SCORE_ARCHITECTURE.md CHANGED
@@ -37,10 +37,11 @@ Let `R_max` be the token's lifetime max return multiple (e.g., ATH / launch).
37
 
38
  Use coarse buckets for the bulk and finer buckets for the tail, e.g.:
39
  - B0: `R_max < 3`
40
- - B1: `3 <= R_max < 10`
41
- - B2: `10 <= R_max < 20`
42
- - B3: `20 <= R_max < 100`
43
- - B4: `100 <= R_max < 10_000`
 
44
 
45
  Notes:
46
  - If a bucket has too few samples, merge with a neighbor.
 
37
 
38
  Use coarse buckets for the bulk and finer buckets for the tail, e.g.:
39
  - B0: `R_max < 3`
40
+ - B1: `3 <= R_max < 5`
41
+ - B2: `5 <= R_max < 10`
42
+ - B3: `10 <= R_max < 20`
43
+ - B4: `20 <= R_max < 100`
44
+ - B5: `100 <= R_max < 10_000`
45
 
46
  Notes:
47
  - If a bucket has too few samples, merge with a neighbor.
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7e8559fa0dfc6a9356d4078d582a479a5a3cbf8a3348183b3baf336ef73db25
3
- size 2302
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbb4125009f74179d4f414c0145accbc2cbf3558be6b4d94a850241dd56aaab2
3
+ size 5084
models/vocabulary.py CHANGED
@@ -187,13 +187,10 @@ EXCHANGES_TO_ID = {name: i for i, name in enumerate(EXCHANGES)}
187
  ID_TO_EXCHANGES = {i: name for i, name in enumerate(EXCHANGES)}
188
  NUM_EXCHANGES = len(EXCHANGES)
189
 
190
- # --- NEW: Return Class Thresholds ---
191
- # Class 0: 0 - 3x
192
- # Class 1: 3 - 10x
193
- # Class 2: 10 - 20x
194
- # Class 3: 20 - 100x
195
- # Class 4: 100 - 10,000x
196
- RETURN_THRESHOLDS = [0, 3, 10, 20, 100, 10000]
197
-
198
- # Class 5: Manipulated (High return but suspicious metrics)
199
- MANIPULATED_CLASS_ID = 5
 
187
  ID_TO_EXCHANGES = {i: name for i, name in enumerate(EXCHANGES)}
188
  NUM_EXCHANGES = len(EXCHANGES)
189
 
190
+ # Return buckets used across analysis/scoring scripts.
191
+ # Split 3x-10x into 3x-5x and 5x-10x to reduce within-bucket heterogeneity.
192
+ RETURN_THRESHOLDS = [0, 3, 5, 10, 20, 100, 10000]
193
+ NUM_RETURN_CLASSES = len(RETURN_THRESHOLDS) - 1
194
+
195
+ # Manipulated class (high return but suspicious metrics). Keep its ID immediately after all return buckets.
196
+ MANIPULATED_CLASS_ID = NUM_RETURN_CLASSES
 
 
 
scripts/analyze_distribution.py CHANGED
@@ -224,15 +224,13 @@ def analyze():
224
  segments_tokens[c] = []
225
  segments_tokens[c].append(t)
226
 
227
- # Define Labels
228
- labels = {
229
- 0: "0. Garbage (< 3x)",
230
- 1: "1. Profitable (3x-10x)",
231
- 2: "2. Good (10x-20x)",
232
- 3: "3. Hyped (20x-100x)",
233
- 4: "4. PVE (100x-10kx)",
234
- MANIPULATED_CLASS_ID: "5. MANIPULATED (Fake Metrics)"
235
- }
236
 
237
  # Common SQL parts
238
  # We need a robust base for the WHERE clause variables (fees, vol, holders)
 
224
  segments_tokens[c] = []
225
  segments_tokens[c].append(t)
226
 
227
+ # Define labels from the thresholds so that bucket changes cannot silently desync the printed output.
228
+ labels = {}
229
+ for i in range(len(thresholds) - 1):
230
+ lo = thresholds[i]
231
+ hi = thresholds[i + 1]
232
+ labels[i] = f"{i}. {lo}x - {hi}x"
233
+ labels[MANIPULATED_CLASS_ID] = f"{MANIPULATED_CLASS_ID}. MANIPULATED (Fake Metrics)"
 
 
234
 
235
  # Common SQL parts
236
  # We need a robust base for the WHERE clause variables (fees, vol, holders)
scripts/compute_quality_score.py CHANGED
@@ -177,25 +177,22 @@ def fetch_token_metrics(client) -> List[dict]:
177
  wh.mint_address AS token_address,
178
  (sum(wh.current_balance) / (t.total_supply / pow(10, t.decimals)) * 100) AS insiders_pct
179
  FROM (
180
- SELECT mint_address, wallet_address, argMax(current_balance, updated_at) AS current_balance
 
 
 
 
181
  FROM wallet_holdings
182
  GROUP BY mint_address, wallet_address
183
  ) wh
184
- JOIN (
185
- SELECT
186
- wallet_address,
187
- argMax(total_buys_count, updated_at) AS buys,
188
- argMax(transfers_in_count, updated_at) AS transfers,
189
- argMax(spl_transfers_in_count, updated_at) AS spl_transfers
190
- FROM wallet_profile_metrics
191
- GROUP BY wallet_address
192
- ) wpm ON wh.wallet_address = wpm.wallet_address
193
  JOIN (
194
  SELECT token_address, argMax(total_supply, updated_at) AS total_supply, argMax(decimals, updated_at) AS decimals
195
  FROM tokens
196
  GROUP BY token_address
197
  ) t ON wh.mint_address = t.token_address
198
- WHERE wpm.buys = 0 AND (wpm.transfers > 0 OR wpm.spl_transfers > 0) AND t.total_supply > 0
 
 
199
  GROUP BY wh.mint_address, t.total_supply, t.decimals
200
  )
201
  SELECT
@@ -269,6 +266,10 @@ def _compute_quality_scores(
269
  "q_raw": [],
270
  "feature_pairs": {f[0]: [] for f in feature_defs},
271
  "raw_pairs": {m: [] for m in raw_metrics},
 
 
 
 
272
  }
273
 
274
  # Build bucket mapping
@@ -286,6 +287,18 @@ def _compute_quality_scores(
286
  # Compute percentiles per bucket + feature
287
  token_scores = []
288
  for b, items in buckets.items():
 
 
 
 
 
 
 
 
 
 
 
 
289
  # Precompute percentiles per feature
290
  feature_percentiles: Dict[str, Dict[str, float]] = {}
291
  for fname, fget, _pos in feature_defs:
@@ -515,6 +528,43 @@ def print_diagnostics(debug: dict) -> None:
515
  high_mean = sum(highs) / len(highs)
516
  print(f" {metric}: bottom_mean={low_mean:.4f} top_mean={high_mean:.4f} (n_low={len(lows)}, n_high={len(highs)})")
517
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
 
519
  def main():
520
  parser = argparse.ArgumentParser(description="Compute token quality/health score.")
 
177
  wh.mint_address AS token_address,
178
  (sum(wh.current_balance) / (t.total_supply / pow(10, t.decimals)) * 100) AS insiders_pct
179
  FROM (
180
+ SELECT
181
+ mint_address,
182
+ wallet_address,
183
+ argMax(current_balance, updated_at) AS current_balance,
184
+ argMax(history_transfer_in, updated_at) AS history_transfer_in
185
  FROM wallet_holdings
186
  GROUP BY mint_address, wallet_address
187
  ) wh
 
 
 
 
 
 
 
 
 
188
  JOIN (
189
  SELECT token_address, argMax(total_supply, updated_at) AS total_supply, argMax(decimals, updated_at) AS decimals
190
  FROM tokens
191
  GROUP BY token_address
192
  ) t ON wh.mint_address = t.token_address
193
+ -- Mint-specific "insiders": wallets that received the token via transfer-in at least once.
194
+ -- (Intentionally ignores whether they also bought; per request.)
195
+ WHERE wh.history_transfer_in > 0 AND t.total_supply > 0
196
  GROUP BY wh.mint_address, t.total_supply, t.decimals
197
  )
198
  SELECT
 
266
  "q_raw": [],
267
  "feature_pairs": {f[0]: [] for f in feature_defs},
268
  "raw_pairs": {m: [] for m in raw_metrics},
269
+ # For checking assumptions like "higher return buckets have lower bundled_pct".
270
+ # Store raw metric distributions per return bucket and (ret, metric) pairs overall.
271
+ "bucket_raw": {}, # bucket_id -> metric -> [raw vals]
272
+ "ret_pairs": {m: [] for m in raw_metrics}, # metric -> [(ret, raw_val)]
273
  }
274
 
275
  # Build bucket mapping
 
287
  # Compute percentiles per bucket + feature
288
  token_scores = []
289
  for b, items in buckets.items():
290
+ if with_debug:
291
+ debug["bucket_raw"].setdefault(b, {m: [] for m in raw_metrics})
292
+ for d in items:
293
+ ret_val = d.get("ret")
294
+ for metric in raw_metrics:
295
+ raw_val = d.get(metric)
296
+ if raw_val is None:
297
+ continue
298
+ debug["bucket_raw"][b][metric].append(raw_val)
299
+ if ret_val is not None:
300
+ debug["ret_pairs"][metric].append((ret_val, raw_val))
301
+
302
  # Precompute percentiles per feature
303
  feature_percentiles: Dict[str, Dict[str, float]] = {}
304
  for fname, fget, _pos in feature_defs:
 
528
  high_mean = sum(highs) / len(highs)
529
  print(f" {metric}: bottom_mean={low_mean:.4f} top_mean={high_mean:.4f} (n_low={len(lows)}, n_high={len(highs)})")
530
 
531
+ # Return bucket -> raw metric distributions (answers questions like "do higher-return tokens bundle less?")
532
+ bucket_raw = debug.get("bucket_raw", {})
533
+ if bucket_raw:
534
+ print("\n=== RETURN BUCKET RAW METRICS ===")
535
+ for b in sorted(bucket_raw.keys()):
536
+ print(f"\nSEGMENT: {b}. {_bucket_label(b)}")
537
+ for metric in sorted(bucket_raw[b].keys()):
538
+ vals = [v for v in bucket_raw[b][metric] if v is not None]
539
+ if not vals:
540
+ continue
541
+ stats = _summary_stats(vals)
542
+ # Also report how often the metric is > 0 (useful since many pct metrics are 0).
543
+ nz = sum(1 for v in vals if v > 0)
544
+ nz_rate = nz / len(vals)
545
+ print(
546
+ f" {metric}: mean={stats['mean']:.4f} p50={stats['p50']:.4f} "
547
+ f"p90={stats['p90']:.4f} p99={stats['p99']:.4f} nonzero_rate={nz_rate:.3f} (n={len(vals)})"
548
+ )
549
+
550
+ # Overall return-vs-metric correlation (not bucketed). Use log(ret) to reduce tail leverage.
551
+ ret_pairs = debug.get("ret_pairs", {})
552
+ if ret_pairs:
553
+ print("\n=== RETURN VS RAW METRICS (GLOBAL) ===")
554
+ for metric in sorted(ret_pairs.keys()):
555
+ pairs = ret_pairs[metric]
556
+ xs = []
557
+ ys = []
558
+ for r, v in pairs:
559
+ if r is None or r <= 0:
560
+ continue
561
+ xs.append(math.log(r))
562
+ ys.append(v)
563
+ if len(xs) < 3:
564
+ continue
565
+ corr = _pearson_corr(xs, ys)
566
+ print(f" log(ret) vs {metric}: {corr:.4f} (n={len(xs)})")
567
+
568
 
569
  def main():
570
  parser = argparse.ArgumentParser(description="Compute token quality/health score.")