zirobtc commited on
Commit
7189688
·
1 Parent(s): 3759011

Upload folder using huggingface_hub

Browse files
QUALITY_SCORE_ARCHITECTURE.md CHANGED
@@ -37,10 +37,11 @@ Let `R_max` be the token's lifetime max return multiple (e.g., ATH / launch).
37
 
38
  Use coarse buckets for the bulk and finer buckets for the tail, e.g.:
39
  - B0: `R_max < 3`
40
- - B1: `3 <= R_max < 10`
41
- - B2: `10 <= R_max < 20`
42
- - B3: `20 <= R_max < 100`
43
- - B4: `100 <= R_max < 10_000`
 
44
 
45
  Notes:
46
  - If a bucket has too few samples, merge with a neighbor.
 
37
 
38
  Use coarse buckets for the bulk and finer buckets for the tail, e.g.:
39
  - B0: `R_max < 3`
40
+ - B1: `3 <= R_max < 5`
41
+ - B2: `5 <= R_max < 10`
42
+ - B3: `10 <= R_max < 20`
43
+ - B4: `20 <= R_max < 100`
44
+ - B5: `100 <= R_max < 10_000`
45
 
46
  Notes:
47
  - If a bucket has too few samples, merge with a neighbor.
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7e8559fa0dfc6a9356d4078d582a479a5a3cbf8a3348183b3baf336ef73db25
3
- size 2302
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbb4125009f74179d4f414c0145accbc2cbf3558be6b4d94a850241dd56aaab2
3
+ size 5084
models/vocabulary.py CHANGED
@@ -187,13 +187,10 @@ EXCHANGES_TO_ID = {name: i for i, name in enumerate(EXCHANGES)}
187
  ID_TO_EXCHANGES = {i: name for i, name in enumerate(EXCHANGES)}
188
  NUM_EXCHANGES = len(EXCHANGES)
189
 
190
- # --- NEW: Return Class Thresholds ---
191
- # Class 0: 0 - 3x
192
- # Class 1: 3 - 10x
193
- # Class 2: 10 - 20x
194
- # Class 3: 20 - 100x
195
- # Class 4: 100 - 10,000x
196
- RETURN_THRESHOLDS = [0, 3, 10, 20, 100, 10000]
197
-
198
- # Class 5: Manipulated (High return but suspicious metrics)
199
- MANIPULATED_CLASS_ID = 5
 
187
  ID_TO_EXCHANGES = {i: name for i, name in enumerate(EXCHANGES)}
188
  NUM_EXCHANGES = len(EXCHANGES)
189
 
190
+ # Return buckets used across analysis/scoring scripts.
191
+ # Split 3x-10x into 3x-5x and 5x-10x to reduce within-bucket heterogeneity.
192
+ RETURN_THRESHOLDS = [0, 3, 5, 10, 20, 100, 10000]
193
+ NUM_RETURN_CLASSES = len(RETURN_THRESHOLDS) - 1
194
+
195
+ # Manipulated class (high return but suspicious metrics). Keep its ID immediately after all return buckets.
196
+ MANIPULATED_CLASS_ID = NUM_RETURN_CLASSES
 
 
 
scripts/analyze_distribution.py CHANGED
@@ -224,15 +224,13 @@ def analyze():
224
  segments_tokens[c] = []
225
  segments_tokens[c].append(t)
226
 
227
- # Define Labels
228
- labels = {
229
- 0: "0. Garbage (< 3x)",
230
- 1: "1. Profitable (3x-10x)",
231
- 2: "2. Good (10x-20x)",
232
- 3: "3. Hyped (20x-100x)",
233
- 4: "4. PVE (100x-10kx)",
234
- MANIPULATED_CLASS_ID: "5. MANIPULATED (Fake Metrics)"
235
- }
236
 
237
  # Common SQL parts
238
  # We need a robust base for the WHERE clause variables (fees, vol, holders)
 
224
  segments_tokens[c] = []
225
  segments_tokens[c].append(t)
226
 
227
+ # Define labels from the thresholds so that bucket changes cannot silently desync the printed output.
228
+ labels = {}
229
+ for i in range(len(thresholds) - 1):
230
+ lo = thresholds[i]
231
+ hi = thresholds[i + 1]
232
+ labels[i] = f"{i}. {lo}x - {hi}x"
233
+ labels[MANIPULATED_CLASS_ID] = f"{MANIPULATED_CLASS_ID}. MANIPULATED (Fake Metrics)"
 
 
234
 
235
  # Common SQL parts
236
  # We need a robust base for the WHERE clause variables (fees, vol, holders)
scripts/compute_quality_score.py CHANGED
@@ -177,25 +177,22 @@ def fetch_token_metrics(client) -> List[dict]:
177
  wh.mint_address AS token_address,
178
  (sum(wh.current_balance) / (t.total_supply / pow(10, t.decimals)) * 100) AS insiders_pct
179
  FROM (
180
- SELECT mint_address, wallet_address, argMax(current_balance, updated_at) AS current_balance
 
 
 
 
181
  FROM wallet_holdings
182
  GROUP BY mint_address, wallet_address
183
  ) wh
184
- JOIN (
185
- SELECT
186
- wallet_address,
187
- argMax(total_buys_count, updated_at) AS buys,
188
- argMax(transfers_in_count, updated_at) AS transfers,
189
- argMax(spl_transfers_in_count, updated_at) AS spl_transfers
190
- FROM wallet_profile_metrics
191
- GROUP BY wallet_address
192
- ) wpm ON wh.wallet_address = wpm.wallet_address
193
  JOIN (
194
  SELECT token_address, argMax(total_supply, updated_at) AS total_supply, argMax(decimals, updated_at) AS decimals
195
  FROM tokens
196
  GROUP BY token_address
197
  ) t ON wh.mint_address = t.token_address
198
- WHERE wpm.buys = 0 AND (wpm.transfers > 0 OR wpm.spl_transfers > 0) AND t.total_supply > 0
 
 
199
  GROUP BY wh.mint_address, t.total_supply, t.decimals
200
  )
201
  SELECT
@@ -269,6 +266,10 @@ def _compute_quality_scores(
269
  "q_raw": [],
270
  "feature_pairs": {f[0]: [] for f in feature_defs},
271
  "raw_pairs": {m: [] for m in raw_metrics},
 
 
 
 
272
  }
273
 
274
  # Build bucket mapping
@@ -286,6 +287,18 @@ def _compute_quality_scores(
286
  # Compute percentiles per bucket + feature
287
  token_scores = []
288
  for b, items in buckets.items():
 
 
 
 
 
 
 
 
 
 
 
 
289
  # Precompute percentiles per feature
290
  feature_percentiles: Dict[str, Dict[str, float]] = {}
291
  for fname, fget, _pos in feature_defs:
@@ -515,6 +528,43 @@ def print_diagnostics(debug: dict) -> None:
515
  high_mean = sum(highs) / len(highs)
516
  print(f" {metric}: bottom_mean={low_mean:.4f} top_mean={high_mean:.4f} (n_low={len(lows)}, n_high={len(highs)})")
517
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
 
519
  def main():
520
  parser = argparse.ArgumentParser(description="Compute token quality/health score.")
 
177
  wh.mint_address AS token_address,
178
  (sum(wh.current_balance) / (t.total_supply / pow(10, t.decimals)) * 100) AS insiders_pct
179
  FROM (
180
+ SELECT
181
+ mint_address,
182
+ wallet_address,
183
+ argMax(current_balance, updated_at) AS current_balance,
184
+ argMax(history_transfer_in, updated_at) AS history_transfer_in
185
  FROM wallet_holdings
186
  GROUP BY mint_address, wallet_address
187
  ) wh
 
 
 
 
 
 
 
 
 
188
  JOIN (
189
  SELECT token_address, argMax(total_supply, updated_at) AS total_supply, argMax(decimals, updated_at) AS decimals
190
  FROM tokens
191
  GROUP BY token_address
192
  ) t ON wh.mint_address = t.token_address
193
+ -- Mint-specific "insiders": wallets that received the token via transfer-in at least once.
194
+ -- (Intentionally ignores whether they also bought; per request.)
195
+ WHERE wh.history_transfer_in > 0 AND t.total_supply > 0
196
  GROUP BY wh.mint_address, t.total_supply, t.decimals
197
  )
198
  SELECT
 
266
  "q_raw": [],
267
  "feature_pairs": {f[0]: [] for f in feature_defs},
268
  "raw_pairs": {m: [] for m in raw_metrics},
269
+ # For checking assumptions like "higher return buckets have lower bundled_pct".
270
+ # Store raw metric distributions per return bucket and (ret, metric) pairs overall.
271
+ "bucket_raw": {}, # bucket_id -> metric -> [raw vals]
272
+ "ret_pairs": {m: [] for m in raw_metrics}, # metric -> [(ret, raw_val)]
273
  }
274
 
275
  # Build bucket mapping
 
287
  # Compute percentiles per bucket + feature
288
  token_scores = []
289
  for b, items in buckets.items():
290
+ if with_debug:
291
+ debug["bucket_raw"].setdefault(b, {m: [] for m in raw_metrics})
292
+ for d in items:
293
+ ret_val = d.get("ret")
294
+ for metric in raw_metrics:
295
+ raw_val = d.get(metric)
296
+ if raw_val is None:
297
+ continue
298
+ debug["bucket_raw"][b][metric].append(raw_val)
299
+ if ret_val is not None:
300
+ debug["ret_pairs"][metric].append((ret_val, raw_val))
301
+
302
  # Precompute percentiles per feature
303
  feature_percentiles: Dict[str, Dict[str, float]] = {}
304
  for fname, fget, _pos in feature_defs:
 
528
  high_mean = sum(highs) / len(highs)
529
  print(f" {metric}: bottom_mean={low_mean:.4f} top_mean={high_mean:.4f} (n_low={len(lows)}, n_high={len(highs)})")
530
 
531
+ # Return bucket -> raw metric distributions (answers questions like "do higher-return tokens bundle less?")
532
+ bucket_raw = debug.get("bucket_raw", {})
533
+ if bucket_raw:
534
+ print("\n=== RETURN BUCKET RAW METRICS ===")
535
+ for b in sorted(bucket_raw.keys()):
536
+ print(f"\nSEGMENT: {b}. {_bucket_label(b)}")
537
+ for metric in sorted(bucket_raw[b].keys()):
538
+ vals = [v for v in bucket_raw[b][metric] if v is not None]
539
+ if not vals:
540
+ continue
541
+ stats = _summary_stats(vals)
542
+ # Also report how often the metric is > 0 (useful since many pct metrics are 0).
543
+ nz = sum(1 for v in vals if v > 0)
544
+ nz_rate = nz / len(vals)
545
+ print(
546
+ f" {metric}: mean={stats['mean']:.4f} p50={stats['p50']:.4f} "
547
+ f"p90={stats['p90']:.4f} p99={stats['p99']:.4f} nonzero_rate={nz_rate:.3f} (n={len(vals)})"
548
+ )
549
+
550
+ # Overall return-vs-metric correlation (not bucketed). Use log(ret) to reduce tail leverage.
551
+ ret_pairs = debug.get("ret_pairs", {})
552
+ if ret_pairs:
553
+ print("\n=== RETURN VS RAW METRICS (GLOBAL) ===")
554
+ for metric in sorted(ret_pairs.keys()):
555
+ pairs = ret_pairs[metric]
556
+ xs = []
557
+ ys = []
558
+ for r, v in pairs:
559
+ if r is None or r <= 0:
560
+ continue
561
+ xs.append(math.log(r))
562
+ ys.append(v)
563
+ if len(xs) < 3:
564
+ continue
565
+ corr = _pearson_corr(xs, ys)
566
+ print(f" log(ret) vs {metric}: {corr:.4f} (n={len(xs)})")
567
+
568
 
569
  def main():
570
  parser = argparse.ArgumentParser(description="Compute token quality/health score.")