Upload folder using huggingface_hub
Browse files
- QUALITY_SCORE_ARCHITECTURE.md +5 -4
- log.log +2 -2
- models/vocabulary.py +7 -10
- scripts/analyze_distribution.py +7 -9
- scripts/compute_quality_score.py +61 -11
QUALITY_SCORE_ARCHITECTURE.md
CHANGED
|
@@ -37,10 +37,11 @@ Let `R_max` be the token's lifetime max return multiple (e.g., ATH / launch).
|
|
| 37 |
|
| 38 |
Use coarse buckets for the bulk and finer buckets for the tail, e.g.:
|
| 39 |
- B0: `R_max < 3`
|
| 40 |
-
- B1: `3 <= R_max < 10`
|
| 41 |
-
- B2: `10 <= R_max < 20`
|
| 42 |
-
- B3: `20 <= R_max < 100`
|
| 43 |
-
- B4: `100 <= R_max < 10_000`
|
|
|
|
| 44 |
|
| 45 |
Notes:
|
| 46 |
- If a bucket has too few samples, merge with a neighbor.
|
|
|
|
| 37 |
|
| 38 |
Use coarse buckets for the bulk and finer buckets for the tail, e.g.:
|
| 39 |
- B0: `R_max < 3`
|
| 40 |
+
- B1: `3 <= R_max < 5`
|
| 41 |
+
- B2: `5 <= R_max < 10`
|
| 42 |
+
- B3: `10 <= R_max < 20`
|
| 43 |
+
- B4: `20 <= R_max < 100`
|
| 44 |
+
- B5: `100 <= R_max < 10_000`
|
| 45 |
|
| 46 |
Notes:
|
| 47 |
- If a bucket has too few samples, merge with a neighbor.
|
log.log
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bbb4125009f74179d4f414c0145accbc2cbf3558be6b4d94a850241dd56aaab2
|
| 3 |
+
size 5084
|
models/vocabulary.py
CHANGED
|
@@ -187,13 +187,10 @@ EXCHANGES_TO_ID = {name: i for i, name in enumerate(EXCHANGES)}
|
|
| 187 |
ID_TO_EXCHANGES = {i: name for i, name in enumerate(EXCHANGES)}
|
| 188 |
NUM_EXCHANGES = len(EXCHANGES)
|
| 189 |
|
| 190 |
-
#
|
| 191 |
-
#
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
#
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
# Class 5: Manipulated (High return but suspicious metrics)
|
| 199 |
-
MANIPULATED_CLASS_ID = 5
|
|
|
|
| 187 |
ID_TO_EXCHANGES = {i: name for i, name in enumerate(EXCHANGES)}
|
| 188 |
NUM_EXCHANGES = len(EXCHANGES)
|
| 189 |
|
| 190 |
+
# Return buckets used across analysis/scoring scripts.
|
| 191 |
+
# Split 3x-10x into 3x-5x and 5x-10x to reduce within-bucket heterogeneity.
|
| 192 |
+
RETURN_THRESHOLDS = [0, 3, 5, 10, 20, 100, 10000]
|
| 193 |
+
NUM_RETURN_CLASSES = len(RETURN_THRESHOLDS) - 1
|
| 194 |
+
|
| 195 |
+
# Manipulated (high return but suspicious metrics). Its class ID must always come right after the last return bucket, so it is derived from NUM_RETURN_CLASSES.
|
| 196 |
+
MANIPULATED_CLASS_ID = NUM_RETURN_CLASSES
|
|
|
|
|
|
|
|
|
scripts/analyze_distribution.py
CHANGED
|
@@ -224,15 +224,13 @@ def analyze():
|
|
| 224 |
segments_tokens[c] = []
|
| 225 |
segments_tokens[c].append(t)
|
| 226 |
|
| 227 |
-
# Define Labels
|
| 228 |
-
labels = {
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
MANIPULATED_CLASS_ID: "5. MANIPULATED (Fake Metrics)"
|
| 235 |
-
}
|
| 236 |
|
| 237 |
# Common SQL parts
|
| 238 |
# We need a robust base for the WHERE clause variables (fees, vol, holders)
|
|
|
|
| 224 |
segments_tokens[c] = []
|
| 225 |
segments_tokens[c].append(t)
|
| 226 |
|
| 227 |
+
# Define Labels from thresholds so bucket changes don't silently desync output.
|
| 228 |
+
labels = {}
|
| 229 |
+
for i in range(len(thresholds) - 1):
|
| 230 |
+
lo = thresholds[i]
|
| 231 |
+
hi = thresholds[i + 1]
|
| 232 |
+
labels[i] = f"{i}. {lo}x - {hi}x"
|
| 233 |
+
labels[MANIPULATED_CLASS_ID] = f"{MANIPULATED_CLASS_ID}. MANIPULATED (Fake Metrics)"
|
|
|
|
|
|
|
| 234 |
|
| 235 |
# Common SQL parts
|
| 236 |
# We need a robust base for the WHERE clause variables (fees, vol, holders)
|
scripts/compute_quality_score.py
CHANGED
|
@@ -177,25 +177,22 @@ def fetch_token_metrics(client) -> List[dict]:
|
|
| 177 |
wh.mint_address AS token_address,
|
| 178 |
(sum(wh.current_balance) / (t.total_supply / pow(10, t.decimals)) * 100) AS insiders_pct
|
| 179 |
FROM (
|
| 180 |
-
SELECT
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
FROM wallet_holdings
|
| 182 |
GROUP BY mint_address, wallet_address
|
| 183 |
) wh
|
| 184 |
-
JOIN (
|
| 185 |
-
SELECT
|
| 186 |
-
wallet_address,
|
| 187 |
-
argMax(total_buys_count, updated_at) AS buys,
|
| 188 |
-
argMax(transfers_in_count, updated_at) AS transfers,
|
| 189 |
-
argMax(spl_transfers_in_count, updated_at) AS spl_transfers
|
| 190 |
-
FROM wallet_profile_metrics
|
| 191 |
-
GROUP BY wallet_address
|
| 192 |
-
) wpm ON wh.wallet_address = wpm.wallet_address
|
| 193 |
JOIN (
|
| 194 |
SELECT token_address, argMax(total_supply, updated_at) AS total_supply, argMax(decimals, updated_at) AS decimals
|
| 195 |
FROM tokens
|
| 196 |
GROUP BY token_address
|
| 197 |
) t ON wh.mint_address = t.token_address
|
| 198 |
-
|
|
|
|
|
|
|
| 199 |
GROUP BY wh.mint_address, t.total_supply, t.decimals
|
| 200 |
)
|
| 201 |
SELECT
|
|
@@ -269,6 +266,10 @@ def _compute_quality_scores(
|
|
| 269 |
"q_raw": [],
|
| 270 |
"feature_pairs": {f[0]: [] for f in feature_defs},
|
| 271 |
"raw_pairs": {m: [] for m in raw_metrics},
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
}
|
| 273 |
|
| 274 |
# Build bucket mapping
|
|
@@ -286,6 +287,18 @@ def _compute_quality_scores(
|
|
| 286 |
# Compute percentiles per bucket + feature
|
| 287 |
token_scores = []
|
| 288 |
for b, items in buckets.items():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
# Precompute percentiles per feature
|
| 290 |
feature_percentiles: Dict[str, Dict[str, float]] = {}
|
| 291 |
for fname, fget, _pos in feature_defs:
|
|
@@ -515,6 +528,43 @@ def print_diagnostics(debug: dict) -> None:
|
|
| 515 |
high_mean = sum(highs) / len(highs)
|
| 516 |
print(f" {metric}: bottom_mean={low_mean:.4f} top_mean={high_mean:.4f} (n_low={len(lows)}, n_high={len(highs)})")
|
| 517 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 518 |
|
| 519 |
def main():
|
| 520 |
parser = argparse.ArgumentParser(description="Compute token quality/health score.")
|
|
|
|
| 177 |
wh.mint_address AS token_address,
|
| 178 |
(sum(wh.current_balance) / (t.total_supply / pow(10, t.decimals)) * 100) AS insiders_pct
|
| 179 |
FROM (
|
| 180 |
+
SELECT
|
| 181 |
+
mint_address,
|
| 182 |
+
wallet_address,
|
| 183 |
+
argMax(current_balance, updated_at) AS current_balance,
|
| 184 |
+
argMax(history_transfer_in, updated_at) AS history_transfer_in
|
| 185 |
FROM wallet_holdings
|
| 186 |
GROUP BY mint_address, wallet_address
|
| 187 |
) wh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
JOIN (
|
| 189 |
SELECT token_address, argMax(total_supply, updated_at) AS total_supply, argMax(decimals, updated_at) AS decimals
|
| 190 |
FROM tokens
|
| 191 |
GROUP BY token_address
|
| 192 |
) t ON wh.mint_address = t.token_address
|
| 193 |
+
-- Mint-specific "insiders": wallets that received the token via transfer-in at least once.
|
| 194 |
+
-- (Intentionally ignores whether they also bought; per request.)
|
| 195 |
+
WHERE wh.history_transfer_in > 0 AND t.total_supply > 0
|
| 196 |
GROUP BY wh.mint_address, t.total_supply, t.decimals
|
| 197 |
)
|
| 198 |
SELECT
|
|
|
|
| 266 |
"q_raw": [],
|
| 267 |
"feature_pairs": {f[0]: [] for f in feature_defs},
|
| 268 |
"raw_pairs": {m: [] for m in raw_metrics},
|
| 269 |
+
# For checking assumptions like "higher return buckets have lower bundled_pct".
|
| 270 |
+
# Store raw metric distributions per return bucket and (ret, metric) pairs overall.
|
| 271 |
+
"bucket_raw": {}, # bucket_id -> metric -> [raw vals]
|
| 272 |
+
"ret_pairs": {m: [] for m in raw_metrics}, # metric -> [(ret, raw_val)]
|
| 273 |
}
|
| 274 |
|
| 275 |
# Build bucket mapping
|
|
|
|
| 287 |
# Compute percentiles per bucket + feature
|
| 288 |
token_scores = []
|
| 289 |
for b, items in buckets.items():
|
| 290 |
+
if with_debug:
|
| 291 |
+
debug["bucket_raw"].setdefault(b, {m: [] for m in raw_metrics})
|
| 292 |
+
for d in items:
|
| 293 |
+
ret_val = d.get("ret")
|
| 294 |
+
for metric in raw_metrics:
|
| 295 |
+
raw_val = d.get(metric)
|
| 296 |
+
if raw_val is None:
|
| 297 |
+
continue
|
| 298 |
+
debug["bucket_raw"][b][metric].append(raw_val)
|
| 299 |
+
if ret_val is not None:
|
| 300 |
+
debug["ret_pairs"][metric].append((ret_val, raw_val))
|
| 301 |
+
|
| 302 |
# Precompute percentiles per feature
|
| 303 |
feature_percentiles: Dict[str, Dict[str, float]] = {}
|
| 304 |
for fname, fget, _pos in feature_defs:
|
|
|
|
| 528 |
high_mean = sum(highs) / len(highs)
|
| 529 |
print(f" {metric}: bottom_mean={low_mean:.4f} top_mean={high_mean:.4f} (n_low={len(lows)}, n_high={len(highs)})")
|
| 530 |
|
| 531 |
+
# Return bucket -> raw metric distributions (answers questions like "do higher-return tokens bundle less?")
|
| 532 |
+
bucket_raw = debug.get("bucket_raw", {})
|
| 533 |
+
if bucket_raw:
|
| 534 |
+
print("\n=== RETURN BUCKET RAW METRICS ===")
|
| 535 |
+
for b in sorted(bucket_raw.keys()):
|
| 536 |
+
print(f"\nSEGMENT: {b}. {_bucket_label(b)}")
|
| 537 |
+
for metric in sorted(bucket_raw[b].keys()):
|
| 538 |
+
vals = [v for v in bucket_raw[b][metric] if v is not None]
|
| 539 |
+
if not vals:
|
| 540 |
+
continue
|
| 541 |
+
stats = _summary_stats(vals)
|
| 542 |
+
# Also report how often the metric is > 0 (useful since many pct metrics are 0).
|
| 543 |
+
nz = sum(1 for v in vals if v > 0)
|
| 544 |
+
nz_rate = nz / len(vals)
|
| 545 |
+
print(
|
| 546 |
+
f" {metric}: mean={stats['mean']:.4f} p50={stats['p50']:.4f} "
|
| 547 |
+
f"p90={stats['p90']:.4f} p99={stats['p99']:.4f} nonzero_rate={nz_rate:.3f} (n={len(vals)})"
|
| 548 |
+
)
|
| 549 |
+
|
| 550 |
+
# Overall return-vs-metric correlation (not bucketed). Use log(ret) to reduce tail leverage.
|
| 551 |
+
ret_pairs = debug.get("ret_pairs", {})
|
| 552 |
+
if ret_pairs:
|
| 553 |
+
print("\n=== RETURN VS RAW METRICS (GLOBAL) ===")
|
| 554 |
+
for metric in sorted(ret_pairs.keys()):
|
| 555 |
+
pairs = ret_pairs[metric]
|
| 556 |
+
xs = []
|
| 557 |
+
ys = []
|
| 558 |
+
for r, v in pairs:
|
| 559 |
+
if r is None or r <= 0:
|
| 560 |
+
continue
|
| 561 |
+
xs.append(math.log(r))
|
| 562 |
+
ys.append(v)
|
| 563 |
+
if len(xs) < 3:
|
| 564 |
+
continue
|
| 565 |
+
corr = _pearson_corr(xs, ys)
|
| 566 |
+
print(f" log(ret) vs {metric}: {corr:.4f} (n={len(xs)})")
|
| 567 |
+
|
| 568 |
|
| 569 |
def main():
|
| 570 |
parser = argparse.ArgumentParser(description="Compute token quality/health score.")
|