quantumly commited on
Commit
985bd86
·
verified ·
1 Parent(s): 3689239

v0.7 wash filtering + recalibration: 2026-04-27

Browse files
Files changed (1) hide show
  1. v0_7_metadata.json +526 -0
v0_7_metadata.json ADDED
@@ -0,0 +1,526 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trained_at": "2026-04-27T08:32:28.319255+00:00",
3
+ "data_run_date": "2026-04-25",
4
+ "llm_run_date": "2026-04-26",
5
+ "version": "v0.7-wash-recalibration",
6
+ "description": "v0.6 features + wash-trade detection/down-weighting + dropped tail correction + conformal quantile recalibration",
7
+ "parent_version": "v0.6",
8
+ "changes_from_parent": [
9
+ "Wash-trade detection (round-trip \u22647d, sub-minute resale, wallet concentration >70%)",
10
+ "Down-weight wash rows in training (weight=0.1)",
11
+ "DROPPED tail-correction stacking (overfit on v0.6, test \u0394 R\u00b2 = -0.027)",
12
+ "Conformal quantile recalibration via val set"
13
+ ],
14
+ "embedders": [
15
+ "mpnet-finetuned (from v0.3)",
16
+ "BAAI/bge-base-en-v1.5"
17
+ ],
18
+ "splits": {
19
+ "train": {
20
+ "rows": 265240,
21
+ "start": "2022-01-28",
22
+ "end": "2023-09-30"
23
+ },
24
+ "val": {
25
+ "rows": 3545,
26
+ "start": "2023-10-01",
27
+ "end": "2023-12-31"
28
+ },
29
+ "test": {
30
+ "rows": 2744,
31
+ "start": "2024-01-01",
32
+ "end": "2024-05-04"
33
+ }
34
+ },
35
+ "feature_count": 212,
36
+ "feature_cols": [
37
+ "pca_000",
38
+ "pca_001",
39
+ "pca_002",
40
+ "pca_003",
41
+ "pca_004",
42
+ "pca_005",
43
+ "pca_006",
44
+ "pca_007",
45
+ "pca_008",
46
+ "pca_009",
47
+ "pca_010",
48
+ "pca_011",
49
+ "pca_012",
50
+ "pca_013",
51
+ "pca_014",
52
+ "pca_015",
53
+ "pca_016",
54
+ "pca_017",
55
+ "pca_018",
56
+ "pca_019",
57
+ "pca_020",
58
+ "pca_021",
59
+ "pca_022",
60
+ "pca_023",
61
+ "pca_024",
62
+ "pca_025",
63
+ "pca_026",
64
+ "pca_027",
65
+ "pca_028",
66
+ "pca_029",
67
+ "pca_030",
68
+ "pca_031",
69
+ "pca_032",
70
+ "pca_033",
71
+ "pca_034",
72
+ "pca_035",
73
+ "pca_036",
74
+ "pca_037",
75
+ "pca_038",
76
+ "pca_039",
77
+ "pca_040",
78
+ "pca_041",
79
+ "pca_042",
80
+ "pca_043",
81
+ "pca_044",
82
+ "pca_045",
83
+ "pca_046",
84
+ "pca_047",
85
+ "pca_048",
86
+ "pca_049",
87
+ "pca_050",
88
+ "pca_051",
89
+ "pca_052",
90
+ "pca_053",
91
+ "pca_054",
92
+ "pca_055",
93
+ "pca_056",
94
+ "pca_057",
95
+ "pca_058",
96
+ "pca_059",
97
+ "pca_060",
98
+ "pca_061",
99
+ "pca_062",
100
+ "pca_063",
101
+ "len",
102
+ "n_digits",
103
+ "n_letters",
104
+ "n_special",
105
+ "n_lower",
106
+ "n_upper",
107
+ "is_palindrome",
108
+ "is_all_digits",
109
+ "is_all_letters",
110
+ "is_ascii",
111
+ "has_unicode",
112
+ "starts_digit",
113
+ "ends_digit",
114
+ "max_char_run",
115
+ "n_unique_chars",
116
+ "in_wikipedia",
117
+ "in_geonames",
118
+ "in_us_firstname",
119
+ "in_iso3166",
120
+ "in_ticker",
121
+ "in_sec_edgar",
122
+ "in_wiktionary_en",
123
+ "wordlist_hits",
124
+ "club__prepunk_full_rankings",
125
+ "club__historic_figures",
126
+ "club__gamertags",
127
+ "club__firstnames_usa",
128
+ "club__top500_cities_global",
129
+ "club__familynames_usa",
130
+ "club__social_handles",
131
+ "club__country_codes",
132
+ "club__sports",
133
+ "club__crypto_terms",
134
+ "club__common_english",
135
+ "club__top500_cities_usa",
136
+ "club__top_nouns",
137
+ "club__home",
138
+ "club__conspiracy_theories",
139
+ "club__us_government",
140
+ "club__performing_arts",
141
+ "club__gamertags_double",
142
+ "club__mythical_creatures",
143
+ "club__finance_terms",
144
+ "club__catholicism",
145
+ "club__natural_wonders",
146
+ "club__fine_art",
147
+ "club__personas",
148
+ "club__luxury",
149
+ "club__pokemon_gen1",
150
+ "club__memes",
151
+ "club__top_crypto_tickers",
152
+ "club__currency_symbols",
153
+ "club__common_animals",
154
+ "club__logistics",
155
+ "club__currency_names",
156
+ "club__wikidata_top_fantasy_char",
157
+ "club__top_crypto_names",
158
+ "club__gen_alpha",
159
+ "club__pokemon_gen3",
160
+ "club__holidays",
161
+ "club__pokemon_gen2",
162
+ "club__paranormal",
163
+ "club__crayola_classic",
164
+ "club__pokemon_gen4",
165
+ "club__us_states",
166
+ "n_clubs",
167
+ "trademark_conflict",
168
+ "name_age_days",
169
+ "prior_transfer_count",
170
+ "fg_value",
171
+ "eth_tvl_usd",
172
+ "eth_stable_mcap",
173
+ "eth_dex_volume",
174
+ "nft_total_fee_usd",
175
+ "fame_score",
176
+ "crypto_relevance_ord",
177
+ "brand_collision_risk_ord",
178
+ "kind__concept",
179
+ "kind__random",
180
+ "kind__brand",
181
+ "kind__surname",
182
+ "kind__first_name",
183
+ "kind__abbreviation",
184
+ "kind__place",
185
+ "kind__other",
186
+ "kind__unknown",
187
+ "origin__english",
188
+ "origin__none",
189
+ "origin__mixed",
190
+ "origin__spanish",
191
+ "origin__german",
192
+ "origin__french",
193
+ "origin__japanese",
194
+ "origin__chinese",
195
+ "origin__italian",
196
+ "origin__slavic",
197
+ "origin__korean",
198
+ "origin__arabic",
199
+ "origin__other",
200
+ "origin__unknown",
201
+ "desc_pca_000",
202
+ "desc_pca_001",
203
+ "desc_pca_002",
204
+ "desc_pca_003",
205
+ "desc_pca_004",
206
+ "desc_pca_005",
207
+ "desc_pca_006",
208
+ "desc_pca_007",
209
+ "desc_pca_008",
210
+ "desc_pca_009",
211
+ "desc_pca_010",
212
+ "desc_pca_011",
213
+ "desc_pca_012",
214
+ "desc_pca_013",
215
+ "desc_pca_014",
216
+ "desc_pca_015",
217
+ "desc_pca_016",
218
+ "desc_pca_017",
219
+ "desc_pca_018",
220
+ "desc_pca_019",
221
+ "desc_pca_020",
222
+ "desc_pca_021",
223
+ "desc_pca_022",
224
+ "desc_pca_023",
225
+ "desc_pca_024",
226
+ "desc_pca_025",
227
+ "desc_pca_026",
228
+ "desc_pca_027",
229
+ "desc_pca_028",
230
+ "desc_pca_029",
231
+ "desc_pca_030",
232
+ "desc_pca_031",
233
+ "knnmp_count",
234
+ "knnmp_mean_log",
235
+ "knnmp_median_log",
236
+ "knnmp_p90_log",
237
+ "knnmp_max_sim",
238
+ "knnmp_min_sim",
239
+ "knnmp_log_max",
240
+ "knnmp_log_min",
241
+ "knnbg_count",
242
+ "knnbg_mean_log",
243
+ "knnbg_median_log",
244
+ "knnbg_p90_log",
245
+ "knnbg_max_sim",
246
+ "knnbg_min_sim",
247
+ "knnbg_log_max",
248
+ "knnbg_log_min"
249
+ ],
250
+ "pca_dim_concat": 64,
251
+ "pca_dim_description": 32,
252
+ "name_kind_values": [
253
+ "concept",
254
+ "random",
255
+ "brand",
256
+ "surname",
257
+ "first_name",
258
+ "abbreviation",
259
+ "place",
260
+ "other",
261
+ "unknown"
262
+ ],
263
+ "cultural_origin_values": [
264
+ "english",
265
+ "none",
266
+ "mixed",
267
+ "spanish",
268
+ "german",
269
+ "french",
270
+ "japanese",
271
+ "chinese",
272
+ "italian",
273
+ "slavic",
274
+ "korean",
275
+ "arabic",
276
+ "other",
277
+ "unknown"
278
+ ],
279
+ "wash_filtering": {
280
+ "n_round_trip_labels": 23049,
281
+ "n_sub_minute_labels": 1222,
282
+ "n_high_conc_labels": 26255,
283
+ "n_total_flagged_labels": 39995,
284
+ "wash_train_weight": 0.1,
285
+ "train_wash_pct": 41.393832001206455
286
+ },
287
+ "best_xgb_params": {
288
+ "tree_method": "hist",
289
+ "device": "cuda",
290
+ "seed": 42,
291
+ "max_depth": 11,
292
+ "learning_rate": 0.00607909965562535,
293
+ "subsample": 0.5242971482550959,
294
+ "colsample_bytree": 0.5178101703841028,
295
+ "colsample_bylevel": 0.5676418253037602,
296
+ "min_child_weight": 3,
297
+ "reg_alpha": 4.318424044857591,
298
+ "reg_lambda": 1.7796015928106095,
299
+ "gamma": 2.982764564516846
300
+ },
301
+ "optuna": {
302
+ "n_trials": 30,
303
+ "best_val_rmse": 1.016706943511963,
304
+ "best_params": {
305
+ "max_depth": 11,
306
+ "learning_rate": 0.00607909965562535,
307
+ "subsample": 0.5242971482550959,
308
+ "colsample_bytree": 0.5178101703841028,
309
+ "colsample_bylevel": 0.5676418253037602,
310
+ "min_child_weight": 3,
311
+ "reg_alpha": 4.318424044857591,
312
+ "reg_lambda": 1.7796015928106095,
313
+ "gamma": 2.982764564516846
314
+ },
315
+ "best_trial": 28,
316
+ "warm_started_with_v0_6_best": true
317
+ },
318
+ "quantile_models": {
319
+ "q05": {
320
+ "best_iteration": 1527,
321
+ "best_val_rmse": 1.9560544421029673
322
+ },
323
+ "q50": {
324
+ "best_iteration": 1834,
325
+ "best_val_rmse": 1.045858812580417
326
+ },
327
+ "q95": {
328
+ "best_iteration": 3355,
329
+ "best_val_rmse": 2.1802895661452273
330
+ }
331
+ },
332
+ "calibration": {
333
+ "method": "additive",
334
+ "target_coverage": 0.9,
335
+ "delta_lower_additive": 0.08095530420541763,
336
+ "delta_upper_additive": 0.0,
337
+ "multiplicative_factor": 1.239099800588292,
338
+ "val_coverage_after": 0.8595204513399154
339
+ },
340
+ "metrics": {
341
+ "final": {
342
+ "train": {
343
+ "r2_log": 0.7497193813323975,
344
+ "rmse_log": 0.8078381419181824,
345
+ "mae_log": 0.47278380393981934,
346
+ "median_ape": 0.23963597416877747,
347
+ "bias_log": -0.1186392605304718
348
+ },
349
+ "val": {
350
+ "r2_log": 0.6740628480911255,
351
+ "rmse_log": 1.0458588600158691,
352
+ "mae_log": 0.7022704482078552,
353
+ "median_ape": 0.413220077753067,
354
+ "bias_log": -0.09871374070644379
355
+ },
356
+ "test": {
357
+ "r2_log": 0.4626653790473938,
358
+ "rmse_log": 1.363203525543213,
359
+ "mae_log": 1.0832252502441406,
360
+ "median_ape": 0.9336060285568237,
361
+ "bias_log": 0.4047386348247528
362
+ }
363
+ },
364
+ "coverage": {
365
+ "train": {
366
+ "coverage_90pct": 0.920309908007842,
367
+ "median_interval_log": 2.7887799739837646,
368
+ "median_interval_ratio": 16.261168645984977
369
+ },
370
+ "val": {
371
+ "coverage_90pct": 0.8595204513399154,
372
+ "median_interval_log": 3.658205986022949,
373
+ "median_interval_ratio": 38.79168757902037
374
+ },
375
+ "test": {
376
+ "coverage_90pct": 0.8083090379008746,
377
+ "median_interval_log": 4.073790550231934,
378
+ "median_interval_ratio": 58.77934691322367
379
+ }
380
+ }
381
+ },
382
+ "top_features": [
383
+ {
384
+ "name": "len",
385
+ "gain": 58.87883758544922
386
+ },
387
+ {
388
+ "name": "knnmp_mean_log",
389
+ "gain": 46.72057342529297
390
+ },
391
+ {
392
+ "name": "knnmp_median_log",
393
+ "gain": 41.08598327636719
394
+ },
395
+ {
396
+ "name": "is_all_digits",
397
+ "gain": 38.29785919189453
398
+ },
399
+ {
400
+ "name": "knnmp_count",
401
+ "gain": 34.5153923034668
402
+ },
403
+ {
404
+ "name": "knnmp_p90_log",
405
+ "gain": 31.491409301757812
406
+ },
407
+ {
408
+ "name": "in_wikipedia",
409
+ "gain": 30.60625457763672
410
+ },
411
+ {
412
+ "name": "knnmp_log_max",
413
+ "gain": 24.936614990234375
414
+ },
415
+ {
416
+ "name": "is_ascii",
417
+ "gain": 24.386781692504883
418
+ },
419
+ {
420
+ "name": "ends_digit",
421
+ "gain": 24.157560348510742
422
+ },
423
+ {
424
+ "name": "n_digits",
425
+ "gain": 24.11026382446289
426
+ },
427
+ {
428
+ "name": "has_unicode",
429
+ "gain": 23.687318801879883
430
+ },
431
+ {
432
+ "name": "name_age_days",
433
+ "gain": 22.8796443939209
434
+ },
435
+ {
436
+ "name": "origin__none",
437
+ "gain": 22.28803062438965
438
+ },
439
+ {
440
+ "name": "eth_stable_mcap",
441
+ "gain": 22.02800941467285
442
+ },
443
+ {
444
+ "name": "pca_002",
445
+ "gain": 21.55613136291504
446
+ },
447
+ {
448
+ "name": "knnbg_count",
449
+ "gain": 20.121904373168945
450
+ },
451
+ {
452
+ "name": "knnmp_log_min",
453
+ "gain": 19.98278045654297
454
+ },
455
+ {
456
+ "name": "n_unique_chars",
457
+ "gain": 19.560546875
458
+ },
459
+ {
460
+ "name": "kind__random",
461
+ "gain": 19.375856399536133
462
+ },
463
+ {
464
+ "name": "starts_digit",
465
+ "gain": 18.9150390625
466
+ },
467
+ {
468
+ "name": "origin__chinese",
469
+ "gain": 18.488323211669922
470
+ },
471
+ {
472
+ "name": "eth_tvl_usd",
473
+ "gain": 18.1763858795166
474
+ },
475
+ {
476
+ "name": "n_lower",
477
+ "gain": 17.70460319519043
478
+ },
479
+ {
480
+ "name": "n_letters",
481
+ "gain": 17.535446166992188
482
+ },
483
+ {
484
+ "name": "pca_004",
485
+ "gain": 16.79183578491211
486
+ },
487
+ {
488
+ "name": "kind__abbreviation",
489
+ "gain": 16.659326553344727
490
+ },
491
+ {
492
+ "name": "desc_pca_000",
493
+ "gain": 16.628366470336914
494
+ },
495
+ {
496
+ "name": "is_palindrome",
497
+ "gain": 16.464502334594727
498
+ },
499
+ {
500
+ "name": "fg_value",
501
+ "gain": 15.839948654174805
502
+ }
503
+ ],
504
+ "family_gain_split": {
505
+ "mpnet_ft_knn": 217.51888847351074,
506
+ "bge_knn": 67.69801044464111,
507
+ "llm_kind": 68.03838443756104,
508
+ "llm_origin": 118.48860311508179,
509
+ "llm_scores": 37.4162712097168,
510
+ "llm_desc": 208.6804609298706,
511
+ "llm_total": 432.6237196922302
512
+ },
513
+ "inference_recipe": {
514
+ "description": "Inference uses 3 models + calibration constants",
515
+ "point_estimate": "final_log = q50_model(features)",
516
+ "uncalibrated_low": "low_log_raw = q05_model(features)",
517
+ "uncalibrated_high": "high_log_raw = q95_model(features)",
518
+ "calibrated_low_additive": "low_log = low_log_raw - delta_lower_additive",
519
+ "calibrated_high_additive": "high_log = high_log_raw + delta_upper_additive",
520
+ "calibrated_low_multiplicative": "low_log = q50 - multiplicative_factor * (q50 - low_log_raw)",
521
+ "calibrated_high_multiplicative": "high_log = q50 + multiplicative_factor * (high_log_raw - q50)",
522
+ "use_method": "additive",
523
+ "output_usd": "np.exp(final_log)"
524
+ },
525
+ "wandb_run": "https://wandb.ai/quantumly-aletheia-research/ens-appraiser/runs/xd2jhbwk"
526
+ }