Ciroc0 commited on
Commit
6d44284
·
verified ·
1 Parent(s): 18efa2c

Update arca-processor

Browse files
Files changed (2) hide show
  1. README.md +18 -10
  2. app.py +197 -88
README.md CHANGED
@@ -16,18 +16,26 @@ Pre-computed champion stats generator for ArcaThread.
16
 
17
  This space processes matchup-matrix parquet files from `arca-thread-priors` dataset and generates lightweight JSON files per champion.
18
 
19
- ## Output Structure
20
-
21
- ```
22
- champ-stats/{patch}/{championName}.json
23
- champ-stats/{patch}/tier-list.json
24
- ```
 
25
 
26
  ## Schedule
27
 
28
  Runs hourly to detect new patches and update stats.
29
 
30
- ## Environment Variables
31
-
32
- - `HF_TOKEN` - HuggingFace API token
33
- - `DATASET_REPO` - Source dataset (default: ArcaThread/arca-thread-priors)
 
 
 
 
 
 
 
 
16
 
17
  This space processes matchup-matrix parquet files from the `arca-thread-priors` dataset and generates lightweight JSON files per champion.
18
 
19
+ ## Output Structure
20
+
21
+ ```
22
+ champ-stats/{patch}/{championId}.json
23
+ champ-stats/{patch}/tier-list.json
24
+ champ-stats/{patch}/meta.json
25
+ ```
26
 
27
  ## Schedule
28
 
29
  Runs hourly to detect new patches and update stats.
30
 
31
+ ## Environment Variables
32
+
33
+ - `HF_TOKEN` - HuggingFace API token
34
+ - `DATASET_REPO` - Source dataset (default: ArcaThread/arca-thread-priors)
35
+ - `PROCESS_INTERVAL_SECONDS` - Processing interval in seconds (default: 3600, min 60)
36
+ - `MIN_SAMPLE_SIZE` - Minimum sample size for champion aggregation (default: 100)
37
+ - `DATASET_FILE_CACHE_SECONDS` - TTL for cached `list_repo_files` index (default: 300, min 30)
38
+ - `TIER_MIN_GAMES` - Minimum games for tier-list eligibility (default: 500)
39
+ - `TIER_CALIBRATION_MODE` - `quantile` (default) or `static`
40
+ - `TIER_STATIC_S_MIN_WR`, `TIER_STATIC_A_MIN_WR`, `TIER_STATIC_B_MIN_WR`, `TIER_STATIC_C_MIN_WR`
41
+ - Used only when `TIER_CALIBRATION_MODE=static`
app.py CHANGED
@@ -6,9 +6,8 @@ ArcaThread Processor v1.0
6
  - Creates champ-stats/{patch}/{champion}.json files
7
  """
8
 
9
- import os
10
- import sys
11
- import json
12
  import time
13
  import re
14
  import threading
@@ -27,15 +26,22 @@ from hf_client import get_hf_api, get_hf_config
27
  HF_CFG = get_hf_config()
28
  HF_TOKEN = HF_CFG.token
29
  DATASET_REPO = HF_CFG.dataset_repo
30
- PROCESS_INTERVAL_SECONDS = max(60, int(os.environ.get("PROCESS_INTERVAL_SECONDS", "3600")))
31
- MIN_SAMPLE_SIZE = int(os.environ.get("MIN_SAMPLE_SIZE", "100"))
32
-
33
- RANKS = [
34
- "IRON", "BRONZE", "SILVER", "GOLD", "PLATINUM",
35
- "EMERALD", "DIAMOND", "MASTER", "GRANDMASTER", "CHALLENGER"
36
- ]
37
-
38
- ROLE_MAPPING = {'TOP': 0, 'JUNGLE': 1, 'MIDDLE': 2, 'BOTTOM': 3, 'SUPPORT': 4, 'UNKNOWN': 5}
 
 
 
 
 
 
 
39
 
40
  # Global state
41
  is_running = True
@@ -48,7 +54,12 @@ stats = {
48
  "last_processing_per_patch": {},
49
  "processing_history": []
50
  }
51
- state_lock = threading.Lock()
 
 
 
 
 
52
 
53
  app = FastAPI(title="ArcaThread Processor v1.0")
54
 
@@ -61,27 +72,61 @@ def log(msg: str):
61
  print(f"[{timestamp}] {msg}", flush=True)
62
 
63
 
64
- def _normalize_patch_token(value: str) -> Optional[str]:
65
  """Extract major.minor from patch string"""
66
  text = str(value or "").strip()
67
  match = re.match(r"^(\d+)\.(\d+)", text)
68
  if not match:
69
  return None
70
- return f"{match.group(1)}.{match.group(2)}"
71
-
72
-
73
- def _extract_champion_name(champion_id: int) -> str:
74
- """Convert champion ID to name (placeholder - will use ID as key)"""
75
- return str(champion_id)
76
-
77
-
78
- def load_matchup_data_for_patch(patch: str) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  """Load all matchup data for a specific patch across all ranks"""
80
  log(f"Loading matchup data for patch {patch}...")
81
 
82
  try:
83
- api = get_hf_api()
84
- all_files = list_repo_files(DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
85
 
86
  # Filter for this patch's matchup files
87
  patch_files = [
@@ -127,11 +172,10 @@ def load_matchup_data_for_patch(patch: str) -> pd.DataFrame:
127
  return pd.DataFrame()
128
 
129
 
130
- def get_latest_patches(n: int = 3) -> List[str]:
131
  """Get the n latest patches from the dataset"""
132
  try:
133
- api = get_hf_api()
134
- all_files = list_repo_files(DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
135
 
136
  patches = set()
137
  for f in all_files:
@@ -249,38 +293,80 @@ def compute_champion_stats(df: pd.DataFrame) -> Dict[str, Dict[str, Any]]:
249
  return result
250
 
251
 
252
- def generate_tier_list(stats_by_champion: Dict[str, Dict], min_games: int = 500) -> List[Dict]:
253
- """Generate tier list from champion stats"""
254
- tiers = []
255
-
256
- for champ_id, data in stats_by_champion.items():
257
- if data["total_games"] < min_games:
258
- continue
259
-
260
- win_rate = data["win_rate"]
261
-
262
- # Determine tier based on win rate
263
- if win_rate >= 0.54:
264
- tier = "S"
265
- elif win_rate >= 0.52:
266
- tier = "A"
267
- elif win_rate >= 0.50:
268
- tier = "B"
269
- elif win_rate >= 0.48:
270
- tier = "C"
271
- else:
272
- tier = "D"
273
-
274
- tiers.append({
275
- "champion_id": data["champion_id"],
276
- "tier": tier,
277
- "win_rate": win_rate,
278
- "games": data["total_games"],
279
- })
280
-
281
- # Sort by win rate descending
282
- tiers.sort(key=lambda x: x["win_rate"], reverse=True)
283
- return tiers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
 
286
  def build_upload_operation(local_path: str, repo_path: str) -> Optional[CommitOperationAdd]:
@@ -326,7 +412,7 @@ def upload_operations(operations: List[CommitOperationAdd], commit_message: str)
326
  return False
327
 
328
 
329
- def process_patch(patch: str) -> int:
330
  """Process a single patch and generate champion stats"""
331
  log(f"=" * 60)
332
  log(f"Processing patch: {patch}")
@@ -346,9 +432,28 @@ def process_patch(patch: str) -> int:
346
  log("No champions met the minimum sample size requirement")
347
  return 0
348
 
349
- # Generate tier list
350
- tier_list = generate_tier_list(champion_stats)
351
- log(f"Generated tier list with {len(tier_list)} champions")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
  # Save files locally
354
  temp_dir = f"/tmp/champ-stats/{patch}"
@@ -368,13 +473,14 @@ def process_patch(patch: str) -> int:
368
 
369
  # Save tier list
370
  tier_list_path = f"{temp_dir}/tier-list.json"
371
- with open(tier_list_path, 'w') as f:
372
- json.dump({
373
- "patch": patch,
374
- "generated_at": datetime.now().isoformat(),
375
- "total_champions": len(tier_list),
376
- "tiers": tier_list,
377
- }, f, indent=2)
 
378
 
379
  tier_op = build_upload_operation(tier_list_path, f"champ-stats/{patch}/tier-list.json")
380
  if tier_op:
@@ -382,14 +488,14 @@ def process_patch(patch: str) -> int:
382
 
383
  # Save patch metadata
384
  meta_path = f"{temp_dir}/meta.json"
385
- with open(meta_path, 'w') as f:
386
- json.dump({
387
- "patch": patch,
388
- "generated_at": datetime.now().isoformat(),
389
- "champions_count": len(champion_stats),
390
- "total_games": int(df['sample_size'].sum()) if 'sample_size' in df.columns else 0,
391
- "min_sample_size": MIN_SAMPLE_SIZE,
392
- }, f, indent=2)
393
 
394
  meta_op = build_upload_operation(meta_path, f"champ-stats/{patch}/meta.json")
395
  if meta_op:
@@ -406,13 +512,14 @@ def process_patch(patch: str) -> int:
406
  return 0
407
 
408
 
409
- def run_processing_cycle():
410
  """Run a complete processing cycle for latest patches"""
411
  global stats, last_processing
412
 
413
- log("=" * 60)
414
- log("STARTING PROCESSING CYCLE")
415
- log("=" * 60)
 
416
 
417
  # Get latest patches
418
  patches = get_latest_patches(n=3)
@@ -521,11 +628,13 @@ def health():
521
  "champions_processed": stats["champions_processed"],
522
  "patches_processed": stats["patches_processed"],
523
  },
524
- "config": {
525
- "process_interval_seconds": PROCESS_INTERVAL_SECONDS,
526
- "min_sample_size": MIN_SAMPLE_SIZE,
527
- }
528
- }
 
 
529
 
530
 
531
  @app.get("/trigger")
 
6
  - Creates champ-stats/{patch}/{champion}.json files
7
  """
8
 
9
+ import os
10
+ import json
 
11
  import time
12
  import re
13
  import threading
 
26
  HF_CFG = get_hf_config()
27
  HF_TOKEN = HF_CFG.token
28
  DATASET_REPO = HF_CFG.dataset_repo
29
+ PROCESS_INTERVAL_SECONDS = max(60, int(os.environ.get("PROCESS_INTERVAL_SECONDS", "3600")))
30
+ MIN_SAMPLE_SIZE = int(os.environ.get("MIN_SAMPLE_SIZE", "100"))
31
+ DATASET_FILE_CACHE_SECONDS = max(30, int(os.environ.get("DATASET_FILE_CACHE_SECONDS", "300")))
32
+ TIER_MIN_GAMES = max(1, int(os.environ.get("TIER_MIN_GAMES", "500")))
33
+ TIER_CALIBRATION_MODE = str(os.environ.get("TIER_CALIBRATION_MODE", "quantile")).strip().lower()
34
+ TIER_STATIC_THRESHOLDS = (
35
+ float(os.environ.get("TIER_STATIC_S_MIN_WR", "0.54")),
36
+ float(os.environ.get("TIER_STATIC_A_MIN_WR", "0.52")),
37
+ float(os.environ.get("TIER_STATIC_B_MIN_WR", "0.50")),
38
+ float(os.environ.get("TIER_STATIC_C_MIN_WR", "0.48")),
39
+ )
40
+
41
+ RANKS = [
42
+ "IRON", "BRONZE", "SILVER", "GOLD", "PLATINUM",
43
+ "EMERALD", "DIAMOND", "MASTER", "GRANDMASTER", "CHALLENGER"
44
+ ]
45
 
46
  # Global state
47
  is_running = True
 
54
  "last_processing_per_patch": {},
55
  "processing_history": []
56
  }
57
+ state_lock = threading.Lock()
58
+ dataset_file_cache_lock = threading.Lock()
59
+ dataset_file_cache = {
60
+ "timestamp": 0.0,
61
+ "files": [],
62
+ }
63
 
64
  app = FastAPI(title="ArcaThread Processor v1.0")
65
 
 
72
  print(f"[{timestamp}] {msg}", flush=True)
73
 
74
 
75
+ def _normalize_patch_token(value: str) -> Optional[str]:
76
  """Extract major.minor from patch string"""
77
  text = str(value or "").strip()
78
  match = re.match(r"^(\d+)\.(\d+)", text)
79
  if not match:
80
  return None
81
+ return f"{match.group(1)}.{match.group(2)}"
82
+
83
+
84
def list_dataset_files(force_refresh: bool = False) -> List[str]:
    """Return the dataset's file listing, serving from a short-lived cache.

    A cached listing is reused while it is younger than
    DATASET_FILE_CACHE_SECONDS and non-empty; pass force_refresh=True to
    bypass the cache and re-query the Hub.
    """
    checked_at = time.time()
    with dataset_file_cache_lock:
        known_files = dataset_file_cache.get("files", [])
        stamped_at = float(dataset_file_cache.get("timestamp", 0.0) or 0.0)
        still_fresh = (checked_at - stamped_at) < DATASET_FILE_CACHE_SECONDS
        if known_files and still_fresh and not force_refresh:
            # Hand back a copy so callers cannot mutate the cached list.
            return list(known_files)

    # Cache miss or forced refresh: query the Hub outside the lock, then
    # store a defensive copy along with the observation time.
    fresh_files = list_repo_files(DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
    with dataset_file_cache_lock:
        dataset_file_cache["files"] = list(fresh_files)
        dataset_file_cache["timestamp"] = checked_at
    return fresh_files
102
+
103
+
104
def load_existing_patch_meta(patch: str) -> Optional[Dict[str, Any]]:
    """Fetch a patch's previously-published meta.json, or None if unavailable.

    Best-effort by design: any failure (file missing on the Hub, network
    error, unparsable JSON, non-dict payload) yields None rather than
    raising, so callers can treat "no meta" and "couldn't read meta" alike.
    """
    repo_file = f"champ-stats/{patch}/meta.json"
    try:
        downloaded = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=repo_file,
            repo_type="dataset",
            token=HF_TOKEN,
            local_dir="/tmp",
        )
        with open(downloaded, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
    except Exception:
        return None
    return parsed if isinstance(parsed, dict) else None
122
+
123
+
124
+ def load_matchup_data_for_patch(patch: str) -> pd.DataFrame:
125
  """Load all matchup data for a specific patch across all ranks"""
126
  log(f"Loading matchup data for patch {patch}...")
127
 
128
  try:
129
+ all_files = list_dataset_files()
 
130
 
131
  # Filter for this patch's matchup files
132
  patch_files = [
 
172
  return pd.DataFrame()
173
 
174
 
175
+ def get_latest_patches(n: int = 3) -> List[str]:
176
  """Get the n latest patches from the dataset"""
177
  try:
178
+ all_files = list_dataset_files()
 
179
 
180
  patches = set()
181
  for f in all_files:
 
293
  return result
294
 
295
 
296
def _resolve_tier_thresholds(win_rates: List[float]) -> tuple:
    """Pick the (S, A, B, C) minimum win-rate cutoffs plus the mode used.

    quantile mode (requires at least 10 samples): cutoffs adapt to the
    current patch's win-rate distribution, so the top 20% land in S, the
    next 20% in A, and so on. Any other case falls back to the static,
    env-configured cutoffs. Returns (s_min, a_min, b_min, c_min, mode).
    """
    adaptive = TIER_CALIBRATION_MODE == "quantile" and len(win_rates) >= 10
    if adaptive:
        cuts = np.quantile(np.asarray(win_rates, dtype=np.float32), [0.8, 0.6, 0.4, 0.2])
        return tuple(float(c) for c in cuts) + ("quantile",)
    return tuple(float(t) for t in TIER_STATIC_THRESHOLDS) + ("static",)
308
+
309
+
310
+ def _assign_tier(win_rate: float, thresholds: tuple) -> str:
311
+ s_min, a_min, b_min, c_min = thresholds
312
+ if win_rate >= s_min:
313
+ return "S"
314
+ if win_rate >= a_min:
315
+ return "A"
316
+ if win_rate >= b_min:
317
+ return "B"
318
+ if win_rate >= c_min:
319
+ return "C"
320
+ return "D"
321
+
322
+
323
def generate_tier_list(
    stats_by_champion: Dict[str, Dict],
    min_games: Optional[int] = None
) -> tuple[List[Dict], Dict[str, Any]]:
    """Build a win-rate-sorted tier list plus the calibration metadata used.

    Champions with fewer than the games floor (min_games, defaulting to
    TIER_MIN_GAMES) are excluded entirely. When no champion qualifies,
    returns ([], calibration) with mode "none" so consumers can tell why
    the list is empty.
    """
    games_floor = max(1, int(TIER_MIN_GAMES if min_games is None else min_games))
    eligible = [
        entry for entry in stats_by_champion.values()
        if int(entry.get("total_games", 0) or 0) >= games_floor
    ]
    if not eligible:
        return [], {
            "mode": "none",
            "min_games": games_floor,
            "thresholds": {"S": None, "A": None, "B": None, "C": None},
            "eligible_champions": 0,
        }

    # Cutoffs are calibrated from the eligible champions' win rates.
    rates = [float(entry.get("win_rate", 0.5) or 0.5) for entry in eligible]
    s_min, a_min, b_min, c_min, used_mode = _resolve_tier_thresholds(rates)
    cutoffs = (s_min, a_min, b_min, c_min)

    ranked = [
        {
            "champion_id": int(entry.get("champion_id", 0) or 0),
            "tier": _assign_tier(float(entry.get("win_rate", 0.5) or 0.5), cutoffs),
            "win_rate": float(entry.get("win_rate", 0.5) or 0.5),
            "games": int(entry.get("total_games", 0) or 0),
        }
        for entry in eligible
    ]
    ranked.sort(key=lambda row: row["win_rate"], reverse=True)

    calibration = {
        "mode": used_mode,
        "min_games": games_floor,
        "thresholds": {
            "S": round(s_min, 4),
            "A": round(a_min, 4),
            "B": round(b_min, 4),
            "C": round(c_min, 4),
        },
        "eligible_champions": len(eligible),
    }
    return ranked, calibration
370
 
371
 
372
  def build_upload_operation(local_path: str, repo_path: str) -> Optional[CommitOperationAdd]:
 
412
  return False
413
 
414
 
415
+ def process_patch(patch: str) -> int:
416
  """Process a single patch and generate champion stats"""
417
  log(f"=" * 60)
418
  log(f"Processing patch: {patch}")
 
432
  log("No champions met the minimum sample size requirement")
433
  return 0
434
 
435
+ # Generate tier list
436
+ tier_list, tier_calibration = generate_tier_list(champion_stats)
437
+ log(f"Generated tier list with {len(tier_list)} champions")
438
+
439
+ total_games = int(df['sample_size'].sum()) if 'sample_size' in df.columns else 0
440
+ meta_core = {
441
+ "patch": patch,
442
+ "champions_count": len(champion_stats),
443
+ "total_games": total_games,
444
+ "min_sample_size": MIN_SAMPLE_SIZE,
445
+ }
446
+ existing_meta = load_existing_patch_meta(patch)
447
+ if existing_meta:
448
+ existing_core = {
449
+ "patch": str(existing_meta.get("patch", "")),
450
+ "champions_count": int(existing_meta.get("champions_count", -1) or -1),
451
+ "total_games": int(existing_meta.get("total_games", -1) or -1),
452
+ "min_sample_size": int(existing_meta.get("min_sample_size", -1) or -1),
453
+ }
454
+ if existing_core == meta_core:
455
+ log(f"No material changes for patch {patch}; skipping upload")
456
+ return len(champion_stats)
457
 
458
  # Save files locally
459
  temp_dir = f"/tmp/champ-stats/{patch}"
 
473
 
474
  # Save tier list
475
  tier_list_path = f"{temp_dir}/tier-list.json"
476
+ with open(tier_list_path, 'w') as f:
477
+ json.dump({
478
+ "patch": patch,
479
+ "generated_at": datetime.now().isoformat(),
480
+ "total_champions": len(tier_list),
481
+ "calibration": tier_calibration,
482
+ "tiers": tier_list,
483
+ }, f, indent=2)
484
 
485
  tier_op = build_upload_operation(tier_list_path, f"champ-stats/{patch}/tier-list.json")
486
  if tier_op:
 
488
 
489
  # Save patch metadata
490
  meta_path = f"{temp_dir}/meta.json"
491
+ with open(meta_path, 'w') as f:
492
+ json.dump({
493
+ "patch": patch,
494
+ "generated_at": datetime.now().isoformat(),
495
+ "champions_count": len(champion_stats),
496
+ "total_games": total_games,
497
+ "min_sample_size": MIN_SAMPLE_SIZE,
498
+ }, f, indent=2)
499
 
500
  meta_op = build_upload_operation(meta_path, f"champ-stats/{patch}/meta.json")
501
  if meta_op:
 
512
  return 0
513
 
514
 
515
+ def run_processing_cycle():
516
  """Run a complete processing cycle for latest patches"""
517
  global stats, last_processing
518
 
519
+ log("=" * 60)
520
+ log("STARTING PROCESSING CYCLE")
521
+ log("=" * 60)
522
+ list_dataset_files(force_refresh=True)
523
 
524
  # Get latest patches
525
  patches = get_latest_patches(n=3)
 
628
  "champions_processed": stats["champions_processed"],
629
  "patches_processed": stats["patches_processed"],
630
  },
631
+ "config": {
632
+ "process_interval_seconds": PROCESS_INTERVAL_SECONDS,
633
+ "min_sample_size": MIN_SAMPLE_SIZE,
634
+ "tier_min_games": TIER_MIN_GAMES,
635
+ "tier_calibration_mode": TIER_CALIBRATION_MODE,
636
+ }
637
+ }
638
 
639
 
640
  @app.get("/trigger")