Ciroc0 commited on
Commit
6d44284
·
verified ·
1 Parent(s): 18efa2c

Update arca-processor

Browse files
Files changed (2) hide show
  1. README.md +18 -10
  2. app.py +197 -88
README.md CHANGED
@@ -16,18 +16,26 @@ Pre-computed champion stats generator for ArcaThread.
16
 
17
  This space processes matchup-matrix parquet files from `arca-thread-priors` dataset and generates lightweight JSON files per champion.
18
 
19
- ## Output Structure
20
-
21
- ```
22
- champ-stats/{patch}/{championName}.json
23
- champ-stats/{patch}/tier-list.json
24
- ```
 
25
 
26
  ## Schedule
27
 
28
  Runs hourly to detect new patches and update stats.
29
 
30
- ## Environment Variables
31
-
32
- - `HF_TOKEN` - HuggingFace API token
33
- - `DATASET_REPO` - Source dataset (default: ArcaThread/arca-thread-priors)
 
 
 
 
 
 
 
 
16
 
17
  This space processes matchup-matrix parquet files from the `arca-thread-priors` dataset and generates lightweight JSON files per champion.
18
 
19
+ ## Output Structure
20
+
21
+ ```
22
+ champ-stats/{patch}/{championId}.json
23
+ champ-stats/{patch}/tier-list.json
24
+ champ-stats/{patch}/meta.json
25
+ ```
26
 
27
  ## Schedule
28
 
29
  Runs hourly to detect new patches and update stats.
30
 
31
+ ## Environment Variables
32
+
33
+ - `HF_TOKEN` - HuggingFace API token
34
+ - `DATASET_REPO` - Source dataset (default: ArcaThread/arca-thread-priors)
35
+ - `PROCESS_INTERVAL_SECONDS` - Processing interval in seconds (default: 3600, min 60)
36
+ - `MIN_SAMPLE_SIZE` - Minimum sample size for champion aggregation (default: 100)
37
+ - `DATASET_FILE_CACHE_SECONDS` - TTL for cached `list_repo_files` index (default: 300, min 30)
38
+ - `TIER_MIN_GAMES` - Minimum games for tier-list eligibility (default: 500)
39
+ - `TIER_CALIBRATION_MODE` - `quantile` (default) or `static`
40
+ - `TIER_STATIC_S_MIN_WR`, `TIER_STATIC_A_MIN_WR`, `TIER_STATIC_B_MIN_WR`, `TIER_STATIC_C_MIN_WR`
41
+ - Used only when `TIER_CALIBRATION_MODE=static`
app.py CHANGED
@@ -6,9 +6,8 @@ ArcaThread Processor v1.0
6
  - Creates champ-stats/{patch}/{champion}.json files
7
  """
8
 
9
- import os
10
- import sys
11
- import json
12
  import time
13
  import re
14
  import threading
@@ -27,15 +26,22 @@ from hf_client import get_hf_api, get_hf_config
27
  HF_CFG = get_hf_config()
28
  HF_TOKEN = HF_CFG.token
29
  DATASET_REPO = HF_CFG.dataset_repo
30
- PROCESS_INTERVAL_SECONDS = max(60, int(os.environ.get("PROCESS_INTERVAL_SECONDS", "3600")))
31
- MIN_SAMPLE_SIZE = int(os.environ.get("MIN_SAMPLE_SIZE", "100"))
32
-
33
- RANKS = [
34
- "IRON", "BRONZE", "SILVER", "GOLD", "PLATINUM",
35
- "EMERALD", "DIAMOND", "MASTER", "GRANDMASTER", "CHALLENGER"
36
- ]
37
-
38
- ROLE_MAPPING = {'TOP': 0, 'JUNGLE': 1, 'MIDDLE': 2, 'BOTTOM': 3, 'SUPPORT': 4, 'UNKNOWN': 5}
 
 
 
 
 
 
 
39
 
40
  # Global state
41
  is_running = True
@@ -48,7 +54,12 @@ stats = {
48
  "last_processing_per_patch": {},
49
  "processing_history": []
50
  }
51
- state_lock = threading.Lock()
 
 
 
 
 
52
 
53
  app = FastAPI(title="ArcaThread Processor v1.0")
54
 
@@ -61,27 +72,61 @@ def log(msg: str):
61
  print(f"[{timestamp}] {msg}", flush=True)
62
 
63
 
64
- def _normalize_patch_token(value: str) -> Optional[str]:
65
  """Extract major.minor from patch string"""
66
  text = str(value or "").strip()
67
  match = re.match(r"^(\d+)\.(\d+)", text)
68
  if not match:
69
  return None
70
- return f"{match.group(1)}.{match.group(2)}"
71
-
72
-
73
- def _extract_champion_name(champion_id: int) -> str:
74
- """Convert champion ID to name (placeholder - will use ID as key)"""
75
- return str(champion_id)
76
-
77
-
78
- def load_matchup_data_for_patch(patch: str) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  """Load all matchup data for a specific patch across all ranks"""
80
  log(f"Loading matchup data for patch {patch}...")
81
 
82
  try:
83
- api = get_hf_api()
84
- all_files = list_repo_files(DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
85
 
86
  # Filter for this patch's matchup files
87
  patch_files = [
@@ -127,11 +172,10 @@ def load_matchup_data_for_patch(patch: str) -> pd.DataFrame:
127
  return pd.DataFrame()
128
 
129
 
130
- def get_latest_patches(n: int = 3) -> List[str]:
131
  """Get the n latest patches from the dataset"""
132
  try:
133
- api = get_hf_api()
134
- all_files = list_repo_files(DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
135
 
136
  patches = set()
137
  for f in all_files:
@@ -249,38 +293,80 @@ def compute_champion_stats(df: pd.DataFrame) -> Dict[str, Dict[str, Any]]:
249
  return result
250
 
251
 
252
- def generate_tier_list(stats_by_champion: Dict[str, Dict], min_games: int = 500) -> List[Dict]:
253
- """Generate tier list from champion stats"""
254
- tiers = []
255
-
256
- for champ_id, data in stats_by_champion.items():
257
- if data["total_games"] < min_games:
258
- continue
259
-
260
- win_rate = data["win_rate"]
261
-
262
- # Determine tier based on win rate
263
- if win_rate >= 0.54:
264
- tier = "S"
265
- elif win_rate >= 0.52:
266
- tier = "A"
267
- elif win_rate >= 0.50:
268
- tier = "B"
269
- elif win_rate >= 0.48:
270
- tier = "C"
271
- else:
272
- tier = "D"
273
-
274
- tiers.append({
275
- "champion_id": data["champion_id"],
276
- "tier": tier,
277
- "win_rate": win_rate,
278
- "games": data["total_games"],
279
- })
280
-
281
- # Sort by win rate descending
282
- tiers.sort(key=lambda x: x["win_rate"], reverse=True)
283
- return tiers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
 
286
  def build_upload_operation(local_path: str, repo_path: str) -> Optional[CommitOperationAdd]:
@@ -326,7 +412,7 @@ def upload_operations(operations: List[CommitOperationAdd], commit_message: str)
326
  return False
327
 
328
 
329
- def process_patch(patch: str) -> int:
330
  """Process a single patch and generate champion stats"""
331
  log(f"=" * 60)
332
  log(f"Processing patch: {patch}")
@@ -346,9 +432,28 @@ def process_patch(patch: str) -> int:
346
  log("No champions met the minimum sample size requirement")
347
  return 0
348
 
349
- # Generate tier list
350
- tier_list = generate_tier_list(champion_stats)
351
- log(f"Generated tier list with {len(tier_list)} champions")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
  # Save files locally
354
  temp_dir = f"/tmp/champ-stats/{patch}"
@@ -368,13 +473,14 @@ def process_patch(patch: str) -> int:
368
 
369
  # Save tier list
370
  tier_list_path = f"{temp_dir}/tier-list.json"
371
- with open(tier_list_path, 'w') as f:
372
- json.dump({
373
- "patch": patch,
374
- "generated_at": datetime.now().isoformat(),
375
- "total_champions": len(tier_list),
376
- "tiers": tier_list,
377
- }, f, indent=2)
 
378
 
379
  tier_op = build_upload_operation(tier_list_path, f"champ-stats/{patch}/tier-list.json")
380
  if tier_op:
@@ -382,14 +488,14 @@ def process_patch(patch: str) -> int:
382
 
383
  # Save patch metadata
384
  meta_path = f"{temp_dir}/meta.json"
385
- with open(meta_path, 'w') as f:
386
- json.dump({
387
- "patch": patch,
388
- "generated_at": datetime.now().isoformat(),
389
- "champions_count": len(champion_stats),
390
- "total_games": int(df['sample_size'].sum()) if 'sample_size' in df.columns else 0,
391
- "min_sample_size": MIN_SAMPLE_SIZE,
392
- }, f, indent=2)
393
 
394
  meta_op = build_upload_operation(meta_path, f"champ-stats/{patch}/meta.json")
395
  if meta_op:
@@ -406,13 +512,14 @@ def process_patch(patch: str) -> int:
406
  return 0
407
 
408
 
409
- def run_processing_cycle():
410
  """Run a complete processing cycle for latest patches"""
411
  global stats, last_processing
412
 
413
- log("=" * 60)
414
- log("STARTING PROCESSING CYCLE")
415
- log("=" * 60)
 
416
 
417
  # Get latest patches
418
  patches = get_latest_patches(n=3)
@@ -521,11 +628,13 @@ def health():
521
  "champions_processed": stats["champions_processed"],
522
  "patches_processed": stats["patches_processed"],
523
  },
524
- "config": {
525
- "process_interval_seconds": PROCESS_INTERVAL_SECONDS,
526
- "min_sample_size": MIN_SAMPLE_SIZE,
527
- }
528
- }
 
 
529
 
530
 
531
  @app.get("/trigger")
 
6
  - Creates champ-stats/{patch}/{champion}.json files
7
  """
8
 
9
+ import os
10
+ import json
 
11
  import time
12
  import re
13
  import threading
 
26
  HF_CFG = get_hf_config()
27
  HF_TOKEN = HF_CFG.token
28
  DATASET_REPO = HF_CFG.dataset_repo
29
+ PROCESS_INTERVAL_SECONDS = max(60, int(os.environ.get("PROCESS_INTERVAL_SECONDS", "3600")))
30
+ MIN_SAMPLE_SIZE = int(os.environ.get("MIN_SAMPLE_SIZE", "100"))
31
+ DATASET_FILE_CACHE_SECONDS = max(30, int(os.environ.get("DATASET_FILE_CACHE_SECONDS", "300")))
32
+ TIER_MIN_GAMES = max(1, int(os.environ.get("TIER_MIN_GAMES", "500")))
33
+ TIER_CALIBRATION_MODE = str(os.environ.get("TIER_CALIBRATION_MODE", "quantile")).strip().lower()
34
+ TIER_STATIC_THRESHOLDS = (
35
+ float(os.environ.get("TIER_STATIC_S_MIN_WR", "0.54")),
36
+ float(os.environ.get("TIER_STATIC_A_MIN_WR", "0.52")),
37
+ float(os.environ.get("TIER_STATIC_B_MIN_WR", "0.50")),
38
+ float(os.environ.get("TIER_STATIC_C_MIN_WR", "0.48")),
39
+ )
40
+
41
+ RANKS = [
42
+ "IRON", "BRONZE", "SILVER", "GOLD", "PLATINUM",
43
+ "EMERALD", "DIAMOND", "MASTER", "GRANDMASTER", "CHALLENGER"
44
+ ]
45
 
46
  # Global state
47
  is_running = True
 
54
  "last_processing_per_patch": {},
55
  "processing_history": []
56
  }
57
+ state_lock = threading.Lock()
58
+ dataset_file_cache_lock = threading.Lock()
59
+ dataset_file_cache = {
60
+ "timestamp": 0.0,
61
+ "files": [],
62
+ }
63
 
64
  app = FastAPI(title="ArcaThread Processor v1.0")
65
 
 
72
  print(f"[{timestamp}] {msg}", flush=True)
73
 
74
 
75
+ def _normalize_patch_token(value: str) -> Optional[str]:
76
  """Extract major.minor from patch string"""
77
  text = str(value or "").strip()
78
  match = re.match(r"^(\d+)\.(\d+)", text)
79
  if not match:
80
  return None
81
+ return f"{match.group(1)}.{match.group(2)}"
82
+
83
+
84
def list_dataset_files(force_refresh: bool = False) -> List[str]:
    """Return the dataset's file listing, serving from a short-lived cache.

    A cached listing is reused while it is younger than
    DATASET_FILE_CACHE_SECONDS and non-empty; pass force_refresh=True to
    bypass the cache and re-query the Hub.
    """
    checked_at = time.time()
    with dataset_file_cache_lock:
        known_files = dataset_file_cache.get("files", [])
        stamped_at = float(dataset_file_cache.get("timestamp", 0.0) or 0.0)
        still_fresh = (checked_at - stamped_at) < DATASET_FILE_CACHE_SECONDS
        if known_files and still_fresh and not force_refresh:
            # Hand back a copy so callers cannot mutate the cached list.
            return list(known_files)

    # Cache miss or forced refresh: query the Hub outside the lock, then
    # store a defensive copy along with the observation time.
    fresh_files = list_repo_files(DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
    with dataset_file_cache_lock:
        dataset_file_cache["files"] = list(fresh_files)
        dataset_file_cache["timestamp"] = checked_at
    return fresh_files
102
+
103
+
104
def load_existing_patch_meta(patch: str) -> Optional[Dict[str, Any]]:
    """Fetch a patch's previously-published meta.json, or None if unavailable.

    Best-effort by design: any failure (file missing on the Hub, network
    error, unparsable JSON, non-dict payload) yields None rather than
    raising, so callers can treat "no meta" and "couldn't read meta" alike.
    """
    repo_file = f"champ-stats/{patch}/meta.json"
    try:
        downloaded = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=repo_file,
            repo_type="dataset",
            token=HF_TOKEN,
            local_dir="/tmp",
        )
        with open(downloaded, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
    except Exception:
        return None
    return parsed if isinstance(parsed, dict) else None
122
+
123
+
124
+ def load_matchup_data_for_patch(patch: str) -> pd.DataFrame:
125
  """Load all matchup data for a specific patch across all ranks"""
126
  log(f"Loading matchup data for patch {patch}...")
127
 
128
  try:
129
+ all_files = list_dataset_files()
 
130
 
131
  # Filter for this patch's matchup files
132
  patch_files = [
 
172
  return pd.DataFrame()
173
 
174
 
175
+ def get_latest_patches(n: int = 3) -> List[str]:
176
  """Get the n latest patches from the dataset"""
177
  try:
178
+ all_files = list_dataset_files()
 
179
 
180
  patches = set()
181
  for f in all_files:
 
293
  return result
294
 
295
 
296
def _resolve_tier_thresholds(win_rates: List[float]) -> tuple:
    """Pick the (S, A, B, C) minimum win-rate cutoffs plus the mode used.

    quantile mode (requires at least 10 samples): cutoffs adapt to the
    current patch's win-rate distribution, so the top 20% land in S, the
    next 20% in A, and so on. Any other case falls back to the static,
    env-configured cutoffs. Returns (s_min, a_min, b_min, c_min, mode).
    """
    adaptive = TIER_CALIBRATION_MODE == "quantile" and len(win_rates) >= 10
    if adaptive:
        cuts = np.quantile(np.asarray(win_rates, dtype=np.float32), [0.8, 0.6, 0.4, 0.2])
        return tuple(float(c) for c in cuts) + ("quantile",)
    return tuple(float(t) for t in TIER_STATIC_THRESHOLDS) + ("static",)
308
+
309
+
310
+ def _assign_tier(win_rate: float, thresholds: tuple) -> str:
311
+ s_min, a_min, b_min, c_min = thresholds
312
+ if win_rate >= s_min:
313
+ return "S"
314
+ if win_rate >= a_min:
315
+ return "A"
316
+ if win_rate >= b_min:
317
+ return "B"
318
+ if win_rate >= c_min:
319
+ return "C"
320
+ return "D"
321
+
322
+
323
def generate_tier_list(
    stats_by_champion: Dict[str, Dict],
    min_games: Optional[int] = None
) -> tuple[List[Dict], Dict[str, Any]]:
    """Build a win-rate-sorted tier list plus the calibration metadata used.

    Champions with fewer than the games floor (min_games, defaulting to
    TIER_MIN_GAMES) are excluded entirely. When no champion qualifies,
    returns ([], calibration) with mode "none" so consumers can tell why
    the list is empty.
    """
    games_floor = max(1, int(TIER_MIN_GAMES if min_games is None else min_games))
    eligible = [
        entry for entry in stats_by_champion.values()
        if int(entry.get("total_games", 0) or 0) >= games_floor
    ]
    if not eligible:
        return [], {
            "mode": "none",
            "min_games": games_floor,
            "thresholds": {"S": None, "A": None, "B": None, "C": None},
            "eligible_champions": 0,
        }

    # Cutoffs are calibrated from the eligible champions' win rates.
    rates = [float(entry.get("win_rate", 0.5) or 0.5) for entry in eligible]
    s_min, a_min, b_min, c_min, used_mode = _resolve_tier_thresholds(rates)
    cutoffs = (s_min, a_min, b_min, c_min)

    ranked = [
        {
            "champion_id": int(entry.get("champion_id", 0) or 0),
            "tier": _assign_tier(float(entry.get("win_rate", 0.5) or 0.5), cutoffs),
            "win_rate": float(entry.get("win_rate", 0.5) or 0.5),
            "games": int(entry.get("total_games", 0) or 0),
        }
        for entry in eligible
    ]
    ranked.sort(key=lambda row: row["win_rate"], reverse=True)

    calibration = {
        "mode": used_mode,
        "min_games": games_floor,
        "thresholds": {
            "S": round(s_min, 4),
            "A": round(a_min, 4),
            "B": round(b_min, 4),
            "C": round(c_min, 4),
        },
        "eligible_champions": len(eligible),
    }
    return ranked, calibration
370
 
371
 
372
  def build_upload_operation(local_path: str, repo_path: str) -> Optional[CommitOperationAdd]:
 
412
  return False
413
 
414
 
415
+ def process_patch(patch: str) -> int:
416
  """Process a single patch and generate champion stats"""
417
  log(f"=" * 60)
418
  log(f"Processing patch: {patch}")
 
432
  log("No champions met the minimum sample size requirement")
433
  return 0
434
 
435
+ # Generate tier list
436
+ tier_list, tier_calibration = generate_tier_list(champion_stats)
437
+ log(f"Generated tier list with {len(tier_list)} champions")
438
+
439
+ total_games = int(df['sample_size'].sum()) if 'sample_size' in df.columns else 0
440
+ meta_core = {
441
+ "patch": patch,
442
+ "champions_count": len(champion_stats),
443
+ "total_games": total_games,
444
+ "min_sample_size": MIN_SAMPLE_SIZE,
445
+ }
446
+ existing_meta = load_existing_patch_meta(patch)
447
+ if existing_meta:
448
+ existing_core = {
449
+ "patch": str(existing_meta.get("patch", "")),
450
+ "champions_count": int(existing_meta.get("champions_count", -1) or -1),
451
+ "total_games": int(existing_meta.get("total_games", -1) or -1),
452
+ "min_sample_size": int(existing_meta.get("min_sample_size", -1) or -1),
453
+ }
454
+ if existing_core == meta_core:
455
+ log(f"No material changes for patch {patch}; skipping upload")
456
+ return len(champion_stats)
457
 
458
  # Save files locally
459
  temp_dir = f"/tmp/champ-stats/{patch}"
 
473
 
474
  # Save tier list
475
  tier_list_path = f"{temp_dir}/tier-list.json"
476
+ with open(tier_list_path, 'w') as f:
477
+ json.dump({
478
+ "patch": patch,
479
+ "generated_at": datetime.now().isoformat(),
480
+ "total_champions": len(tier_list),
481
+ "calibration": tier_calibration,
482
+ "tiers": tier_list,
483
+ }, f, indent=2)
484
 
485
  tier_op = build_upload_operation(tier_list_path, f"champ-stats/{patch}/tier-list.json")
486
  if tier_op:
 
488
 
489
  # Save patch metadata
490
  meta_path = f"{temp_dir}/meta.json"
491
+ with open(meta_path, 'w') as f:
492
+ json.dump({
493
+ "patch": patch,
494
+ "generated_at": datetime.now().isoformat(),
495
+ "champions_count": len(champion_stats),
496
+ "total_games": total_games,
497
+ "min_sample_size": MIN_SAMPLE_SIZE,
498
+ }, f, indent=2)
499
 
500
  meta_op = build_upload_operation(meta_path, f"champ-stats/{patch}/meta.json")
501
  if meta_op:
 
512
  return 0
513
 
514
 
515
+ def run_processing_cycle():
516
  """Run a complete processing cycle for latest patches"""
517
  global stats, last_processing
518
 
519
+ log("=" * 60)
520
+ log("STARTING PROCESSING CYCLE")
521
+ log("=" * 60)
522
+ list_dataset_files(force_refresh=True)
523
 
524
  # Get latest patches
525
  patches = get_latest_patches(n=3)
 
628
  "champions_processed": stats["champions_processed"],
629
  "patches_processed": stats["patches_processed"],
630
  },
631
+ "config": {
632
+ "process_interval_seconds": PROCESS_INTERVAL_SECONDS,
633
+ "min_sample_size": MIN_SAMPLE_SIZE,
634
+ "tier_min_games": TIER_MIN_GAMES,
635
+ "tier_calibration_mode": TIER_CALIBRATION_MODE,
636
+ }
637
+ }
638
 
639
 
640
  @app.get("/trigger")