Spaces: Running on Zero
| #!/usr/bin/env python3 | |
| """MANIFOLD Training Interface for Hugging Face Spaces with ZeroGPU.""" | |
| import gradio as gr | |
| import torch | |
| import numpy as np | |
| import json | |
| import time | |
| import uuid | |
| from pathlib import Path | |
| from datetime import datetime | |
| import spaces | |
| import sys | |
| sys.path.insert(0, str(Path(__file__).parent / "src")) | |
| from manifold import MANIFOLDLite | |
| from manifold.config import ModelConfig, TrainingConfig | |
| from manifold.data.generator import SyntheticDataGenerator | |
| from manifold.data.dataset import MANIFOLDDataset, create_dataloader | |
| from manifold.training.trainer import train_epoch, validate | |
| from manifold.training.curriculum import CurriculumScheduler | |
| from manifold.training.losses import compute_total_loss | |
# Most recently trained model, cached across Gradio callbacks (moved to CPU between calls
# so a ZeroGPU allocation is not held open).
current_model = None
# Hugging Face dataset repo that opt-in synthetic-data contributions are uploaded to.
DATASET_REPO = "LimmeDev/manifold-synthetic-data"
def get_device_info():
    """Return a one-line description of the compute device torch currently sees."""
    if not torch.cuda.is_available():
        return "CPU (GPU will be allocated when training starts)"
    gpu_name = torch.cuda.get_device_name(0)
    mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    return f"GPU: {gpu_name} ({mem_gb:.1f} GB)"
def contribute_to_dataset(features, labels, num_legit, num_cheaters, seed):
    """Upload a generated batch (features/labels/metadata) to the community dataset repo.

    Returns (True, contribution_id) on success, or (False, error_message) on any
    failure — missing token, missing huggingface_hub, or upload error.
    """
    try:
        from huggingface_hub import HfApi
        import tempfile
        import os
        token = os.environ.get("HF_TOKEN")
        if not token:
            return False, "HF_TOKEN not configured"
        api = HfApi(token=token)
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        contribution_id = f"{stamp}_{uuid.uuid4().hex[:8]}"
        # Unique per-contribution file names, uploaded in this fixed order.
        file_names = (
            f"features_{contribution_id}.npy",
            f"labels_{contribution_id}.npy",
            f"meta_{contribution_id}.json",
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            local_paths = [os.path.join(tmpdir, name) for name in file_names]
            np.save(local_paths[0], features)
            np.save(local_paths[1], labels)
            metadata = {
                "contribution_id": contribution_id,
                "timestamp": datetime.now().isoformat(),
                "num_legit": int(num_legit),
                "num_cheaters": int(num_cheaters),
                "total_samples": len(labels),
                "seed": int(seed),
                "features_shape": list(features.shape),
            }
            with open(local_paths[2], "w") as fh:
                json.dump(metadata, fh, indent=2)
            for path, name in zip(local_paths, file_names):
                api.upload_file(path_or_fileobj=path, path_in_repo=f"contributions/{name}", repo_id=DATASET_REPO, repo_type="dataset")
        return True, contribution_id
    except Exception as exc:
        return False, str(exc)
def generate_data(num_legit, num_cheaters, seed, contribute, progress=gr.Progress()):
    """Generate synthetic player sessions and save a shuffled 90/10 split to /tmp.

    Args:
        num_legit: number of legitimate players to synthesize (Gradio sliders may
            deliver floats, so it is cast to int here).
        num_cheaters: number of cheating players to synthesize.
        seed: RNG seed for both generation and the shuffle.
        contribute: when True, also upload the batch via contribute_to_dataset().
        progress: Gradio progress tracker (default instance is Gradio's documented
            idiom for enabling the progress bar).

    Returns:
        A human-readable status string for the UI.
    """
    progress(0, desc="Initializing generator...")
    generator = SyntheticDataGenerator(seed=int(seed), engagements_per_session=200)
    # Cast up front: sliders can hand back floats, and np.random.Generator.permutation
    # rejects non-integer sizes.
    n_legit = int(num_legit)
    n_cheat = int(num_cheaters)
    total = n_legit + n_cheat
    all_features = []
    all_labels = []
    for _ in progress.tqdm(range(n_legit), desc="Generating legit players"):
        session = generator.generate_player(is_cheater=False)
        all_features.append(session.to_tensor())
        all_labels.append(0)  # class 0 = clean
    for _ in progress.tqdm(range(n_cheat), desc="Generating cheaters"):
        session = generator.generate_player(is_cheater=True)
        all_features.append(session.to_tensor())
        all_labels.append(2)  # class 2 = cheating (class 1 "suspicious" is never generated here)
    features = np.array(all_features)
    labels = np.array(all_labels)
    # Shuffle before splitting so train and val share the same class mix.
    rng = np.random.default_rng(int(seed))
    indices = rng.permutation(total)
    features = features[indices]
    labels = labels[indices]
    split_idx = int(total * 0.9)
    data_dir = Path("/tmp/manifold_data")
    data_dir.mkdir(parents=True, exist_ok=True)
    np.save(data_dir / "train_features.npy", features[:split_idx])
    np.save(data_dir / "train_labels.npy", labels[:split_idx])
    np.save(data_dir / "val_features.npy", features[split_idx:])
    np.save(data_dir / "val_labels.npy", labels[split_idx:])
    status = f"β Generated {total} samples:\n- Train: {split_idx}\n- Val: {total - split_idx}\n- Shape: {features.shape}"
    if contribute:
        progress(0.95, desc="Contributing to community dataset...")
        success, result = contribute_to_dataset(features, labels, n_legit, n_cheat, seed)
        if success:
            status += f"\n\nπ Contributed to community dataset! ID: {result}"
        else:
            status += f"\n\nβ οΈ Dataset contribution failed: {result}"
    return status
def train_model(batch_size, learning_rate, num_epochs):
    """Train MANIFOLDLite on the arrays saved under /tmp/manifold_data.

    Args:
        batch_size: requested batch size; clamped to the train-set size below.
        learning_rate: initial AdamW learning rate. NOTE(review): it is
            overwritten at the start of every epoch by the curriculum stage's
            learning rate, so it only matters before epoch 0 — confirm intended.
        num_epochs: number of training epochs.

    Returns:
        (status_message, log_text) strings for the Gradio UI.
    """
    # NOTE(review): `spaces` is imported at file top for ZeroGPU, but no
    # @spaces.GPU decorator appears on this function — confirm GPU allocation
    # is handled elsewhere.
    global current_model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    gpu_info = f"Using: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "CPU only"
    data_dir = Path("/tmp/manifold_data")
    # Data must have been produced by the "Generate Data" tab first.
    if not (data_dir / "train_features.npy").exists():
        return "β No data found! Generate data first.", ""
    train_features = np.load(data_dir / "train_features.npy")
    train_labels = np.load(data_dir / "train_labels.npy")
    val_features = np.load(data_dir / "val_features.npy")
    val_labels = np.load(data_dir / "val_labels.npy")
    train_dataset = MANIFOLDDataset(data=train_features, labels=train_labels)
    val_dataset = MANIFOLDDataset(data=val_features, labels=val_labels)
    # Clamp so tiny datasets still yield at least one full batch.
    actual_batch = min(int(batch_size), len(train_dataset))
    from torch.utils.data import DataLoader
    train_loader = DataLoader(train_dataset, batch_size=actual_batch, shuffle=True, num_workers=0, drop_last=False, pin_memory=False)
    val_loader = DataLoader(val_dataset, batch_size=actual_batch, shuffle=False, num_workers=0, drop_last=False, pin_memory=False)
    model = MANIFOLDLite.from_config(ModelConfig())
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    # GradScaler is a no-op when disabled, so the same loop runs on CPU and GPU.
    scaler = torch.amp.GradScaler(enabled=torch.cuda.is_available())
    scheduler = CurriculumScheduler()
    logs = []
    logs.append(f"π {gpu_info}")
    logs.append(f"π Train: {len(train_dataset)}, Val: {len(val_dataset)}")
    logs.append(f"π§ Params: {model.get_num_params():,}")
    logs.append("-" * 40)
    global_step = 0
    # NOTE(review): if num_epochs == 0 this loop never runs and `val_acc` at the
    # end of the function is unbound (NameError) — confirm the UI slider
    # (min 5) is the only caller.
    for epoch in range(int(num_epochs)):
        # Curriculum stage controls learning rate, which model components are
        # active, and the loss mix for this epoch.
        stage_config = scheduler.get_stage_config()
        for pg in optimizer.param_groups:
            pg["lr"] = stage_config["learning_rate"]
        model.train()
        train_loss = 0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            mask = batch.get("mask")
            if mask is not None:
                mask = mask.bool()
            # Forward + loss under fp16 autocast (CUDA only); backward on the
            # scaled loss outside the autocast region.
            with torch.amp.autocast(device_type='cuda', dtype=torch.float16, enabled=torch.cuda.is_available()):
                outputs = model(batch["features"], mask=mask, active_components=stage_config.get("components"))
                loss, _ = compute_total_loss(outputs, {"labels": batch["labels"]}, stage_config["losses"], global_step)
            scaler.scale(loss).backward()
            # Unscale before clipping so the 1.0 threshold applies to true gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            train_loss += loss.item()
            global_step += 1
        train_loss /= len(train_loader)
        # ---- validation pass (no grads, no autocast) ----
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                mask = batch.get("mask")
                if mask is not None:
                    mask = mask.bool()
                outputs = model(batch["features"], mask=mask, active_components=stage_config.get("components"))
                loss, _ = compute_total_loss(outputs, {"labels": batch["labels"]}, stage_config["losses"])
                val_loss += loss.item()
                # Accuracy only when the model emits a hard class prediction.
                if "predicted_class" in outputs:
                    correct += (outputs["predicted_class"] == batch["labels"]).sum().item()
                    total += batch["labels"].size(0)
        val_loss = val_loss / len(val_loader) if len(val_loader) > 0 else 0
        val_acc = correct / total if total > 0 else 0
        step_info = scheduler.step_epoch()
        # Keep only the short tag before ":" for compact log lines.
        stage_name = step_info["stage_name"].split(":")[0] if ":" in step_info["stage_name"] else step_info["stage_name"]
        logs.append(f"Epoch {epoch+1:2d} | {stage_name:8s} | Loss: {train_loss:.4f} / {val_loss:.4f} | Acc: {val_acc:.4f}")
        if step_info.get("stage_changed"):
            logs.append(f" β Advanced to {scheduler.current_stage.name}")
    save_path = Path("/tmp/manifold_model.pt")
    torch.save({"model_state_dict": model.state_dict(), "config": ModelConfig()}, save_path)
    # Cache on CPU so any GPU allocation can be released between callbacks.
    current_model = model.cpu()
    logs.append("-" * 40)
    logs.append(f"β Training complete! Final val accuracy: {val_acc:.4f}")
    return "β Training complete!", "\n".join(logs)
def test_inference(num_samples):
    """Run the trained model on fresh synthetic samples and return a markdown report.

    Loads the cached model (or the /tmp checkpoint) and alternates legit/cheater
    samples, reporting prediction, confidence, uncertainty and overall accuracy.

    Args:
        num_samples: number of synthetic sessions to score.

    Returns:
        Markdown table string, or an error string when no trained model exists.
    """
    global current_model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if current_model is None:
        model_path = Path("/tmp/manifold_model.pt")
        if model_path.exists():
            current_model = MANIFOLDLite.from_config(ModelConfig())
            ckpt = torch.load(model_path, map_location="cpu")
            current_model.load_state_dict(ckpt["model_state_dict"])
        else:
            return "β No model! Train first."
    model = current_model.to(device)
    model.eval()
    # Fixed seed distinct from training data so test samples are reproducible.
    generator = SyntheticDataGenerator(seed=99999)
    classes = ["Clean", "Suspicious", "Cheating"]  # hoisted out of the loop
    num_samples = int(num_samples)
    results = []
    correct_count = 0
    for i in range(num_samples):
        is_cheater = i % 2 == 1  # alternate legit / cheater samples
        session = generator.generate_player(is_cheater=is_cheater)
        features = torch.tensor(session.to_tensor(), dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            outputs = model(features)
        pred = outputs["predicted_class"].item()
        conf = outputs["verdict_probs"][0].max().item()
        unc = outputs["uncertainty"].item()
        actual = "Cheater" if is_cheater else "Legit"
        # Any non-clean prediction (Suspicious or Cheating) counts as detection.
        is_correct = (pred > 0) == is_cheater
        # BUGFIX: the correct/incorrect markers were the identical string, so the
        # old substring-based tally counted every row as correct (always 100%).
        # Count with an integer here and use two distinct markers.
        correct_count += int(is_correct)
        mark = "✅" if is_correct else "❌"
        results.append(f"| {i+1} | {actual} | {classes[pred]} | {conf:.1%} | {unc:.3f} | {mark} |")
    current_model = model.cpu()
    header = "| # | Actual | Predicted | Conf | Uncert | ✅/❌ |\n|---|--------|-----------|------|--------|-----|"
    footer = f"\n\n**Accuracy: {correct_count}/{num_samples} ({100*correct_count/num_samples:.1f}%)**"
    return header + "\n" + "\n".join(results) + footer
# ---- Gradio UI: three tabs mirroring the workflow (generate -> train -> test) ----
with gr.Blocks(title="MANIFOLD Training", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π― MANIFOLD - CS2 Cheat Detection")
    gr.Markdown(f"**{get_device_info()}** | ZeroGPU will allocate H200 on demand")
    with gr.Tabs():
        # Tab 1: synthesize data and optionally contribute it upstream.
        with gr.TabItem("1οΈβ£ Generate Data"):
            gr.Markdown("Generate synthetic CS2 player data")
            with gr.Row():
                num_legit = gr.Slider(50, 10000, value=70, step=10, label="Legit Players")
                num_cheaters = gr.Slider(20, 5000, value=30, step=10, label="Cheaters")
                seed = gr.Number(value=42, label="Seed")
            gr.Markdown("---")
            # Opt-in consent gate: when checked, generate_data also uploads
            # the batch to the community dataset repo.
            contribute_checkbox = gr.Checkbox(
                value=False,
                label="π Contribute to Community Dataset",
                info="I agree to contribute this synthetic data to the public MANIFOLD dataset on Hugging Face. This data is purely synthetic and contains no personal information."
            )
            gen_btn = gr.Button("π² Generate Data", variant="primary")
            gen_output = gr.Textbox(label="Status", lines=5)
            gen_btn.click(generate_data, [num_legit, num_cheaters, seed, contribute_checkbox], gen_output)
        # Tab 2: curriculum training on the generated data.
        with gr.TabItem("2οΈβ£ Train Model"):
            gr.Markdown("Train with 4-stage curriculum learning (ZeroGPU: 5 min limit)")
            with gr.Row():
                batch_size = gr.Slider(16, 128, value=64, step=16, label="Batch Size")
                lr = gr.Number(value=3e-4, label="Learning Rate")
                epochs = gr.Slider(5, 50, value=15, step=5, label="Epochs")
            train_btn = gr.Button("π Start Training", variant="primary")
            train_status = gr.Textbox(label="Status", lines=2)
            train_logs = gr.Textbox(label="Training Logs", lines=15)
            train_btn.click(train_model, [batch_size, lr, epochs], [train_status, train_logs])
        # Tab 3: quick inference sanity check on fresh synthetic samples.
        with gr.TabItem("3οΈβ£ Test Model"):
            gr.Markdown("Test on synthetic samples")
            num_test = gr.Slider(5, 30, value=10, step=5, label="Test Samples")
            test_btn = gr.Button("π Run Inference", variant="primary")
            test_output = gr.Markdown()
            test_btn.click(test_inference, [num_test], test_output)
    gr.Markdown("---\n*MANIFOLD: Motor-Aware Neural Inference for Faithfulness Of Latent Dynamics*")
# Script entry point; Spaces runtimes may also import `demo` directly.
if __name__ == "__main__":
    demo.launch()