Spaces:
Sleeping
Sleeping
| import os | |
| import random | |
| from dataclasses import dataclass | |
| from datetime import datetime, timezone, timedelta | |
| from typing import Dict, List, Tuple | |
| import pandas as pd | |
| from datasets import Dataset, DatasetDict | |
| from dotenv import load_dotenv | |
# Number of synthetic users to generate; each user becomes one dataset split.
TOTAL_USERS: int = 50

# Number of consecutive window records generated for every user.
RECORDS_PER_USER: int = 50

# Length of the random embedding vector attached to each user profile.
USER_EMB_DIM: int = 12
@dataclass
class UserProfile:
    """Per-user baseline values from which synthetic window records are derived.

    Instances are built with keyword arguments by ``generate_user_profile``;
    the ``@dataclass`` decorator supplies the required ``__init__`` (plus
    ``__repr__`` and ``__eq__``).
    """

    # Identifier of the form "user_NNN" and the prefix used for session ids.
    user_id: str
    session_prefix: str
    # Timestamp of the user's first window; later windows are offset from it.
    base_time: datetime
    # Per-axis (x, y, z) baseline means for each signal group.
    acc_mean: Tuple[float, float, float]
    gyro_mean: Tuple[float, float, float]
    linacc_mean: Tuple[float, float, float]
    gravity_mean: Tuple[float, float, float]
    # Per-axis (x, y, z) baseline standard deviations.
    acc_std: Tuple[float, float, float]
    gyro_std: Tuple[float, float, float]
    # Scalar baselines that each record perturbs with a small random noise.
    rms_base: float
    rms_gyro_base: float
    mean_freq_acc: float
    mean_freq_gyro: float
    entropy_acc: float
    entropy_gyro: float
    jerk_mean: float
    jerk_std: float
    stability_index: float
    # Copied unchanged into every record of the user.
    freq_base: float
    user_emb: List[float]
    # Starting point of the user's fatigue random walk.
    fatigue_base: float
def require_env(var_name: str) -> str:
    """Return the value of the environment variable *var_name*.

    Args:
        var_name: Name of the environment variable to read.

    Returns:
        The variable's non-empty value.

    Raises:
        RuntimeError: If the variable is unset or set to an empty string.
    """
    value = os.getenv(var_name)
    if not value:
        # Message is Korean for "Environment variable {var_name} is required."
        raise RuntimeError(f"환경변수 {var_name}가 필요합니다.")
    return value
def random_vector(dim: int, scale: float = 1.0) -> List[float]:
    """Return *dim* uniform samples from [-scale, scale], rounded to 4 decimals."""
    components: List[float] = []
    for _ in range(dim):
        sample = random.uniform(-scale, scale)
        components.append(round(sample, 4))
    return components
def generate_user_profile(user_idx: int, start_time: datetime) -> UserProfile:
    """Build a randomised baseline profile for the user numbered *user_idx*.

    Args:
        user_idx: Index formatted (zero-padded to three digits) into the user id.
        start_time: Reference time; the profile's ``base_time`` is this value
            plus a random offset of up to five minutes.

    Returns:
        A ``UserProfile`` whose fields are drawn uniformly from fixed ranges.
    """

    def draw(low: float, high: float, digits: int = 4) -> float:
        # One uniform sample from [low, high], rounded to `digits` decimals.
        return round(random.uniform(low, high), digits)

    def draw_xyz(low: float, high: float) -> Tuple[float, float, float]:
        # Three independent samples, one per axis, evaluated left to right.
        return (draw(low, high), draw(low, high), draw(low, high))

    user_id = f"user_{user_idx:03d}"

    # Keyword arguments are evaluated top to bottom, which fixes the order in
    # which values are pulled from the random generator.
    return UserProfile(
        user_id=user_id,
        session_prefix=f"{user_id}_session",
        base_time=start_time + timedelta(minutes=random.uniform(0, 5)),
        acc_mean=draw_xyz(-0.2, 0.2),
        gyro_mean=draw_xyz(-0.05, 0.05),
        linacc_mean=draw_xyz(-0.3, 0.3),
        # x and y stay near zero while z is drawn around 1.
        gravity_mean=(draw(-0.05, 0.05), draw(-0.05, 0.05), draw(0.9, 1.1)),
        acc_std=draw_xyz(0.2, 0.6),
        gyro_std=draw_xyz(0.02, 0.08),
        rms_base=draw(0.3, 1.0),
        rms_gyro_base=draw(0.05, 0.2),
        mean_freq_acc=draw(25, 55, 2),
        mean_freq_gyro=draw(10, 25, 2),
        entropy_acc=draw(0.3, 0.8),
        entropy_gyro=draw(0.3, 0.7),
        jerk_mean=draw(-0.2, 0.2),
        jerk_std=draw(0.02, 0.08),
        stability_index=draw(0.6, 0.95),
        freq_base=draw(30, 55, 2),
        user_emb=random_vector(USER_EMB_DIM, scale=0.5),
        fatigue_base=draw(0.25, 0.6),
    )
def add_noise(value: float, noise_scale: float) -> float:
    """Return *value* shifted by a uniform offset in [-noise_scale, noise_scale].

    The result is rounded to 4 decimal places.
    """
    jitter = random.uniform(-noise_scale, noise_scale)
    noisy = value + jitter
    return round(noisy, 4)
def bounded(value: float, low: float, high: float) -> float:
    """Clamp *value* into the range [low, high].

    The upper bound is applied first and the lower bound second, so *low*
    wins if the bounds are passed in inverted order.
    """
    capped = value if value < high else high
    return capped if capped > low else low
def random_record(
    profile: UserProfile,
    record_idx: int,
    prev_fatigue: float,
    window_size_ms: int = 2000,
) -> Tuple[dict, float]:
    """Generate one synthetic window record for *profile*.

    Args:
        profile: Baseline values of the user the record belongs to.
        record_idx: Zero-based window index; it determines the window's start
            offset and is embedded in the session id.
        prev_fatigue: Fatigue value of the previous window (or the profile's
            starting fatigue for the first window).
        window_size_ms: Window length in milliseconds; windows are laid out
            back to back starting at ``profile.base_time``.

    Returns:
        ``(record, fatigue)`` where ``record`` is a flat dict of features and
        ``fatigue`` is the new, unrounded fatigue value to pass as
        ``prev_fatigue`` for the next window.
    """
    window_start_ms = record_idx * window_size_ms
    window_end_ms = window_start_ms + window_size_ms

    window_time = profile.base_time + timedelta(milliseconds=window_start_ms)
    if window_time.utcoffset() is None:
        # Naive timestamps are treated as already being in UTC.
        window_time = window_time.replace(tzinfo=timezone.utc)
    else:
        # Aware timestamps are converted, not relabelled, so the instant
        # they denote is preserved.
        window_time = window_time.astimezone(timezone.utc)

    # Fatigue follows a random walk with a slight upward drift, kept in range.
    fatigue_delta = random.uniform(-0.05, 0.1)
    fatigue = bounded(prev_fatigue + fatigue_delta, 0.05, 0.95)
    fatigue_rounded = round(fatigue, 4)
    # Derive the level from the stored (rounded) value so the two columns
    # can never disagree at the 0.3 / 0.6 thresholds.
    if fatigue_rounded < 0.3:
        fatigue_level = 0
    elif fatigue_rounded < 0.6:
        fatigue_level = 1
    else:
        fatigue_level = 2

    # The dict literal is evaluated top to bottom; keep the order stable so
    # seeded runs stay reproducible and column order is unchanged.
    record = {
        "user_id": profile.user_id,
        "session_id": f"{profile.session_prefix}_{record_idx:03d}",
        "window_id": record_idx,
        "window_start_ms": window_start_ms,
        "window_end_ms": window_end_ms,
        "timestamp_utc": window_time.isoformat(),
        "acc_x_mean": add_noise(profile.acc_mean[0], 0.05),
        "acc_y_mean": add_noise(profile.acc_mean[1], 0.05),
        "acc_z_mean": add_noise(profile.acc_mean[2], 0.05),
        "gyro_x_mean": add_noise(profile.gyro_mean[0], 0.01),
        "gyro_y_mean": add_noise(profile.gyro_mean[1], 0.01),
        "gyro_z_mean": add_noise(profile.gyro_mean[2], 0.01),
        "linacc_x_mean": add_noise(profile.linacc_mean[0], 0.07),
        "linacc_y_mean": add_noise(profile.linacc_mean[1], 0.07),
        "linacc_z_mean": add_noise(profile.linacc_mean[2], 0.07),
        "gravity_x_mean": add_noise(profile.gravity_mean[0], 0.005),
        "gravity_y_mean": add_noise(profile.gravity_mean[1], 0.005),
        "gravity_z_mean": add_noise(profile.gravity_mean[2], 0.02),
        "acc_x_std": add_noise(profile.acc_std[0], 0.05),
        "acc_y_std": add_noise(profile.acc_std[1], 0.05),
        "acc_z_std": add_noise(profile.acc_std[2], 0.05),
        "gyro_x_std": add_noise(profile.gyro_std[0], 0.005),
        "gyro_y_std": add_noise(profile.gyro_std[1], 0.005),
        "gyro_z_std": add_noise(profile.gyro_std[2], 0.005),
        "rms_acc": add_noise(profile.rms_base, 0.1),
        "rms_gyro": add_noise(profile.rms_gyro_base, 0.02),
        "mean_freq_acc": round(add_noise(profile.mean_freq_acc, 1.5), 2),
        "mean_freq_gyro": round(add_noise(profile.mean_freq_gyro, 0.8), 2),
        "entropy_acc": add_noise(profile.entropy_acc, 0.05),
        "entropy_gyro": add_noise(profile.entropy_gyro, 0.05),
        "jerk_mean": add_noise(profile.jerk_mean, 0.02),
        "jerk_std": add_noise(profile.jerk_std, 0.01),
        "stability_index": bounded(add_noise(profile.stability_index, 0.03), 0.4, 0.99),
        "rms_base": profile.rms_base,
        "freq_base": profile.freq_base,
        # Copy so that rows do not alias the profile's list object.
        "user_emb": list(profile.user_emb),
        "fatigue_prev": round(prev_fatigue, 4),
        "fatigue": fatigue_rounded,
        "fatigue_level": fatigue_level,
        # Roughly 5% of records are flagged 0.
        "quality_flag": 1 if random.random() > 0.05 else 0,
        "window_size_ms": window_size_ms,
        # NOTE(review): an overlap of ~0.5 is reported although window starts
        # are spaced a full window apart - confirm which one is intended.
        # Rounded so float addition noise does not leak into the column.
        "overlap_rate": round(0.5 + round(random.uniform(-0.05, 0.05), 4), 4),
    }
    return record, fatigue
def generate_dataset_dict() -> DatasetDict:
    """Generate the full synthetic dataset, one split per user.

    For each of ``TOTAL_USERS`` users a profile is created and
    ``RECORDS_PER_USER`` consecutive records are generated, threading the
    fatigue value from one record into the next.

    Returns:
        A ``DatasetDict`` keyed by user id, each value holding that user's
        records.
    """
    datasets_by_user: Dict[str, Dataset] = {}
    # Timezone-aware "now"; datetime.utcnow() is deprecated and returns a
    # naive value.
    start_time = datetime.now(timezone.utc)

    for user_idx in range(1, TOTAL_USERS + 1):
        profile = generate_user_profile(user_idx, start_time)
        rows = []
        prev_fatigue = profile.fatigue_base
        # A plain loop is required: each record depends on the fatigue
        # returned by the previous one.
        for record_idx in range(RECORDS_PER_USER):
            record, prev_fatigue = random_record(profile, record_idx, prev_fatigue)
            rows.append(record)
        df = pd.DataFrame(rows)
        datasets_by_user[profile.user_id] = Dataset.from_pandas(df, preserve_index=False)

    return DatasetDict(datasets_by_user)
def main():
    """Generate the synthetic dataset and push it to the Hugging Face Hub.

    Reads ``HF_DATA_REPO_ID`` and ``HF_DATA_TOKEN`` from the environment
    (after loading a ``.env`` file, if any) and uploads the dataset as a
    private repository.

    Raises:
        RuntimeError: If either environment variable is missing or empty.
    """
    load_dotenv()
    # Fail before doing any generation work if credentials are missing.
    repo_id = require_env("HF_DATA_REPO_ID")
    token = require_env("HF_DATA_TOKEN")

    print(f"📦 Generating synthetic dataset: users={TOTAL_USERS}, records/user={RECORDS_PER_USER}")
    dataset_dict = generate_dataset_dict()
    total_records = sum(len(user_dataset) for user_dataset in dataset_dict.values())
    print(f"🔢 Total records: {total_records}")

    print(f"📤 Pushing DatasetDict ({len(dataset_dict)} users) to Hugging Face: {repo_id}")
    dataset_dict.push_to_hub(repo_id, token=token, private=True)
    print("✅ Upload complete")


if __name__ == "__main__":
    main()