"""Generate a synthetic per-user feature dataset and push it to the Hugging Face Hub.

Each user gets a randomly drawn ``UserProfile`` (baseline values) and a
sequence of records whose values are the baselines plus uniform noise, along
with a bounded random-walk ``fatigue`` score.  One ``Dataset`` is built per
user and all of them are uploaded together as a ``DatasetDict``.

Required environment variables (a ``.env`` file is honoured):
    HF_DATA_REPO_ID  target dataset repository
    HF_DATA_TOKEN    access token used for the upload
"""

import os
import random
from dataclasses import dataclass
from datetime import datetime, timezone, timedelta
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

import pandas as pd

# NOTE: ``datasets`` and ``dotenv`` are imported inside the functions that use
# them so the pure generators below stay importable without the upload stack.
if TYPE_CHECKING:
    from datasets import Dataset, DatasetDict

TOTAL_USERS = 50
RECORDS_PER_USER = 50
USER_EMB_DIM = 12
WINDOW_SIZE_MS = 2000

# (profile attribute, record-key sensor prefix, record-key stat suffix,
#  per-axis noise scale).  Order matters: it fixes both the column order of the
# generated records and the order in which random numbers are consumed.
_AXIS_FEATURES = (
    ("acc_mean", "acc", "mean", (0.05, 0.05, 0.05)),
    ("gyro_mean", "gyro", "mean", (0.01, 0.01, 0.01)),
    ("linacc_mean", "linacc", "mean", (0.07, 0.07, 0.07)),
    ("gravity_mean", "gravity", "mean", (0.005, 0.005, 0.02)),
    ("acc_std", "acc", "std", (0.05, 0.05, 0.05)),
    ("gyro_std", "gyro", "std", (0.005, 0.005, 0.005)),
)


@dataclass
class UserProfile:
    # Identity and time origin of the user's records.
    user_id: str
    session_prefix: str
    base_time: datetime
    # Per-axis (x, y, z) baselines; each record jitters around these.
    acc_mean: Tuple[float, float, float]
    gyro_mean: Tuple[float, float, float]
    linacc_mean: Tuple[float, float, float]
    gravity_mean: Tuple[float, float, float]
    acc_std: Tuple[float, float, float]
    gyro_std: Tuple[float, float, float]
    # Scalar baselines; each record jitters around these.
    rms_base: float
    rms_gyro_base: float
    mean_freq_acc: float
    mean_freq_gyro: float
    entropy_acc: float
    entropy_gyro: float
    jerk_mean: float
    jerk_std: float
    stability_index: float
    # Copied verbatim into every record of the user.
    freq_base: float
    user_emb: List[float]
    # Starting value of the fatigue random walk.
    fatigue_base: float


def require_env(var_name: str) -> str:
    """Return the value of environment variable ``var_name``.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.getenv(var_name)
    if not value:
        raise RuntimeError(f"환경변수 {var_name}가 필요합니다.")
    return value


def random_vector(dim: int, scale: float = 1.0) -> List[float]:
    """Return ``dim`` uniform samples from ``[-scale, scale]``, rounded to 4 decimals."""
    return [round(random.uniform(-scale, scale), 4) for _ in range(dim)]


def generate_user_profile(user_idx: int, start_time: datetime) -> UserProfile:
    """Draw a random baseline profile for one user.

    Args:
        user_idx: Number used to build the id ``user_NNN`` (zero-padded to 3).
        start_time: Reference time; the profile's ``base_time`` is offset from
            it by a random 0-5 minutes.

    Returns:
        A freshly sampled ``UserProfile``.
    """
    user_id = f"user_{user_idx:03d}"

    def uniform4(low: float, high: float) -> float:
        return round(random.uniform(low, high), 4)

    def triple(base_scale: float) -> Tuple[float, float, float]:
        return (
            uniform4(-base_scale, base_scale),
            uniform4(-base_scale, base_scale),
            uniform4(-base_scale, base_scale),
        )

    def positive_triple(low: float, high: float) -> Tuple[float, float, float]:
        return (uniform4(low, high), uniform4(low, high), uniform4(low, high))

    # Keyword arguments are evaluated top to bottom, so this order is also the
    # order in which the random stream is consumed.
    return UserProfile(
        user_id=user_id,
        session_prefix=f"{user_id}_session",
        base_time=start_time + timedelta(minutes=random.uniform(0, 5)),
        acc_mean=triple(0.2),
        gyro_mean=triple(0.05),
        linacc_mean=triple(0.3),
        gravity_mean=(
            uniform4(-0.05, 0.05),
            uniform4(-0.05, 0.05),
            uniform4(0.9, 1.1),
        ),
        acc_std=positive_triple(0.2, 0.6),
        gyro_std=positive_triple(0.02, 0.08),
        rms_base=uniform4(0.3, 1.0),
        rms_gyro_base=uniform4(0.05, 0.2),
        mean_freq_acc=round(random.uniform(25, 55), 2),
        mean_freq_gyro=round(random.uniform(10, 25), 2),
        entropy_acc=uniform4(0.3, 0.8),
        entropy_gyro=uniform4(0.3, 0.7),
        jerk_mean=uniform4(-0.2, 0.2),
        jerk_std=uniform4(0.02, 0.08),
        stability_index=uniform4(0.6, 0.95),
        freq_base=round(random.uniform(30, 55), 2),
        user_emb=random_vector(USER_EMB_DIM, scale=0.5),
        fatigue_base=uniform4(0.25, 0.6),
    )


def add_noise(value: float, noise_scale: float) -> float:
    """Return ``value`` plus uniform noise in ``[-noise_scale, noise_scale]``, rounded to 4 decimals."""
    return round(value + random.uniform(-noise_scale, noise_scale), 4)


def bounded(value: float, low: float, high: float) -> float:
    """Clamp ``value`` into the closed interval ``[low, high]``."""
    return max(low, min(high, value))


def _fatigue_level(fatigue: float) -> int:
    """Bucket a fatigue score: 0 below 0.3, 1 below 0.6, otherwise 2."""
    if fatigue < 0.3:
        return 0
    if fatigue < 0.6:
        return 1
    return 2


def _as_utc(moment: datetime) -> datetime:
    """Return ``moment`` as an aware UTC datetime.

    Naive values are taken to already be in UTC (they are only tagged);
    aware values are converted, so a non-UTC offset is never mislabelled.
    """
    if moment.utcoffset() is None:
        return moment.replace(tzinfo=timezone.utc)
    return moment.astimezone(timezone.utc)


def random_record(
    profile: UserProfile,
    record_idx: int,
    prev_fatigue: float,
) -> Tuple[dict, float]:
    """Build one noisy record for ``profile``.

    Args:
        profile: Baselines of the user the record belongs to.
        record_idx: Zero-based window index; the window starts at
            ``record_idx * WINDOW_SIZE_MS`` after ``profile.base_time``.
        prev_fatigue: Fatigue of the previous record (or the profile's
            ``fatigue_base`` for the first one).

    Returns:
        ``(record, fatigue)`` where ``fatigue`` is the unrounded new fatigue
        value to feed into the next call.
    """
    window_start_ms = record_idx * WINDOW_SIZE_MS
    window_end_ms = window_start_ms + WINDOW_SIZE_MS
    window_time = _as_utc(profile.base_time + timedelta(milliseconds=window_start_ms))

    # Fatigue is a random walk with a slight upward drift, clamped to [0.05, 0.95].
    fatigue = bounded(prev_fatigue + random.uniform(-0.05, 0.1), 0.05, 0.95)

    record = {
        "user_id": profile.user_id,
        "session_id": f"{profile.session_prefix}_{record_idx:03d}",
        "window_id": record_idx,
        "window_start_ms": window_start_ms,
        "window_end_ms": window_end_ms,
        "timestamp_utc": window_time.isoformat(),
    }

    # Per-axis features: <sensor>_<axis>_<stat>, e.g. "acc_x_mean".
    for attr, sensor, stat, noise_scales in _AXIS_FEATURES:
        baselines = getattr(profile, attr)
        for axis, baseline, noise_scale in zip("xyz", baselines, noise_scales):
            record[f"{sensor}_{axis}_{stat}"] = add_noise(baseline, noise_scale)

    record.update(
        {
            "rms_acc": add_noise(profile.rms_base, 0.1),
            "rms_gyro": add_noise(profile.rms_gyro_base, 0.02),
            "mean_freq_acc": round(add_noise(profile.mean_freq_acc, 1.5), 2),
            "mean_freq_gyro": round(add_noise(profile.mean_freq_gyro, 0.8), 2),
            "entropy_acc": add_noise(profile.entropy_acc, 0.05),
            "entropy_gyro": add_noise(profile.entropy_gyro, 0.05),
            "jerk_mean": add_noise(profile.jerk_mean, 0.02),
            "jerk_std": add_noise(profile.jerk_std, 0.01),
            "stability_index": bounded(add_noise(profile.stability_index, 0.03), 0.4, 0.99),
            "rms_base": profile.rms_base,
            "freq_base": profile.freq_base,
            # Copy so records do not alias the profile's (mutable) list.
            "user_emb": list(profile.user_emb),
            "fatigue_prev": round(prev_fatigue, 4),
            "fatigue": round(fatigue, 4),
            "fatigue_level": _fatigue_level(fatigue),
            # Roughly 5% of records are flagged 0.
            "quality_flag": 1 if random.random() > 0.05 else 0,
            "window_size_ms": WINDOW_SIZE_MS,
            # Rounded like every other field (avoids values such as 0.5123000000000001).
            "overlap_rate": add_noise(0.5, 0.05),
        }
    )
    return record, fatigue


def _generate_user_rows(profile: UserProfile, records_per_user: int) -> List[dict]:
    """Generate ``records_per_user`` consecutive records, chaining the fatigue value."""
    rows: List[dict] = []
    prev_fatigue = profile.fatigue_base
    for record_idx in range(records_per_user):
        record, prev_fatigue = random_record(profile, record_idx, prev_fatigue)
        rows.append(record)
    return rows


def generate_dataset_dict(
    total_users: Optional[int] = None,
    records_per_user: Optional[int] = None,
    seed: Optional[int] = None,
) -> "DatasetDict":
    """Build a ``DatasetDict`` with one split per synthetic user.

    Args:
        total_users: Number of users; defaults to ``TOTAL_USERS``.
        records_per_user: Records per user; defaults to ``RECORDS_PER_USER``.
        seed: If given, seeds the global ``random`` module so the sampled
            values are reproducible.  Timestamps still derive from the
            current time.

    Returns:
        A ``DatasetDict`` keyed by user id (``user_001``, ``user_002``, ...).

    Raises:
        ValueError: if either size is smaller than 1.
    """
    if total_users is None:
        total_users = TOTAL_USERS
    if records_per_user is None:
        records_per_user = RECORDS_PER_USER
    if total_users < 1 or records_per_user < 1:
        raise ValueError("total_users and records_per_user must both be >= 1")

    from datasets import Dataset, DatasetDict

    if seed is not None:
        random.seed(seed)

    # Aware UTC "now"; datetime.utcnow() is deprecated and returns a naive value.
    start_time = datetime.now(timezone.utc)

    datasets_by_user: Dict[str, "Dataset"] = {}
    for user_idx in range(1, total_users + 1):
        profile = generate_user_profile(user_idx, start_time)
        df = pd.DataFrame(_generate_user_rows(profile, records_per_user))
        datasets_by_user[profile.user_id] = Dataset.from_pandas(df, preserve_index=False)
    return DatasetDict(datasets_by_user)


def main():
    """Generate the dataset and push it to the repository named in the environment."""
    from dotenv import load_dotenv

    load_dotenv()
    repo_id = require_env("HF_DATA_REPO_ID")
    token = require_env("HF_DATA_TOKEN")

    print(f"📦 Generating synthetic dataset: users={TOTAL_USERS}, records/user={RECORDS_PER_USER}")
    dataset_dict = generate_dataset_dict()

    total_records = sum(len(dataset_dict[user_id]) for user_id in dataset_dict)
    print(f"🔢 Total records: {total_records}")

    print(f"📤 Pushing DatasetDict ({len(dataset_dict)} users) to Hugging Face: {repo_id}")
    dataset_dict.push_to_hub(repo_id, token=token, private=True)
    print("✅ Upload complete")


if __name__ == "__main__":
    main()