Spaces:

Merry99
/

MuscleCare-FastAPI

Sleeping

App Files Files Community

Merry99 committed on Nov 8, 2025

Commit

a9f6dc6

1 Parent(s): a811cc3

add generate_dataset

Browse files

Files changed (1) hide show

generate_dataset.py +197 -0

generate_dataset.py ADDED Viewed

	@@ -0,0 +1,197 @@

+import os
+import random
+from dataclasses import dataclass
+from datetime import datetime, timezone, timedelta
+from typing import Dict, List, Tuple
+import pandas as pd
+from datasets import Dataset, DatasetDict
+from dotenv import load_dotenv
# Number of synthetic users; one Dataset is generated per user.
TOTAL_USERS: int = 50
# Number of consecutive window records generated for each user.
RECORDS_PER_USER: int = 50
# Length of the random embedding vector attached to every user profile.
USER_EMB_DIM: int = 12
@dataclass
class UserProfile:
    """Per-user baseline values from which window records are derived.

    A profile is drawn once per user by ``generate_user_profile``;
    ``random_record`` then jitters these baselines to produce each record.
    Field order is part of the interface (positional construction).
    """

    # --- identity / timing -------------------------------------------------
    user_id: str
    session_prefix: str  # prefix of every ``session_id`` emitted for the user
    base_time: datetime  # timestamp of the user's first window

    # --- per-axis baselines, indexed (x, y, z) -----------------------------
    acc_mean: Tuple[float, float, float]
    gyro_mean: Tuple[float, float, float]
    linacc_mean: Tuple[float, float, float]
    gravity_mean: Tuple[float, float, float]
    acc_std: Tuple[float, float, float]
    gyro_std: Tuple[float, float, float]

    # --- scalar signal-feature baselines -----------------------------------
    rms_base: float  # baseline for ``rms_acc``; also emitted unchanged
    rms_gyro_base: float
    mean_freq_acc: float
    mean_freq_gyro: float
    entropy_acc: float
    entropy_gyro: float
    jerk_mean: float
    jerk_std: float
    stability_index: float
    freq_base: float  # emitted unchanged in every record

    # --- user-level constants ----------------------------------------------
    user_emb: List[float]  # emitted unchanged in every record
    fatigue_base: float  # fatigue value preceding the first record
def require_env(var_name: str) -> str:
    """Return the value of the environment variable ``var_name``.

    Raises:
        RuntimeError: if the variable is unset or set to an empty string.
    """
    found = os.environ.get(var_name)
    if found:
        return found
    raise RuntimeError(f"환경변수 {var_name}가 필요합니다.")
def random_vector(dim: int, scale: float = 1.0) -> List[float]:
    """Return ``dim`` floats drawn uniformly from ``[-scale, scale]``.

    Each component is rounded to 4 decimal places. The module-level
    ``random`` generator is used, so results follow ``random.seed``.
    """
    components: List[float] = []
    for _ in range(dim):
        sample = random.uniform(-scale, scale)
        components.append(round(sample, 4))
    return components
def generate_user_profile(user_idx: int, start_time: datetime) -> UserProfile:
    """Draw a random baseline profile for the user numbered ``user_idx``.

    Args:
        user_idx: Number embedded, zero-padded to three digits, in the user id.
        start_time: Reference time; the profile's ``base_time`` is this plus
            a random offset of 0 to 5 minutes.

    Returns:
        A ``UserProfile`` whose numeric fields are uniform random draws from
        fixed per-field ranges.
    """
    uid = f"user_{user_idx:03d}"

    def draw(low: float, high: float, digits: int = 4) -> float:
        # One uniform sample from [low, high], rounded to ``digits`` places.
        return round(random.uniform(low, high), digits)

    def draw3(low: float, high: float) -> Tuple[float, float, float]:
        # Three independent samples from the same range, one per axis.
        return (draw(low, high), draw(low, high), draw(low, high))

    # Samples are drawn in field order so that a seeded run yields the same
    # profile regardless of how the values are later assembled.
    first_window_time = start_time + timedelta(minutes=random.uniform(0, 5))
    acc_mean = draw3(-0.2, 0.2)
    gyro_mean = draw3(-0.05, 0.05)
    linacc_mean = draw3(-0.3, 0.3)
    # The third gravity component is centred on 1 rather than 0.
    gravity_mean = (draw(-0.05, 0.05), draw(-0.05, 0.05), draw(0.9, 1.1))
    acc_std = draw3(0.2, 0.6)
    gyro_std = draw3(0.02, 0.08)
    rms_base = draw(0.3, 1.0)
    rms_gyro_base = draw(0.05, 0.2)
    mean_freq_acc = draw(25, 55, 2)
    mean_freq_gyro = draw(10, 25, 2)
    entropy_acc = draw(0.3, 0.8)
    entropy_gyro = draw(0.3, 0.7)
    jerk_mean = draw(-0.2, 0.2)
    jerk_std = draw(0.02, 0.08)
    stability_index = draw(0.6, 0.95)
    freq_base = draw(30, 55, 2)
    user_emb = random_vector(USER_EMB_DIM, scale=0.5)
    fatigue_base = draw(0.25, 0.6)

    return UserProfile(
        user_id=uid,
        session_prefix=f"{uid}_session",
        base_time=first_window_time,
        acc_mean=acc_mean,
        gyro_mean=gyro_mean,
        linacc_mean=linacc_mean,
        gravity_mean=gravity_mean,
        acc_std=acc_std,
        gyro_std=gyro_std,
        rms_base=rms_base,
        rms_gyro_base=rms_gyro_base,
        mean_freq_acc=mean_freq_acc,
        mean_freq_gyro=mean_freq_gyro,
        entropy_acc=entropy_acc,
        entropy_gyro=entropy_gyro,
        jerk_mean=jerk_mean,
        jerk_std=jerk_std,
        stability_index=stability_index,
        freq_base=freq_base,
        user_emb=user_emb,
        fatigue_base=fatigue_base,
    )
def add_noise(value: float, noise_scale: float) -> float:
    """Return ``value`` shifted by uniform noise in ``[-noise_scale, noise_scale]``.

    The result is rounded to 4 decimal places.
    """
    jitter = random.uniform(-noise_scale, noise_scale)
    return round(value + jitter, 4)
def bounded(value: float, low: float, high: float) -> float:
    """Clamp ``value`` into the closed interval ``[low, high]``.

    The upper bound is applied first and the lower bound second, so ``low``
    is returned if the bounds are passed in inverted order.
    """
    capped = value if value < high else high
    return capped if capped > low else low
def random_record(
    profile: UserProfile,
    record_idx: int,
    prev_fatigue: float,
    window_size_ms: int = 2000,
) -> Tuple[dict, float]:
    """Generate one synthetic window record for ``profile``.

    Args:
        profile: Baseline values for the user; each feature is the matching
            baseline plus uniform noise.
        record_idx: Zero-based window number; it determines the window's
            start/end offsets and its timestamp relative to
            ``profile.base_time``.
        prev_fatigue: Fatigue value of the preceding window (or the user's
            ``fatigue_base`` for the first window).
        window_size_ms: Length of one window in milliseconds. Defaults to
            2000, the value previously hard-coded.

    Returns:
        ``(record, fatigue)`` where ``record`` is a flat dict of columns and
        ``fatigue`` is the unrounded fatigue of this window, to be passed as
        ``prev_fatigue`` for the next one.
    """
    window_start_ms = record_idx * window_size_ms
    window_end_ms = window_start_ms + window_size_ms

    window_time = profile.base_time + timedelta(milliseconds=window_start_ms)
    if window_time.tzinfo is None:
        # Naive base times are taken to already be in UTC; just label them.
        window_time = window_time.replace(tzinfo=timezone.utc)
    else:
        # Aware base times are converted; ``replace`` would silently relabel
        # a non-UTC wall-clock time as UTC.
        window_time = window_time.astimezone(timezone.utc)

    # Fatigue is a bounded random walk with a slight upward drift.
    fatigue_delta = random.uniform(-0.05, 0.1)
    fatigue = bounded(prev_fatigue + fatigue_delta, 0.05, 0.95)
    if fatigue < 0.3:
        fatigue_level = 0
    elif fatigue < 0.6:
        fatigue_level = 1
    else:
        fatigue_level = 2

    record = {
        "user_id": profile.user_id,
        "session_id": f"{profile.session_prefix}_{record_idx:03d}",
        "window_id": record_idx,
        "window_start_ms": window_start_ms,
        "window_end_ms": window_end_ms,
        "timestamp_utc": window_time.isoformat(),
        "acc_x_mean": add_noise(profile.acc_mean[0], 0.05),
        "acc_y_mean": add_noise(profile.acc_mean[1], 0.05),
        "acc_z_mean": add_noise(profile.acc_mean[2], 0.05),
        "gyro_x_mean": add_noise(profile.gyro_mean[0], 0.01),
        "gyro_y_mean": add_noise(profile.gyro_mean[1], 0.01),
        "gyro_z_mean": add_noise(profile.gyro_mean[2], 0.01),
        "linacc_x_mean": add_noise(profile.linacc_mean[0], 0.07),
        "linacc_y_mean": add_noise(profile.linacc_mean[1], 0.07),
        "linacc_z_mean": add_noise(profile.linacc_mean[2], 0.07),
        "gravity_x_mean": add_noise(profile.gravity_mean[0], 0.005),
        "gravity_y_mean": add_noise(profile.gravity_mean[1], 0.005),
        "gravity_z_mean": add_noise(profile.gravity_mean[2], 0.02),
        "acc_x_std": add_noise(profile.acc_std[0], 0.05),
        "acc_y_std": add_noise(profile.acc_std[1], 0.05),
        "acc_z_std": add_noise(profile.acc_std[2], 0.05),
        "gyro_x_std": add_noise(profile.gyro_std[0], 0.005),
        "gyro_y_std": add_noise(profile.gyro_std[1], 0.005),
        "gyro_z_std": add_noise(profile.gyro_std[2], 0.005),
        "rms_acc": add_noise(profile.rms_base, 0.1),
        "rms_gyro": add_noise(profile.rms_gyro_base, 0.02),
        "mean_freq_acc": round(add_noise(profile.mean_freq_acc, 1.5), 2),
        "mean_freq_gyro": round(add_noise(profile.mean_freq_gyro, 0.8), 2),
        "entropy_acc": add_noise(profile.entropy_acc, 0.05),
        "entropy_gyro": add_noise(profile.entropy_gyro, 0.05),
        "jerk_mean": add_noise(profile.jerk_mean, 0.02),
        "jerk_std": add_noise(profile.jerk_std, 0.01),
        "stability_index": bounded(add_noise(profile.stability_index, 0.03), 0.4, 0.99),
        "rms_base": profile.rms_base,
        "freq_base": profile.freq_base,
        # Copy so that records do not all alias the profile's mutable list.
        "user_emb": list(profile.user_emb),
        "fatigue_prev": round(prev_fatigue, 4),
        "fatigue": round(fatigue, 4),
        "fatigue_level": fatigue_level,
        # Roughly 5% of windows are flagged 0.
        "quality_flag": 1 if random.random() > 0.05 else 0,
        "window_size_ms": window_size_ms,
        # Round after the addition so the stored value has at most 4
        # decimals, like every other float column.
        "overlap_rate": round(0.5 + random.uniform(-0.05, 0.05), 4),
    }
    return record, fatigue
def generate_dataset_dict() -> DatasetDict:
    """Build the full synthetic corpus as a ``DatasetDict`` keyed by user id.

    For each of ``TOTAL_USERS`` users a profile is drawn and
    ``RECORDS_PER_USER`` consecutive records are generated, each record's
    fatigue feeding into the next as ``prev_fatigue``.

    Returns:
        A ``DatasetDict`` mapping each ``user_id`` to that user's ``Dataset``.
    """
    # ``datetime.utcnow()`` is deprecated since Python 3.12 and returns a
    # naive value; take an explicitly UTC-aware "now" instead.
    start_time = datetime.now(timezone.utc)

    datasets_by_user: Dict[str, Dataset] = {}
    for user_idx in range(1, TOTAL_USERS + 1):
        profile = generate_user_profile(user_idx, start_time)

        rows: List[dict] = []
        fatigue = profile.fatigue_base
        for record_idx in range(RECORDS_PER_USER):
            record, fatigue = random_record(profile, record_idx, fatigue)
            rows.append(record)

        frame = pd.DataFrame(rows)
        datasets_by_user[profile.user_id] = Dataset.from_pandas(frame, preserve_index=False)
    return DatasetDict(datasets_by_user)
def main():
    """Generate the synthetic dataset and upload it to the Hugging Face Hub.

    Reads ``HF_DATA_REPO_ID`` and ``HF_DATA_TOKEN`` from the environment
    (after loading a ``.env`` file) and pushes the result as a private repo.

    Raises:
        RuntimeError: if either environment variable is missing or empty.
    """
    load_dotenv()
    # Validate configuration before spending time generating data.
    repo_id = require_env("HF_DATA_REPO_ID")
    hub_token = require_env("HF_DATA_TOKEN")

    print(f"📦 Generating synthetic dataset: users={TOTAL_USERS}, records/user={RECORDS_PER_USER}")
    dataset_dict = generate_dataset_dict()

    total_records = 0
    for user_dataset in dataset_dict.values():
        total_records += len(user_dataset)
    print(f"🔢 Total records: {total_records}")

    print(f"📤 Pushing DatasetDict ({len(dataset_dict)} users) to Hugging Face: {repo_id}")
    dataset_dict.push_to_hub(repo_id, token=hub_token, private=True)
    print("✅ Upload complete")
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()