# MuscleCare-FastAPI / generate_dataset.py
# Merry99's picture
# add generate_dataset
# a9f6dc6
import os
import random
from dataclasses import dataclass
from datetime import datetime, timezone, timedelta
from typing import Dict, List, Tuple
import pandas as pd
from datasets import Dataset, DatasetDict
from dotenv import load_dotenv
# Number of synthetic users to generate; each user becomes one entry of the
# resulting DatasetDict.
TOTAL_USERS = 50
# Number of consecutive window records generated for every user.
RECORDS_PER_USER = 50
# Length of the random embedding vector stored in each user's profile.
USER_EMB_DIM = 12
@dataclass
class UserProfile:
    """Per-user baseline values from which individual records are derived.

    ``generate_user_profile`` draws every field once per user, and
    ``random_record`` perturbs these baselines to build each row.
    """

    # --- identity and timing ---
    user_id: str
    session_prefix: str  # prefix of every "session_id" emitted for this user
    base_time: datetime  # time of window 0; later windows are offset from it

    # --- per-axis (x, y, z) baselines ---
    acc_mean: Tuple[float, float, float]
    gyro_mean: Tuple[float, float, float]
    linacc_mean: Tuple[float, float, float]
    gravity_mean: Tuple[float, float, float]  # z component is drawn near 1.0
    acc_std: Tuple[float, float, float]
    gyro_std: Tuple[float, float, float]

    # --- scalar feature baselines (noise is added per record) ---
    rms_base: float  # baseline for "rms_acc"; also copied verbatim to rows
    rms_gyro_base: float  # baseline for "rms_gyro"
    mean_freq_acc: float
    mean_freq_gyro: float
    entropy_acc: float
    entropy_gyro: float
    jerk_mean: float
    jerk_std: float
    stability_index: float

    # --- values copied unchanged into every record ---
    freq_base: float
    user_emb: List[float]  # USER_EMB_DIM random components

    # Fatigue value used as "previous fatigue" for the user's first record.
    fatigue_base: float
def require_env(var_name: str) -> str:
    """Return the value of a required environment variable.

    Args:
        var_name: Name of the environment variable to read.

    Returns:
        The variable's non-empty value.

    Raises:
        RuntimeError: If the variable is unset or set to an empty string.
    """
    value = os.getenv(var_name)
    if not value:
        # The message was stored as mojibake (UTF-8 bytes decoded with a
        # single-byte code page); this is the restored Korean text, meaning
        # "Environment variable {var_name} is required."
        raise RuntimeError(f"환경변수 {var_name}가 필요합니다.")
    return value
def random_vector(dim: int, scale: float = 1.0) -> List[float]:
    """Return ``dim`` random floats drawn uniformly between -scale and scale.

    Every component is rounded to four decimal places.
    """
    components: List[float] = []
    for _ in range(dim):
        sample = random.uniform(-scale, scale)
        components.append(round(sample, 4))
    return components
def generate_user_profile(user_idx: int, start_time: datetime) -> UserProfile:
    """Draw a random baseline profile for one synthetic user.

    Args:
        user_idx: Index used to build the id ``user_NNN`` (zero-padded to
            three digits).
        start_time: Reference time; the profile's ``base_time`` is this value
            plus a random offset of 0 to 5 minutes.

    Returns:
        A ``UserProfile`` whose numeric fields are uniform random draws.
    """

    def draw(low: float, high: float, digits: int = 4) -> float:
        # One uniform sample, rounded to ``digits`` decimal places.
        return round(random.uniform(low, high), digits)

    def draw_xyz(low: float, high: float) -> Tuple[float, float, float]:
        # Three independent samples, one per axis, drawn in x, y, z order.
        return (draw(low, high), draw(low, high), draw(low, high))

    user_id = f"user_{user_idx:03d}"

    # Keyword arguments are evaluated top to bottom, so this ordering fixes
    # the sequence in which values are pulled from the random generator.
    return UserProfile(
        user_id=user_id,
        session_prefix=f"{user_id}_session",
        base_time=start_time + timedelta(minutes=random.uniform(0, 5)),
        acc_mean=draw_xyz(-0.2, 0.2),
        gyro_mean=draw_xyz(-0.05, 0.05),
        linacc_mean=draw_xyz(-0.3, 0.3),
        # x and y stay near zero while z is drawn around 1.0.
        gravity_mean=(draw(-0.05, 0.05), draw(-0.05, 0.05), draw(0.9, 1.1)),
        acc_std=draw_xyz(0.2, 0.6),
        gyro_std=draw_xyz(0.02, 0.08),
        rms_base=draw(0.3, 1.0),
        rms_gyro_base=draw(0.05, 0.2),
        mean_freq_acc=draw(25, 55, 2),
        mean_freq_gyro=draw(10, 25, 2),
        entropy_acc=draw(0.3, 0.8),
        entropy_gyro=draw(0.3, 0.7),
        jerk_mean=draw(-0.2, 0.2),
        jerk_std=draw(0.02, 0.08),
        stability_index=draw(0.6, 0.95),
        freq_base=draw(30, 55, 2),
        user_emb=random_vector(USER_EMB_DIM, scale=0.5),
        fatigue_base=draw(0.25, 0.6),
    )
def add_noise(value: float, noise_scale: float) -> float:
    """Return ``value`` shifted by a uniform random offset.

    The offset is drawn between -noise_scale and noise_scale, and the sum is
    rounded to four decimal places.
    """
    jitter = random.uniform(-noise_scale, noise_scale)
    return round(value + jitter, 4)
def bounded(value: float, low: float, high: float) -> float:
    """Clamp ``value`` so that it lies between ``low`` and ``high``."""
    # Apply the upper limit first, then the lower one.
    capped = min(high, value)
    return max(low, capped)
def random_record(
    profile: UserProfile,
    record_idx: int,
    prev_fatigue: float,
) -> Tuple[dict, float]:
    """Build one synthetic window record for ``profile``.

    Args:
        profile: Baseline values of the user the record belongs to.
        record_idx: Zero-based window index; it determines the window's start
            and end offsets and its timestamp.
        prev_fatigue: Fatigue value of the previous record (or the profile's
            starting fatigue for the first record).

    Returns:
        ``(record, fatigue)``: the row as a dict, and the unrounded fatigue
        value to pass as ``prev_fatigue`` when generating the next record.
    """
    window_ms = 2000
    start_ms = record_idx * window_ms
    end_ms = start_ms + window_ms
    window_time = profile.base_time + timedelta(milliseconds=start_ms)

    # Fatigue moves by a uniform step in [-0.05, 0.1] from the previous value
    # and is clamped to [0.05, 0.95].
    step = random.uniform(-0.05, 0.1)
    fatigue = bounded(prev_fatigue + step, 0.05, 0.95)

    # Three discrete levels with thresholds at 0.3 and 0.6.
    if fatigue < 0.3:
        level = 0
    elif fatigue < 0.6:
        level = 1
    else:
        level = 2

    acc_mu = profile.acc_mean
    gyro_mu = profile.gyro_mean
    linacc_mu = profile.linacc_mean
    gravity_mu = profile.gravity_mean
    acc_sigma = profile.acc_std
    gyro_sigma = profile.gyro_std

    # The literal's order fixes both the column order and the order in which
    # random numbers are consumed, so entries must not be rearranged.
    record = {
        "user_id": profile.user_id,
        "session_id": f"{profile.session_prefix}_{record_idx:03d}",
        "window_id": record_idx,
        "window_start_ms": start_ms,
        "window_end_ms": end_ms,
        "timestamp_utc": window_time.replace(tzinfo=timezone.utc).isoformat(),
        "acc_x_mean": add_noise(acc_mu[0], 0.05),
        "acc_y_mean": add_noise(acc_mu[1], 0.05),
        "acc_z_mean": add_noise(acc_mu[2], 0.05),
        "gyro_x_mean": add_noise(gyro_mu[0], 0.01),
        "gyro_y_mean": add_noise(gyro_mu[1], 0.01),
        "gyro_z_mean": add_noise(gyro_mu[2], 0.01),
        "linacc_x_mean": add_noise(linacc_mu[0], 0.07),
        "linacc_y_mean": add_noise(linacc_mu[1], 0.07),
        "linacc_z_mean": add_noise(linacc_mu[2], 0.07),
        "gravity_x_mean": add_noise(gravity_mu[0], 0.005),
        "gravity_y_mean": add_noise(gravity_mu[1], 0.005),
        # The z component uses a wider noise band than x and y.
        "gravity_z_mean": add_noise(gravity_mu[2], 0.02),
        "acc_x_std": add_noise(acc_sigma[0], 0.05),
        "acc_y_std": add_noise(acc_sigma[1], 0.05),
        "acc_z_std": add_noise(acc_sigma[2], 0.05),
        "gyro_x_std": add_noise(gyro_sigma[0], 0.005),
        "gyro_y_std": add_noise(gyro_sigma[1], 0.005),
        "gyro_z_std": add_noise(gyro_sigma[2], 0.005),
        "rms_acc": add_noise(profile.rms_base, 0.1),
        "rms_gyro": add_noise(profile.rms_gyro_base, 0.02),
        # Frequencies are kept to two decimals, like their baselines.
        "mean_freq_acc": round(add_noise(profile.mean_freq_acc, 1.5), 2),
        "mean_freq_gyro": round(add_noise(profile.mean_freq_gyro, 0.8), 2),
        "entropy_acc": add_noise(profile.entropy_acc, 0.05),
        "entropy_gyro": add_noise(profile.entropy_gyro, 0.05),
        "jerk_mean": add_noise(profile.jerk_mean, 0.02),
        "jerk_std": add_noise(profile.jerk_std, 0.01),
        "stability_index": bounded(
            add_noise(profile.stability_index, 0.03), 0.4, 0.99
        ),
        "rms_base": profile.rms_base,
        "freq_base": profile.freq_base,
        "user_emb": profile.user_emb,
        "fatigue_prev": round(prev_fatigue, 4),
        "fatigue": round(fatigue, 4),
        "fatigue_level": level,
        # 1 unless the draw falls in the lowest 5% of [0, 1).
        "quality_flag": int(random.random() > 0.05),
        "window_size_ms": window_ms,
        "overlap_rate": 0.5 + round(random.uniform(-0.05, 0.05), 4),
    }
    return record, fatigue
def generate_dataset_dict() -> DatasetDict:
    """Generate the complete synthetic dataset, keyed by user id.

    For each of ``TOTAL_USERS`` users a random profile is drawn, then
    ``RECORDS_PER_USER`` records are generated in sequence, with each record's
    fatigue feeding the next one.

    Returns:
        A ``DatasetDict`` mapping every user id to a ``Dataset`` of that
        user's records.
    """
    # ``datetime.utcnow()`` is deprecated since Python 3.12. Take an aware
    # UTC "now" and drop the tzinfo so profiles keep a naive UTC base time,
    # exactly as before.
    start_time = datetime.now(timezone.utc).replace(tzinfo=None)

    datasets_by_user: Dict[str, Dataset] = {}
    for user_idx in range(1, TOTAL_USERS + 1):
        profile = generate_user_profile(user_idx, start_time)
        rows = []
        # The profile's starting fatigue seeds the first record.
        prev_fatigue = profile.fatigue_base
        for record_idx in range(RECORDS_PER_USER):
            record, prev_fatigue = random_record(profile, record_idx, prev_fatigue)
            rows.append(record)
        df = pd.DataFrame(rows)
        datasets_by_user[profile.user_id] = Dataset.from_pandas(df, preserve_index=False)
    return DatasetDict(datasets_by_user)
def main():
    """Generate the synthetic dataset and push it to the Hugging Face Hub.

    Calls ``load_dotenv()`` and then reads the target repository id from
    ``HF_DATA_REPO_ID`` and the access token from ``HF_DATA_TOKEN``. The
    dataset is pushed with ``private=True``.

    Raises:
        RuntimeError: If either environment variable is missing or empty.
    """
    load_dotenv()
    repo_id = require_env("HF_DATA_REPO_ID")
    token = require_env("HF_DATA_TOKEN")

    # The emoji prefixes below were stored as mojibake (UTF-8 bytes decoded
    # with a single-byte code page) and are restored to the intended
    # characters.
    print(f"📦 Generating synthetic dataset: users={TOTAL_USERS}, records/user={RECORDS_PER_USER}")
    dataset_dict = generate_dataset_dict()

    total_records = sum(len(split) for split in dataset_dict.values())
    print(f"🔢 Total records: {total_records}")

    print(f"📤 Pushing DatasetDict ({len(dataset_dict)} users) to Hugging Face: {repo_id}")
    dataset_dict.push_to_hub(repo_id, token=token, private=True)
    print("✅ Upload complete")
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()