Spaces:

Merry99
/

MuscleCare-FastAPI

Sleeping

App Files Files Community

Merry99 committed on Nov 22, 2025

Commit

500a872

1 Parent(s): e0a5f34

Update augment_dataset.py: Generate 20 new users with 500 records each, compatible with dataset commit fa41e8b

Browse files

Files changed (1) hide show

augment_dataset.py +384 -0

augment_dataset.py ADDED Viewed

	@@ -0,0 +1,384 @@

+import os
+import random
+import json
+from datetime import datetime, timezone, timedelta
+from typing import Dict, List, Optional
+import pandas as pd
+import numpy as np
+from datasets import Dataset, DatasetDict, load_dataset
+from huggingface_hub import HfApi
+from dotenv import load_dotenv
+TARGET_USERS = 20
+RECORDS_PER_USER = 500
def require_env(var_name: str) -> str:
    """Return the value of a required environment variable.

    Args:
        var_name: Name of the environment variable to read.

    Returns:
        The variable's non-empty value.

    Raises:
        RuntimeError: If the variable is unset or set to an empty string.
    """
    configured = os.environ.get(var_name)
    if configured:
        return configured
    raise RuntimeError(f"환경변수 {var_name}가 필요합니다.")
def add_noise(value: float, noise_scale: float) -> float:
    """Perturb a value with uniform noise.

    Args:
        value: Number to perturb, or None.
        noise_scale: Half-width of the uniform interval the offset is drawn
            from, i.e. the offset lies in [-noise_scale, noise_scale].

    Returns:
        The perturbed value rounded to 4 decimal places, or None when
        ``value`` is None.
    """
    if value is not None:
        jitter = random.uniform(-noise_scale, noise_scale)
        return round(value + jitter, 4)
    return None
def bounded(value: Optional[float], low: float, high: float) -> Optional[float]:
    """Clamp a value to the closed interval [low, high].

    Missing values are passed through untouched: None stays None and NaN
    stays NaN. Without the NaN guard, ``max(low, min(high, nan))`` evaluates
    to ``high`` (every comparison against NaN is False), which would silently
    turn a missing measurement into the upper bound.

    Args:
        value: Number to clamp, or None / NaN for a missing value.
        low: Lower bound of the allowed range.
        high: Upper bound of the allowed range.

    Returns:
        ``value`` limited to [low, high], or ``value`` itself when it is
        None or NaN.
    """
    if value is None:
        return None
    # NaN is the only value that is not equal to itself.
    if value != value:
        return value
    return max(low, min(high, value))
def _is_missing(value) -> bool:
    """Return True for None and for float NaN (e.g. a missing cell read from a DataFrame row)."""
    if value is None:
        return True
    return isinstance(value, (float, np.floating)) and bool(np.isnan(value))


def augment_record(original: dict, noise_scale: float = 0.1) -> dict:
    """Create a jittered copy of a single sensor-window record.

    Every known field that is present and holds a real value is perturbed by
    a small random amount; fields that are absent, None or NaN are left
    exactly as they are, so missing measurements are never turned into
    fabricated numbers. The input dict is not modified.

    Args:
        original: Source record (field name -> value).
        noise_scale: Accepted for interface compatibility. NOTE: no
            per-field noise magnitude below is derived from it, so it
            currently has no effect on the output.

    Returns:
        A new dict with the same keys as ``original`` (plus ``fatigue_level``
        whenever ``fatigue`` is recomputed) holding the perturbed values.
    """
    augmented = original.copy()

    def has(field: str) -> bool:
        # A field is usable only when it exists and is neither None nor NaN.
        return not _is_missing(augmented.get(field))

    def value_or(field: str, default: float) -> float:
        # Mirrors `augmented.get(field) or default`, additionally treating NaN as missing.
        value = augmented.get(field)
        if _is_missing(value) or not value:
            return default
        return value

    # Timestamp: shift by up to +/-200 ms. Only ISO-8601 strings are handled;
    # anything else (or an unparseable string) is left untouched.
    stamp = augmented.get("timestamp_utc")
    if isinstance(stamp, str) and stamp:
        try:
            base_time = datetime.fromisoformat(stamp.replace("Z", "+00:00"))
            shift = timedelta(milliseconds=random.randint(-200, 200))
            augmented["timestamp_utc"] = (base_time + shift).isoformat()
        except (ValueError, OverflowError):
            pass

    # Window bookkeeping: nudge the id and start, then re-derive the end.
    if has("window_id"):
        augmented["window_id"] = augmented["window_id"] + random.randint(-1, 1)
    if has("window_start_ms"):
        augmented["window_start_ms"] = augmented["window_start_ms"] + random.randint(-50, 50)
        if "window_end_ms" in augmented:
            # Fixed 2000 ms window length (intended to match window_size_ms).
            augmented["window_end_ms"] = augmented["window_start_ms"] + 2000

    # Accelerometer means: noise proportional to the magnitude, with a small floor.
    for field in ("acc_x_mean", "acc_y_mean", "acc_z_mean"):
        if has(field):
            current = augmented[field]
            augmented[field] = add_noise(current, abs(current) * 0.1 + 0.01)

    # Gyroscope means: small fixed-width noise.
    for field in ("gyro_x_mean", "gyro_y_mean", "gyro_z_mean"):
        if has(field):
            augmented[field] = add_noise(augmented[field], 0.005)

    # Linear-acceleration means: same scheme as the accelerometer.
    for field in ("linacc_x_mean", "linacc_y_mean", "linacc_z_mean"):
        if has(field):
            current = augmented[field]
            augmented[field] = add_noise(current, abs(current) * 0.1 + 0.01)

    # Gravity vector: jitter each axis, then rescale so the magnitude is 9.8.
    if all(has(f) for f in ("gravity_x_mean", "gravity_y_mean", "gravity_z_mean")):
        gx = augmented["gravity_x_mean"] + random.uniform(-0.01, 0.01)
        gy = augmented["gravity_y_mean"] + random.uniform(-0.01, 0.01)
        gz = augmented["gravity_z_mean"] + random.uniform(-0.02, 0.02)
        g_mag = np.sqrt(gx**2 + gy**2 + gz**2)
        if g_mag > 0:
            scale = 9.8 / g_mag
            augmented["gravity_x_mean"] = round(gx * scale, 4)
            augmented["gravity_y_mean"] = round(gy * scale, 4)
            augmented["gravity_z_mean"] = round(gz * scale, 4)

    # Sensor standard deviations: +/-10 % relative noise, clamped to [0.01, 1.0].
    sensor_std_fields = (
        "acc_x_std", "acc_y_std", "acc_z_std",
        "gyro_x_std", "gyro_y_std", "gyro_z_std",
    )
    for field in sensor_std_fields:
        if has(field):
            current = augmented[field]
            augmented[field] = bounded(add_noise(current, current * 0.1), 0.01, 1.0)

    # RMS values: +/-10 % relative noise, clamped to a per-sensor range.
    if has("rms_acc"):
        rms = augmented["rms_acc"]
        augmented["rms_acc"] = bounded(add_noise(rms, rms * 0.1), 0.1, 2.0)
    if has("rms_gyro"):
        rms = augmented["rms_gyro"]
        augmented["rms_gyro"] = bounded(add_noise(rms, rms * 0.1), 0.01, 0.5)

    # Mean frequencies: noise is applied in a space scaled by the RMS, so a
    # larger RMS yields a slightly smaller effective frequency perturbation.
    if has("mean_freq_acc"):
        freq_factor = 1.0 + value_or("rms_acc", 0) * 0.1
        augmented["mean_freq_acc"] = round(
            add_noise(augmented["mean_freq_acc"] * freq_factor, 1.0) / freq_factor, 2
        )
    if has("mean_freq_gyro"):
        freq_factor = 1.0 + value_or("rms_gyro", 0) * 0.2
        augmented["mean_freq_gyro"] = round(
            add_noise(augmented["mean_freq_gyro"] * freq_factor, 0.5) / freq_factor, 2
        )

    # Entropy: small fixed noise, clamped to [0.1, 1.0].
    for field in ("entropy_acc", "entropy_gyro"):
        if has(field):
            augmented[field] = bounded(add_noise(augmented[field], 0.02), 0.1, 1.0)

    # Jerk statistics.
    if has("jerk_mean"):
        augmented["jerk_mean"] = add_noise(augmented["jerk_mean"], 0.01)
    if has("jerk_std"):
        augmented["jerk_std"] = bounded(add_noise(augmented["jerk_std"], 0.005), 0.01, 0.2)

    # Stability index: re-estimated from the (already jittered) entropies,
    # decreasing as the average entropy increases.
    if has("stability_index"):
        entropy_avg = (value_or("entropy_acc", 0.5) + value_or("entropy_gyro", 0.5)) / 2
        stability_base = 1.0 - entropy_avg * 0.3
        augmented["stability_index"] = bounded(add_noise(stability_base, 0.02), 0.4, 0.99)

    # Fatigue: rises with RMS relative to rms_base and falls with frequency
    # relative to freq_base; fatigue_level is re-derived from the new value.
    if has("fatigue"):
        rms_factor = value_or("rms_acc", 0) / value_or("rms_base", 1.0)
        freq_factor = value_or("mean_freq_acc", 40) / value_or("freq_base", 40)
        fatigue_delta = (
            (rms_factor - 1.0) * 0.05
            - (freq_factor - 1.0) * 0.03
            + random.uniform(-0.03, 0.03)
        )
        augmented["fatigue"] = bounded(augmented["fatigue"] + fatigue_delta, 0.05, 0.95)
        fatigue = augmented["fatigue"]
        augmented["fatigue_level"] = 0 if fatigue < 0.3 else 1 if fatigue < 0.6 else 2

    # Previous fatigue: kept at or slightly below the current fatigue when
    # that is known, otherwise jittered independently.
    if has("fatigue_prev"):
        if has("fatigue"):
            augmented["fatigue_prev"] = bounded(
                augmented["fatigue"] - random.uniform(0, 0.1), 0.05, 0.95
            )
        else:
            augmented["fatigue_prev"] = bounded(
                add_noise(augmented["fatigue_prev"], 0.02), 0.05, 0.95
            )

    # User embedding: tiny per-component noise. Accepts a list, a JSON-encoded
    # list, or a 1-D numpy array; the result is always a plain list.
    if has("user_emb"):
        emb = augmented["user_emb"]
        if isinstance(emb, str):
            try:
                emb = json.loads(emb)
            except ValueError:
                pass  # not JSON: leave the field untouched
        if isinstance(emb, np.ndarray) and emb.ndim == 1:
            emb = emb.tolist()
        if isinstance(emb, list) and emb:
            augmented["user_emb"] = [round(v + random.uniform(-0.01, 0.01), 4) for v in emb]

    # Overlap rate: small noise, clamped to [0.3, 0.7].
    if has("overlap_rate"):
        augmented["overlap_rate"] = bounded(add_noise(augmented["overlap_rate"], 0.02), 0.3, 0.7)

    # Quality flag: flipped with 5 % probability (1 -> 0, anything else -> 1).
    if has("quality_flag") and random.random() < 0.05:
        augmented["quality_flag"] = 0 if augmented["quality_flag"] == 1 else 1

    # Session id: shift a trailing "_<number>" suffix by up to +/-5.
    session = augmented.get("session_id")
    if isinstance(session, str) and session:
        prefix, separator, suffix = session.rpartition("_")
        if separator:
            try:
                session_num = int(suffix)
            except ValueError:
                pass  # no numeric suffix: keep the id unchanged
            else:
                augmented["session_id"] = prefix + "_" + str(session_num + random.randint(-5, 5))

    return augmented
def augment_user_data(df: pd.DataFrame, target_count: int) -> pd.DataFrame:
    """Grow one user's records to ``target_count`` rows with synthetic copies.

    Args:
        df: The user's existing records, one row per record.
        target_count: Number of rows the result should contain.

    Returns:
        ``df`` itself when it is empty; the first ``target_count`` rows when
        it already has enough; otherwise the original rows followed by
        augmented copies of randomly chosen rows, re-indexed from 0.
    """
    available = len(df)
    if available == 0:
        return df
    if available >= target_count:
        # Enough data already: just truncate.
        return df.head(target_count)

    def synthesize_one() -> dict:
        # Pick a random source row, then jitter it with a random noise scale.
        template = df.iloc[random.randint(0, available - 1)].to_dict()
        return augment_record(template, random.uniform(0.05, 0.15))

    shortfall = target_count - available
    synthetic = pd.DataFrame([synthesize_one() for _ in range(shortfall)])
    # Original rows come first, synthetic rows are appended after them.
    return pd.concat([df, synthetic], ignore_index=True)
def main():
    """Generate synthetic users from an existing Hub dataset and upload the result.

    Steps:
      1. Read HF_DATA_REPO_ID / HF_DATA_TOKEN (a .env file is loaded first).
      2. Download every ``*.parquet`` file in the dataset repo and keep the
         non-empty ones, keyed by a user id derived from the file name.
      3. Create TARGET_USERS new users; each one is built from a randomly
         chosen existing user whose records are expanded (or truncated) to
         RECORDS_PER_USER rows.
      4. Push the existing and the new users together back to the same repo.

    Progress and failures are reported with ``print``; the function returns
    early (without uploading) when nothing could be loaded or generated.

    Raises:
        RuntimeError: If a required environment variable is missing.
    """
    # Imported here because only this entry point needs it; done once rather
    # than on every loop iteration.
    from huggingface_hub import hf_hub_download

    load_dotenv()
    repo_id = require_env("HF_DATA_REPO_ID")
    token = require_env("HF_DATA_TOKEN")
    print(f"📂 기존 데이터셋 로드 중: {repo_id}")

    # Load every parquet file individually (not only those named "user...").
    api = HfApi()
    existing = DatasetDict()
    try:
        files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=token)
        parquet_files = [f for f in files if f.endswith(".parquet")]
        print(f"📊 Parquet 파일 수: {len(parquet_files)}")
        for file_path in parquet_files:
            try:
                # data/user_xxx.parquet or data/user_xxx-00000-of-00001.parquet
                # -> "user_xxx": drop the directory, the extension and
                # everything from the first "-" onwards.
                # NOTE(review): files that share the same prefix before the
                # first "-" map to the same key, so a later file replaces an
                # earlier one — confirm this is intended for sharded users.
                filename = file_path.split("/")[-1]
                user_id = filename.replace(".parquet", "").split("-")[0]

                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    repo_type="dataset",
                    token=token,
                )
                df = pd.read_parquet(local_path)
                if len(df) > 0:
                    existing[user_id] = Dataset.from_pandas(df, preserve_index=False)
                    print(f"✅ {user_id}: {len(df)} 레코드 로드")
                else:
                    print(f"⚠️ {user_id}: 빈 데이터셋, 건너뜀")
            except Exception as e2:
                # Best effort: one unreadable file must not abort the whole run.
                print(f"⚠️ {file_path}: 로드 실패 ({str(e2)[:100]}), 건너뜀")
                continue
    except Exception as e3:
        print(f"❌ 데이터셋 로드 완전 실패: {e3}")
        return

    # Only non-empty datasets were stored above, so every loaded user is valid.
    valid_users = dict(existing.items())
    if len(valid_users) == 0:
        print("❌ 유효한 사용자 데이터가 없습니다.")
        return
    print(f"✅ 유효한 사용자 수: {len(valid_users)}명")

    current_total = sum(len(dataset) for dataset in valid_users.values())
    print(f"📊 현재 총 레코드 수: {current_total}")

    # Existing users serve as templates for the synthetic ones.
    all_users = list(valid_users.keys())
    print(f"🎯 새로운 사용자 {TARGET_USERS}명 생성 중...")
    print(f"📋 참조 사용자: {len(all_users)}명")
    print(f"🎯 사용자당 목표 레코드 수: {RECORDS_PER_USER}")

    new_user_datasets = {}
    for i in range(1, TARGET_USERS + 1):
        new_user_id = f"augmented_user_{i:03d}"
        # Each synthetic user is derived from one randomly chosen existing user.
        reference_user_id = random.choice(all_users)
        reference_df = valid_users[reference_user_id].to_pandas()
        try:
            new_user_df = augment_user_data(reference_df, RECORDS_PER_USER)
            new_user_datasets[new_user_id] = Dataset.from_pandas(new_user_df, preserve_index=False)
            print(f"📈 {new_user_id}: {RECORDS_PER_USER} 레코드 생성 (참조: {reference_user_id})")
        except Exception as e:
            # Best effort: a failed user is skipped, the rest are still generated.
            print(f"❌ {new_user_id}: 생성 실패 ({e}), 건너뜀")
            continue

    if len(new_user_datasets) == 0:
        print("❌ 새로운 사용자 데이터가 생성되지 않았습니다.")
        return

    # Existing users first, then the new ones (a new id would win on a clash).
    final_dict = DatasetDict({**valid_users, **new_user_datasets})
    new_users_total = sum(len(dataset) for dataset in new_user_datasets.values())
    total_records = sum(len(final_dict[user_id]) for user_id in final_dict)
    print(f"📊 새로운 사용자들의 총 레코드 수: {new_users_total}")
    print(f"📊 전체 데이터셋 총 레코드 수: {total_records}")
    print(f"📊 새로운 parquet 파일 수: {len(new_user_datasets)}개")

    print(f"📤 Hugging Face Hub에 업로드 중: {repo_id}")
    final_dict.push_to_hub(repo_id, token=token, private=True)
    print("✅ 업로드 완료")


if __name__ == "__main__":
    main()