Spaces:

Merry99
/

MuscleCare-FastAPI

Sleeping

App Files Files Community

Merry99 committed on Nov 22, 2025

Commit

e2b68a6

1 Parent(s): 500a872

add augment py

Browse files

Files changed (1) hide show

augment_dataset.py +416 -183

augment_dataset.py CHANGED Viewed

@@ -22,222 +22,376 @@ def require_env(var_name: str) -> str:
     return value
-def add_noise(value: float, noise_scale: float) -> float:
-    """값에 노이즈 추가"""
     if value is None:
         return None
-    return round(value + random.uniform(-noise_scale, noise_scale), 4)
-def bounded(value: float, low: float, high: float) -> float:
-    """값을 범위 내로 제한"""
     if value is None:
         return None
-    return max(low, min(high, value))
-def augment_record(original: dict, noise_scale: float = 0.1) -> dict:
-    """단일 레코드를 증폭 (물리적 관계와 상관관계를 고려한 의미있는 증폭)"""
-    augmented = original.copy()
-    # 시간 정보 변형 (연속성 유지)
-    if "timestamp_utc" in augmented and augmented["timestamp_utc"]:
         try:
-            base_time = datetime.fromisoformat(augmented["timestamp_utc"].replace("Z", "+00:00"))
-            time_delta = timedelta(milliseconds=random.randint(-200, 200))
-            augmented["timestamp_utc"] = (base_time + time_delta).isoformat()
         except:
             pass
-    # window_id와 시간 범위 약간 조정 (연속성 유지)
-    if "window_id" in augmented:
-        augmented["window_id"] = augmented["window_id"] + random.randint(-1, 1)
-    if "window_start_ms" in augmented:
-        augmented["window_start_ms"] = augmented["window_start_ms"] + random.randint(-50, 50)
-    if "window_end_ms" in augmented:
-        augmented["window_end_ms"] = augmented["window_start_ms"] + 2000  # window_size_ms와 일치
-    # 가속도계 데이터 증폭 (x, y, z 간 상관관계 유지)
-    acc_noise = random.uniform(-noise_scale * 0.1, noise_scale * 0.1)
-    if "acc_x_mean" in augmented and augmented["acc_x_mean"] is not None:
-        augmented["acc_x_mean"] = add_noise(augmented["acc_x_mean"], abs(augmented["acc_x_mean"]) * 0.1 + 0.01)
-    if "acc_y_mean" in augmented and augmented["acc_y_mean"] is not None:
-        augmented["acc_y_mean"] = add_noise(augmented["acc_y_mean"], abs(augmented["acc_y_mean"]) * 0.1 + 0.01)
-    if "acc_z_mean" in augmented and augmented["acc_z_mean"] is not None:
-        augmented["acc_z_mean"] = add_noise(augmented["acc_z_mean"], abs(augmented["acc_z_mean"]) * 0.1 + 0.01)
-    # 자이로스코프 데이터 증폭
-    gyro_noise = random.uniform(-noise_scale * 0.02, noise_scale * 0.02)
-    if "gyro_x_mean" in augmented and augmented["gyro_x_mean"] is not None:
-        augmented["gyro_x_mean"] = add_noise(augmented["gyro_x_mean"], 0.005)
-    if "gyro_y_mean" in augmented and augmented["gyro_y_mean"] is not None:
-        augmented["gyro_y_mean"] = add_noise(augmented["gyro_y_mean"], 0.005)
-    if "gyro_z_mean" in augmented and augmented["gyro_z_mean"] is not None:
-        augmented["gyro_z_mean"] = add_noise(augmented["gyro_z_mean"], 0.005)
-    # 선형 가속도 증폭
-    if "linacc_x_mean" in augmented and augmented["linacc_x_mean"] is not None:
-        augmented["linacc_x_mean"] = add_noise(augmented["linacc_x_mean"], abs(augmented["linacc_x_mean"]) * 0.1 + 0.01)
-    if "linacc_y_mean" in augmented and augmented["linacc_y_mean"] is not None:
-        augmented["linacc_y_mean"] = add_noise(augmented["linacc_y_mean"], abs(augmented["linacc_y_mean"]) * 0.1 + 0.01)
-    if "linacc_z_mean" in augmented and augmented["linacc_z_mean"] is not None:
-        augmented["linacc_z_mean"] = add_noise(augmented["linacc_z_mean"], abs(augmented["linacc_z_mean"]) * 0.1 + 0.01)
-    # 중력 벡터 증폭 (물리적 제약: 크기가 약 9.8에 가까워야 함)
-    if all(f in augmented and augmented[f] is not None for f in ["gravity_x_mean", "gravity_y_mean", "gravity_z_mean"]):
-        gx = augmented["gravity_x_mean"] + random.uniform(-0.01, 0.01)
-        gy = augmented["gravity_y_mean"] + random.uniform(-0.01, 0.01)
-        gz = augmented["gravity_z_mean"] + random.uniform(-0.02, 0.02)
-        # 중력 벡터 크기 정규화 (약 9.8 유지)
         g_mag = np.sqrt(gx**2 + gy**2 + gz**2)
         if g_mag > 0:
             scale = 9.8 / g_mag
-            augmented["gravity_x_mean"] = round(gx * scale, 4)
-            augmented["gravity_y_mean"] = round(gy * scale, 4)
-            augmented["gravity_z_mean"] = round(gz * scale, 4)
-    # 센서 표준편차 증폭 (RMS와 일관성 유지)
-    sensor_std_fields = [
-        "acc_x_std", "acc_y_std", "acc_z_std",
-        "gyro_x_std", "gyro_y_std", "gyro_z_std",
-    ]
-    for field in sensor_std_fields:
-        if field in augmented and augmented[field] is not None:
-            augmented[field] = bounded(add_noise(augmented[field], augmented[field] * 0.1), 0.01, 1.0)
-    # RMS 값 증폭 (센서 평균값과 일관성 유지)
-    if "rms_acc" in augmented and augmented["rms_acc"] is not None:
-        # RMS는 가속도 평균값의 크기와 관련
-        acc_mag = np.sqrt(
-            (augmented.get("acc_x_mean", 0) or 0)**2 +
-            (augmented.get("acc_y_mean", 0) or 0)**2 +
-            (augmented.get("acc_z_mean", 0) or 0)**2
         )
-        rms_base = augmented["rms_acc"]
-        # RMS는 원본과 비슷한 범위 유지
-        augmented["rms_acc"] = bounded(add_noise(rms_base, rms_base * 0.1), 0.1, 2.0)
-    if "rms_gyro" in augmented and augmented["rms_gyro"] is not None:
-        gyro_mag = np.sqrt(
-            (augmented.get("gyro_x_mean", 0) or 0)**2 +
-            (augmented.get("gyro_y_mean", 0) or 0)**2 +
-            (augmented.get("gyro_z_mean", 0) or 0)**2
         )
-        rms_gyro_base = augmented["rms_gyro"]
-        augmented["rms_gyro"] = bounded(add_noise(rms_gyro_base, rms_gyro_base * 0.1), 0.01, 0.5)
-    # 주파수 증폭 (RMS와 상관관계 유지)
-    if "mean_freq_acc" in augmented and augmented["mean_freq_acc"] is not None:
-        # RMS가 높으면 주파수도 약간 높아지는 경향
-        freq_factor = 1.0 + (augmented.get("rms_acc", 0) or 0) * 0.1
-        augmented["mean_freq_acc"] = round(add_noise(augmented["mean_freq_acc"] * freq_factor, 1.0) / freq_factor, 2)
-    if "mean_freq_gyro" in augmented and augmented["mean_freq_gyro"] is not None:
-        freq_factor = 1.0 + (augmented.get("rms_gyro", 0) or 0) * 0.2
-        augmented["mean_freq_gyro"] = round(add_noise(augmented["mean_freq_gyro"] * freq_factor, 0.5) / freq_factor, 2)
-    # 엔트로피 증폭 (안정성과 관련)
-    if "entropy_acc" in augmented and augmented["entropy_acc"] is not None:
-        augmented["entropy_acc"] = bounded(add_noise(augmented["entropy_acc"], 0.02), 0.1, 1.0)
-    if "entropy_gyro" in augmented and augmented["entropy_gyro"] is not None:
-        augmented["entropy_gyro"] = bounded(add_noise(augmented["entropy_gyro"], 0.02), 0.1, 1.0)
-    # Jerk 증폭 (가속도 변화율)
-    if "jerk_mean" in augmented and augmented["jerk_mean"] is not None:
-        augmented["jerk_mean"] = add_noise(augmented["jerk_mean"], 0.01)
-    if "jerk_std" in augmented and augmented["jerk_std"] is not None:
-        augmented["jerk_std"] = bounded(add_noise(augmented["jerk_std"], 0.005), 0.01, 0.2)
-    # 안정성 지수 증폭 (엔트로피와 반비례 관계)
-    if "stability_index" in augmented and augmented["stability_index"] is not None:
-        # 엔트로피가 높으면 안정성이 낮아짐
-        entropy_avg = ((augmented.get("entropy_acc", 0.5) or 0.5) + (augmented.get("entropy_gyro", 0.5) or 0.5)) / 2
-        stability_base = 1.0 - entropy_avg * 0.3  # 엔트로피 기반 추정
-        augmented["stability_index"] = bounded(add_noise(stability_base, 0.02), 0.4, 0.99)
-    # 피로도 증폭 (RMS, 주파수와 상관관계)
-    if "fatigue" in augmented and augmented["fatigue"] is not None:
-        # RMS가 높고 주파수가 낮으면 피로도 증가
-        rms_factor = (augmented.get("rms_acc", 0) or 0) / (augmented.get("rms_base", 1.0) or 1.0)
-        freq_factor = (augmented.get("mean_freq_acc", 40) or 40) / (augmented.get("freq_base", 40) or 40)
-        fatigue_delta = (rms_factor - 1.0) * 0.05 - (freq_factor - 1.0) * 0.03 + random.uniform(-0.03, 0.03)
-        augmented["fatigue"] = bounded(augmented["fatigue"] + fatigue_delta, 0.05, 0.95)
-        augmented["fatigue_level"] = 0 if augmented["fatigue"] < 0.3 else 1 if augmented["fatigue"] < 0.6 else 2
-    # 이전 피로도는 현재 피로도와 연속성 유지
-    if "fatigue_prev" in augmented and augmented["fatigue_prev"] is not None:
-        if "fatigue" in augmented and augmented["fatigue"] is not None:
-            # 이전 피로도는 현재 피로도보다 약간 낮거나 비슷
-            augmented["fatigue_prev"] = bounded(augmented["fatigue"] - random.uniform(0, 0.1), 0.05, 0.95)
         else:
-            augmented["fatigue_prev"] = bounded(add_noise(augmented["fatigue_prev"], 0.02), 0.05, 0.95)
-    # user_emb 벡터에 작은 노이즈 추가
-    if "user_emb" in augmented and augmented["user_emb"] is not None:
-        if isinstance(augmented["user_emb"], str):
-            try:
-                emb_list = json.loads(augmented["user_emb"])
-            except:
-                emb_list = augmented["user_emb"]
         else:
-            emb_list = augmented["user_emb"]
-        if isinstance(emb_list, list) and len(emb_list) > 0:
-            augmented["user_emb"] = [round(v + random.uniform(-0.01, 0.01), 4) for v in emb_list]
-    # overlap_rate 약간 변형
-    if "overlap_rate" in augmented and augmented["overlap_rate"] is not None:
-        augmented["overlap_rate"] = bounded(add_noise(augmented["overlap_rate"], 0.02), 0.3, 0.7)
-    # quality_flag는 가끔 변경
-    if "quality_flag" in augmented:
         if random.random() < 0.05:  # 5% 확률로 변경
-            augmented["quality_flag"] = 0 if augmented["quality_flag"] == 1 else 1
     # session_id 약간 변형
-    if "session_id" in augmented and augmented["session_id"]:
-        parts = augmented["session_id"].split("_")
         if len(parts) > 1:
             try:
                 session_num = int(parts[-1])
-                augmented["session_id"] = "_".join(parts[:-1]) + "_" + str(session_num + random.randint(-5, 5))
             except:
-                pass
-    return augmented
-def augment_user_data(df: pd.DataFrame, target_count: int) -> pd.DataFrame:
-    """사용자별 데이터를 증폭하여 목표 개수만큼 생성"""
-    current_count = len(df)
-    if current_count == 0:
-        return df
-    if current_count >= target_count:
-        # 이미 충분하면 그대로 반환
         return df.head(target_count)
-    # 증폭이 필요한 개수
-    needed = target_count - current_count
-    # 기존 데이터를 복제하고 증폭
-    augmented_records = []
-    for _ in range(needed):
-        # 랜덤하게 원본 레코드 선택
-        original_idx = random.randint(0, current_count - 1)
-        original = df.iloc[original_idx].to_dict()
-        # 증폭 (노이즈 스케일은 필드에 따라 다르게)
-        noise_scale = random.uniform(0.05, 0.15)
-        augmented = augment_record(original, noise_scale)
-        augmented_records.append(augmented)
-    # 증폭된 데이터를 DataFrame으로 변환
-    augmented_df = pd.DataFrame(augmented_records)
-    # 원본과 병합
-    result_df = pd.concat([df, augmented_df], ignore_index=True)
-    return result_df
 def main():
@@ -270,6 +424,11 @@ def main():
                     else:
                         user_id = filename_no_ext
                     # 개별 파일을 pandas로 직접 로드
                     from huggingface_hub import hf_hub_download
                     import tempfile
@@ -296,9 +455,13 @@ def main():
         print(f"❌ 데이터셋 로드 완전 실패: {e3}")
         return
-    # 유효한 사용자만 필터링 (데이터가 있는 사용자만)
     valid_users = {}
     for user_id in existing.keys():
         try:
             user_data = existing[user_id]
             if len(user_data) > 0:
@@ -346,10 +509,18 @@ def main():
             continue
         try:
-            # 참조 데이터를 증폭하여 새로운 사용자 데이터 생성
-            new_user_df = augment_user_data(reference_df, RECORDS_PER_USER)
             new_user_datasets[new_user_id] = Dataset.from_pandas(new_user_df, preserve_index=False)
-            print(f"📈 {new_user_id}: {RECORDS_PER_USER} 레코드 생성 (참조: {reference_user_id})")
         except Exception as e:
             print(f"❌ {new_user_id}: 생성 실패 ({e}), 건너뜀")
             continue
@@ -358,6 +529,38 @@ def main():
         print("❌ 새로운 사용자 데이터가 생성되지 않았습니다.")
         return
     # 기존 데이터셋에 새로운 사용자 데이터 추가
     final_datasets = {}
     # 기존 사용자 데이터 유지
@@ -374,6 +577,36 @@ def main():
     print(f"📊 전체 데이터셋 총 레코드 수: {total_records}")
     print(f"📊 새로운 parquet 파일 수: {len(new_user_datasets)}개")
     print(f"📤 Hugging Face Hub에 업로드 중: {repo_id}")
     final_dict.push_to_hub(repo_id, token=token, private=True)
     print("✅ 업로드 완료")

     return value
+def jitter(value: float, scale: float = 0.02) -> float:
+    """값에 ±scale 비율의 노이즈를 추가"""
     if value is None:
         return None
+    return value * (1 + random.uniform(-scale, scale))
+def jitter_abs(value: float, amount: float) -> float:
+    """절대값 기준 노이즈 추가"""
     if value is None:
         return None
+    return value + random.uniform(-amount, amount)
+def augment_sensor_vector(x: float, y: float, z: float, noise: float = 0.02) -> tuple:
+    """
+    3축 센서 데이터를 물리적으로 자연스럽게 증폭
+    → 3축은 동일한 비율로 scaling + 개별 작은 노이즈
+    """
+    if x is None or y is None or z is None:
+        return (x, y, z)
+    scale = 1 + random.uniform(-noise, noise)
+    return (
+        round(x * scale + random.uniform(-0.01, 0.01), 4),
+        round(y * scale + random.uniform(-0.01, 0.01), 4),
+        round(z * scale + random.uniform(-0.01, 0.01), 4),
+    )
+def compute_rms(x: float, y: float, z: float, base_noise: float = 0.02) -> float:
+    """3축 mean 기반으로 RMS 재계산"""
+    if x is None or y is None or z is None:
+        return None
+    base = np.sqrt(x**2 + y**2 + z**2)
+    return round(base * (1 + random.uniform(-base_noise, base_noise)), 4)
+def augment_record_strict(row: dict) -> dict:
+    """물리적 제약을 지키면서 센서 데이터를 자연스럽게 증폭"""
+    new = row.copy()
+    # timestamp jitter
+    if "timestamp_utc" in row and isinstance(row["timestamp_utc"], str):
         try:
+            t = datetime.fromisoformat(row["timestamp_utc"].replace("Z", "+00:00"))
+            t = t + timedelta(milliseconds=random.randint(-150, 150))
+            new["timestamp_utc"] = t.isoformat()
         except:
             pass
+    # window jitter
+    if "window_id" in row and row["window_id"] is not None:
+        new["window_id"] = int(row["window_id"] + random.randint(-1, 1))
+    if "window_start_ms" in row and row["window_start_ms"] is not None:
+        new["window_start_ms"] = row["window_start_ms"] + random.randint(-50, 50)
+    if "window_end_ms" in row and row["window_end_ms"] is not None:
+        new["window_end_ms"] = new["window_start_ms"] + 2000  # window_size_ms와 일치
+    # --- Accelerometer mean ---
+    if all(f in row and row[f] is not None for f in ["acc_x_mean", "acc_y_mean", "acc_z_mean"]):
+        new["acc_x_mean"], new["acc_y_mean"], new["acc_z_mean"] = augment_sensor_vector(
+            row["acc_x_mean"], row["acc_y_mean"], row["acc_z_mean"], noise=0.03
+        )
+    # --- Gyro mean ---
+    if all(f in row and row[f] is not None for f in ["gyro_x_mean", "gyro_y_mean", "gyro_z_mean"]):
+        new["gyro_x_mean"], new["gyro_y_mean"], new["gyro_z_mean"] = augment_sensor_vector(
+            row["gyro_x_mean"], row["gyro_y_mean"], row["gyro_z_mean"], noise=0.03
+        )
+    # --- Linear accel mean ---
+    if all(f in row and row[f] is not None for f in ["linacc_x_mean", "linacc_y_mean", "linacc_z_mean"]):
+        new["linacc_x_mean"], new["linacc_y_mean"], new["linacc_z_mean"] = augment_sensor_vector(
+            row["linacc_x_mean"], row["linacc_y_mean"], row["linacc_z_mean"], noise=0.03
+        )
+    # --- Gravity vector (물리적 제약: 크기가 약 9.8) ---
+    if all(f in row and row[f] is not None for f in ["gravity_x_mean", "gravity_y_mean", "gravity_z_mean"]):
+        gx, gy, gz = augment_sensor_vector(
+            row["gravity_x_mean"], row["gravity_y_mean"], row["gravity_z_mean"], noise=0.01
+        )
         g_mag = np.sqrt(gx**2 + gy**2 + gz**2)
         if g_mag > 0:
             scale = 9.8 / g_mag
+            new["gravity_x_mean"] = round(gx * scale, 4)
+            new["gravity_y_mean"] = round(gy * scale, 4)
+            new["gravity_z_mean"] = round(gz * scale, 4)
+    # --- Recompute RMS from sensor means ---
+    if all(f in new and new[f] is not None for f in ["acc_x_mean", "acc_y_mean", "acc_z_mean"]):
+        new["rms_acc"] = compute_rms(
+            new["acc_x_mean"], new["acc_y_mean"], new["acc_z_mean"], base_noise=0.03
         )
+    elif "rms_acc" in row and row["rms_acc"] is not None:
+        new["rms_acc"] = jitter(row["rms_acc"], 0.03)
+    if all(f in new and new[f] is not None for f in ["gyro_x_mean", "gyro_y_mean", "gyro_z_mean"]):
+        new["rms_gyro"] = compute_rms(
+            new["gyro_x_mean"], new["gyro_y_mean"], new["gyro_z_mean"], base_noise=0.03
         )
+    elif "rms_gyro" in row and row["rms_gyro"] is not None:
+        new["rms_gyro"] = jitter(row["rms_gyro"], 0.03)
+    # --- std values scale with RMS ---
+    if "rms_acc" in new and new["rms_acc"] is not None and "rms_acc" in row and row["rms_acc"] is not None and row["rms_acc"] > 0:
+        rms_ratio = new["rms_acc"] / row["rms_acc"]
+        for col in ["acc_x_std", "acc_y_std", "acc_z_std"]:
+            if col in row and row[col] is not None:
+                new[col] = max(0.01, row[col] * rms_ratio * jitter(1, 0.1))
+    if "rms_gyro" in new and new["rms_gyro"] is not None and "rms_gyro" in row and row["rms_gyro"] is not None and row["rms_gyro"] > 0:
+        rms_ratio = new["rms_gyro"] / row["rms_gyro"]
+        for col in ["gyro_x_std", "gyro_y_std", "gyro_z_std"]:
+            if col in row and row[col] is not None:
+                new[col] = max(0.001, row[col] * rms_ratio * jitter(1, 0.1))
+    # --- frequency (weak positive correlation with RMS) ---
+    if "mean_freq_acc" in row and row["mean_freq_acc"] is not None and "rms_acc" in new and new["rms_acc"] is not None:
+        new["mean_freq_acc"] = round(jitter_abs(row["mean_freq_acc"], new["rms_acc"] * 0.3), 2)
+    elif "mean_freq_acc" in row and row["mean_freq_acc"] is not None:
+        new["mean_freq_acc"] = round(jitter(row["mean_freq_acc"], 0.02), 2)
+    if "mean_freq_gyro" in row and row["mean_freq_gyro"] is not None and "rms_gyro" in new and new["rms_gyro"] is not None:
+        new["mean_freq_gyro"] = round(jitter_abs(row["mean_freq_gyro"], new["rms_gyro"] * 0.3), 2)
+    elif "mean_freq_gyro" in row and row["mean_freq_gyro"] is not None:
+        new["mean_freq_gyro"] = round(jitter(row["mean_freq_gyro"], 0.02), 2)
+    # --- entropy: increases when RMS increases ---
+    if "entropy_acc" in row and row["entropy_acc"] is not None and "rms_acc" in new and new["rms_acc"] is not None and "rms_acc" in row and row["rms_acc"] is not None and row["rms_acc"] > 0:
+        new["entropy_acc"] = min(1.0, max(0.05, row["entropy_acc"] * (new["rms_acc"] / row["rms_acc"]) * jitter(1, 0.1)))
+    elif "entropy_acc" in row and row["entropy_acc"] is not None:
+        new["entropy_acc"] = min(1.0, max(0.05, jitter(row["entropy_acc"], 0.02)))
+    if "entropy_gyro" in row and row["entropy_gyro"] is not None and "rms_gyro" in new and new["rms_gyro"] is not None and "rms_gyro" in row and row["rms_gyro"] is not None and row["rms_gyro"] > 0:
+        new["entropy_gyro"] = min(1.0, max(0.05, row["entropy_gyro"] * (new["rms_gyro"] / row["rms_gyro"]) * jitter(1, 0.1)))
+    elif "entropy_gyro" in row and row["entropy_gyro"] is not None:
+        new["entropy_gyro"] = min(1.0, max(0.05, jitter(row["entropy_gyro"], 0.02)))
+    # --- jerk: depends on std and RMS ---
+    if "jerk_mean" in row and row["jerk_mean"] is not None:
+        if "acc_x_std" in row and row["acc_x_std"] is not None:
+            new["jerk_mean"] = round(jitter_abs(row["jerk_mean"], row["acc_x_std"] * 0.3), 4)
         else:
+            new["jerk_mean"] = round(jitter(row["jerk_mean"], 0.02), 4)
+    if "jerk_std" in row and row["jerk_std"] is not None:
+        if "acc_x_std" in row and row["acc_x_std"] is not None:
+            new["jerk_std"] = max(0.001, round(jitter_abs(row["jerk_std"], row["acc_x_std"] * 0.1), 4))
         else:
+            new["jerk_std"] = max(0.001, round(jitter(row["jerk_std"], 0.01), 4))
+    # --- stability index (inverse to entropy) ---
+    entropy_avg = 0.5
+    if "entropy_acc" in new and new["entropy_acc"] is not None and "entropy_gyro" in new and new["entropy_gyro"] is not None:
+        entropy_avg = (new["entropy_acc"] + new["entropy_gyro"]) / 2
+    elif "entropy_acc" in new and new["entropy_acc"] is not None:
+        entropy_avg = new["entropy_acc"]
+    elif "entropy_gyro" in new and new["entropy_gyro"] is not None:
+        entropy_avg = new["entropy_gyro"]
+    new["stability_index"] = round(max(0.4, min(0.99, 1 - entropy_avg * 0.3)), 4)
+    # --- fatigue model (RMS, 주파수 기반) ---
+    # fatigue는 augment_user_data에서 시간적 연속성을 고려하여 계산
+    # 여기서는 기본값만 설정 (나중에 덮어씌워짐)
+    if "fatigue" in row and row["fatigue"] is not None:
+        # 기본적으로 RMS와 주파수 기반으로 약간 조정
+        if "rms_acc" in new and new["rms_acc"] is not None and "rms_acc" in row and row["rms_acc"] is not None and row["rms_acc"] > 0.1:
+            rms_factor = new["rms_acc"] / row["rms_acc"]
+        else:
+            rms_factor = 1.0
+        if "mean_freq_acc" in new and new["mean_freq_acc"] is not None and "mean_freq_acc" in row and row["mean_freq_acc"] is not None and row["mean_freq_acc"] > 1:
+            freq_factor = row["mean_freq_acc"] / new["mean_freq_acc"]
+        else:
+            freq_factor = 1.0
+        fatigue_delta = rms_factor * 0.05 - freq_factor * 0.03
+        new["fatigue"] = min(0.95, max(0.05, row["fatigue"] + fatigue_delta + random.uniform(-0.02, 0.02)))
+        new["fatigue_level"] = 0 if new["fatigue"] < 0.3 else 1 if new["fatigue"] < 0.6 else 2
+    else:
+        # fatigue가 없으면 기본값 설정
+        new["fatigue"] = 0.1
+        new["fatigue_level"] = 0
+    # fatigue_prev는 augment_user_data에서 설정됨
+    if "fatigue_prev" in row and row["fatigue_prev"] is not None:
+        new["fatigue_prev"] = row["fatigue_prev"]
+    else:
+        new["fatigue_prev"] = 0.05
+    # --- baseline values (preserve) ---
+    if "rms_base" in row:
+        new["rms_base"] = row["rms_base"]
+    if "freq_base" in row:
+        new["freq_base"] = row["freq_base"]
+    # --- user_emb: NEVER change ---
+    if "user_emb" in row:
+        new["user_emb"] = row["user_emb"]
+    # --- other fields ---
+    if "overlap_rate" in row and row["overlap_rate"] is not None:
+        new["overlap_rate"] = max(0.3, min(0.7, jitter(row["overlap_rate"], 0.02)))
+    if "window_size_ms" in row:
+        new["window_size_ms"] = row.get("window_size_ms", 2000)
+    if "quality_flag" in row:
         if random.random() < 0.05:  # 5% 확률로 변경
+            new["quality_flag"] = 0 if row["quality_flag"] == 1 else 1
+        else:
+            new["quality_flag"] = row["quality_flag"]
     # session_id 약간 변형
+    if "session_id" in row and row["session_id"]:
+        parts = str(row["session_id"]).split("_")
         if len(parts) > 1:
             try:
                 session_num = int(parts[-1])
+                new["session_id"] = "_".join(parts[:-1]) + "_" + str(session_num + random.randint(-5, 5))
             except:
+                new["session_id"] = row["session_id"]
+        else:
+            new["session_id"] = row["session_id"]
+    return new
+def augment_user_data(df: pd.DataFrame, target_count: int, new_user_id: str = None) -> pd.DataFrame:
+    """
+    사용자별 데이터를 증폭하여 목표 개수만큼 생성
+    새로운 사용자인 경우 시간적 연속성을 유지
+    """
+    if len(df) >= target_count:
         return df.head(target_count)
+    need = target_count - len(df)
+    # 새로운 사용자인 경우 (기존 데이터가 없거나 새 사용자 ID가 제공된 경우)
+    is_new_user = new_user_id is not None or len(df) == 0
+    if is_new_user and len(df) > 0:
+        # 새로운 사용자는 항상 target_count만큼 생성 (참조 데이터 길이와 무관)
+        base_row = df.iloc[0].to_dict()
+        new_rows = []
+        # 시간 기반 초기값 설정
+        if "timestamp_utc" in base_row and base_row["timestamp_utc"]:
+            try:
+                base_time = datetime.fromisoformat(str(base_row["timestamp_utc"]).replace("Z", "+00:00"))
+            except:
+                base_time = datetime.now(timezone.utc)
+        else:
+            base_time = datetime.now(timezone.utc)
+        base_window_id = 1  # 새 사용자는 window_id를 1부터 시작
+        base_window_start = 0  # 새 사용자는 window_start_ms를 0부터 시작
+        prev_fatigue = base_row.get("fatigue", 0.1) if base_row.get("fatigue") is not None else 0.1
+        # 새로운 사용자는 항상 target_count만큼 생성
+        for i in range(target_count):
+            # 샘플 레코드 선택
+            sample_idx = random.randint(0, len(df) - 1)
+            sample = df.iloc[sample_idx].to_dict()
+            # 새로운 레코드 생성
+            new_row = augment_record_strict(sample)
+            # 새로운 사용자 ID 설정
+            if new_user_id:
+                new_row["user_id"] = new_user_id
+            # 시간적 연속성 유지
+            window_interval = 2000  # window_size_ms
+            new_row["window_id"] = base_window_id + i
+            new_row["window_start_ms"] = base_window_start + i * window_interval
+            new_row["window_end_ms"] = new_row["window_start_ms"] + window_interval
+            # timestamp 연속성 유지
+            new_row["timestamp_utc"] = (base_time + timedelta(milliseconds=i * window_interval)).isoformat()
+            # 피로도 연속성 유지 (이전 피로도는 직전 레코드의 피로도)
+            if i > 0:
+                new_row["fatigue_prev"] = prev_fatigue
+            else:
+                # 첫 레코드는 참조 데이터의 피로도에서 약간 낮게 시작
+                new_row["fatigue_prev"] = max(0.05, prev_fatigue - random.uniform(0, 0.05))
+            # 현재 피로도는 이전 피로도 기반으로 약간 증가하는 경향 (실제 측정과 유사)
+            if "fatigue" in new_row and new_row["fatigue"] is not None:
+                # 피로도는 시간에 따라 점진적으로 증가하는 경향
+                fatigue_base = new_row["fatigue_prev"] if "fatigue_prev" in new_row else prev_fatigue
+                # 약간의 증가 + 노이즈
+                fatigue_increase = random.uniform(0, 0.02)  # 시간에 따른 점진적 증가
+                new_row["fatigue"] = min(0.95, max(0.05, fatigue_base + fatigue_increase + random.uniform(-0.01, 0.01)))
+                new_row["fatigue_level"] = 0 if new_row["fatigue"] < 0.3 else 1 if new_row["fatigue"] < 0.6 else 2
+                prev_fatigue = new_row["fatigue"]
+            # 세션 ID 생성 (새 사용자이므로 새로운 세션)
+            if "session_id" in new_row:
+                new_row["session_id"] = f"session_{i // 10 + 1:03d}"  # 10개 레코드당 세션
+            # measure_date는 기존 데이터에 있는 경우에만 설정
+            if "measure_date" in sample:
+                try:
+                    measure_time = datetime.fromisoformat(new_row["timestamp_utc"].replace("Z", "+00:00"))
+                    new_row["measure_date"] = measure_time.strftime("%Y-%m-%d")
+                except:
+                    new_row["measure_date"] = base_time.strftime("%Y-%m-%d")
+            new_rows.append(new_row)
+        return pd.DataFrame(new_rows)
+    else:
+        # 기존 사용자 데이터 증폭 (시간적 연속성 유지)
+        new_rows = []
+        last_row = df.iloc[-1].to_dict()
+        # 마지막 레코드의 시간 정보 가져오기
+        if "timestamp_utc" in last_row and last_row["timestamp_utc"]:
+            try:
+                last_time = datetime.fromisoformat(str(last_row["timestamp_utc"]).replace("Z", "+00:00"))
+            except:
+                last_time = datetime.now(timezone.utc)
+        else:
+            last_time = datetime.now(timezone.utc)
+        last_window_id = last_row.get("window_id", 0) if last_row.get("window_id") is not None else 0
+        last_window_start = last_row.get("window_end_ms", 0) if last_row.get("window_end_ms") is not None else 0
+        prev_fatigue = last_row.get("fatigue", 0.1) if last_row.get("fatigue") is not None else 0.1
+        for i in range(need):
+            # 샘플 레코드 선택
+            sample_idx = random.randint(0, len(df) - 1)
+            sample = df.iloc[sample_idx].to_dict()
+            # 새로운 레코드 생성
+            new_row = augment_record_strict(sample)
+            # 시간적 연속성 유지
+            window_interval = 2000
+            new_row["window_id"] = last_window_id + i + 1
+            new_row["window_start_ms"] = last_window_start + i * window_interval
+            new_row["window_end_ms"] = new_row["window_start_ms"] + window_interval
+            # timestamp 연속성 유지
+            new_row["timestamp_utc"] = (last_time + timedelta(milliseconds=(i + 1) * window_interval)).isoformat()
+            # 피로도 연속성 유지
+            new_row["fatigue_prev"] = prev_fatigue
+            if "fatigue" in new_row and new_row["fatigue"] is not None:
+                # 피로도는 시간에 따라 점진적으로 증가하는 경향
+                fatigue_increase = random.uniform(0, 0.02)  # 시간에 따른 점진적 증가
+                new_row["fatigue"] = min(0.95, max(0.05, prev_fatigue + fatigue_increase + random.uniform(-0.01, 0.01)))
+                new_row["fatigue_level"] = 0 if new_row["fatigue"] < 0.3 else 1 if new_row["fatigue"] < 0.6 else 2
+                prev_fatigue = new_row["fatigue"]
+            # measure_date는 기존 데이터에 있는 경우에만 설정
+            if "measure_date" in sample:
+                try:
+                    measure_time = datetime.fromisoformat(new_row["timestamp_utc"].replace("Z", "+00:00"))
+                    new_row["measure_date"] = measure_time.strftime("%Y-%m-%d")
+                except:
+                    new_row["measure_date"] = last_time.strftime("%Y-%m-%d")
+            new_rows.append(new_row)
+        return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
 def main():
                     else:
                         user_id = filename_no_ext
+                    # local_user로 시작하는 파일은 제외
+                    if user_id.startswith("local_user"):
+                        print(f"⏭️ {user_id}: local_user로 시작하는 파일은 제외")
+                        continue
                     # 개별 파일을 pandas로 직접 로드
                     from huggingface_hub import hf_hub_download
                     import tempfile
         print(f"❌ 데이터셋 로드 완전 실패: {e3}")
         return
+    # 유효한 사용자만 필터링 (데이터가 있는 사용자만, local_user 제외)
     valid_users = {}
     for user_id in existing.keys():
+        # local_user로 시작하는 사용자는 제외
+        if user_id.startswith("local_user"):
+            print(f"⏭️ {user_id}: local_user로 시작하는 사용자는 제외")
+            continue
         try:
             user_data = existing[user_id]
             if len(user_data) > 0:
             continue
         try:
+            # 참조 데이터를 증폭하여 새로운 사용자 데이터 생성 (새 사용자 ID 전달)
+            new_user_df = augment_user_data(reference_df, RECORDS_PER_USER, new_user_id=new_user_id)
+            # user_id 컬럼이 없으면 추가
+            if "user_id" not in new_user_df.columns:
+                new_user_df["user_id"] = new_user_id
+            else:
+                new_user_df["user_id"] = new_user_id
             new_user_datasets[new_user_id] = Dataset.from_pandas(new_user_df, preserve_index=False)
+            actual_count = len(new_user_df)
+            print(f"📈 {new_user_id}: {actual_count} 레코드 생성 (참조: {reference_user_id}, 목표: {RECORDS_PER_USER})")
+            if actual_count != RECORDS_PER_USER:
+                print(f"   ⚠️ 경고: 생성된 레코드 수({actual_count})가 목표({RECORDS_PER_USER})와 다릅니다!")
         except Exception as e:
             print(f"❌ {new_user_id}: 생성 실패 ({e}), 건너뜀")
             continue
         print("❌ 새로운 사용자 데이터가 생성되지 않았습니다.")
         return
+    # 기존 데이터의 스키마 확인 (첫 번째 사용자 데이터 기준)
+    print("🔧 기존 데이터 스키마 확인 중...")
+    reference_user_id = list(valid_users.keys())[0]
+    reference_df = valid_users[reference_user_id].to_pandas()
+    existing_columns = set(reference_df.columns)
+    print(f"  📋 기존 데이터 컬럼 수: {len(existing_columns)}")
+    print(f"  📋 기존 데이터 컬럼: {sorted(existing_columns)}")
+    # 새로운 사용자 데이터를 기존 스키마에 맞춤
+    print("🔧 새로운 사용자 데이터를 기존 스키마에 맞추는 중...")
+    for user_id in new_user_datasets.keys():
+        df = new_user_datasets[user_id].to_pandas()
+        # 기존에 없는 컬럼 제거
+        columns_to_remove = set(df.columns) - existing_columns
+        if columns_to_remove:
+            df = df.drop(columns=list(columns_to_remove))
+            print(f"  ⚠️ {user_id}: 불필요한 컬럼 제거: {columns_to_remove}")
+        # 기존에 있는데 없는 컬럼 추가 (None으로)
+        columns_to_add = existing_columns - set(df.columns)
+        if columns_to_add:
+            for col in columns_to_add:
+                df[col] = None
+            print(f"  ➕ {user_id}: 누락된 컬럼 추가: {columns_to_add}")
+        # 컬럼 순서를 기존 데이터와 동일하게 맞춤
+        df = df[list(reference_df.columns)]
+        new_user_datasets[user_id] = Dataset.from_pandas(df, preserve_index=False)
+        print(f"  ✅ {user_id}: 스키마 정규화 완료")
     # 기존 데이터셋에 새로운 사용자 데이터 추가
     final_datasets = {}
     # 기존 사용자 데이터 유지
     print(f"📊 전체 데이터셋 총 레코드 수: {total_records}")
     print(f"📊 새로운 parquet 파일 수: {len(new_user_datasets)}개")
+    # local_user로 시작하는 파일 삭제
+    print("🗑️ local_user로 시작하는 파일 삭제 중...")
+    try:
+        files_to_delete = []
+        for file_path in parquet_files:
+            filename = file_path.split("/")[-1] if "/" in file_path else file_path
+            filename_no_ext = filename.replace(".parquet", "")
+            # -00000-of-00001 부분이 있으면 제거
+            if "-" in filename_no_ext:
+                user_id = filename_no_ext.split("-")[0]
+            else:
+                user_id = filename_no_ext
+            if user_id.startswith("local_user"):
+                files_to_delete.append(file_path)
+        for file_path in files_to_delete:
+            try:
+                api.delete_file(path_in_repo=file_path, repo_id=repo_id, repo_type="dataset", token=token)
+                print(f"  ✅ 삭제: {file_path}")
+            except Exception as e:
+                print(f"  ⚠️ 삭제 실패 ({file_path}): {str(e)[:100]}")
+        if files_to_delete:
+            print(f"🗑️ {len(files_to_delete)}개 파일 삭제 완료")
+        else:
+            print("ℹ️ 삭제할 local_user 파일이 없습니다")
+    except Exception as e:
+        print(f"⚠️ 파일 삭제 중 오류 발생: {str(e)[:100]}")
     print(f"📤 Hugging Face Hub에 업로드 중: {repo_id}")
     final_dict.push_to_hub(repo_id, token=token, private=True)
     print("✅ 업로드 완료")