Merry99 committed on
Commit
e2b68a6
ยท
1 Parent(s): 500a872

add augment py

Browse files
Files changed (1) hide show
  1. augment_dataset.py +416 -183
augment_dataset.py CHANGED
@@ -22,222 +22,376 @@ def require_env(var_name: str) -> str:
22
  return value
23
 
24
 
25
def add_noise(value: float, noise_scale: float) -> float:
    """Return *value* plus uniform noise in [-noise_scale, noise_scale], rounded to 4 places.

    A ``None`` input passes through unchanged so optional fields stay optional.
    """
    if value is None:
        return None
    perturbation = random.uniform(-noise_scale, noise_scale)
    return round(value + perturbation, 4)
30
 
31
 
32
def bounded(value: float, low: float, high: float) -> float:
    """Clamp *value* into the closed interval [low, high]; ``None`` passes through."""
    if value is None:
        return None
    if value < low:
        return low
    if value > high:
        return high
    return value
37
 
38
 
39
def augment_record(original: dict, noise_scale: float = 0.1) -> dict:
    """Augment a single record with correlated, physically plausible noise.

    Per-field noise levels are chosen so that physical relationships stay
    roughly consistent: the gravity vector is renormalized to magnitude ~9.8,
    stability is derived from entropy, and fatigue shifts with the RMS and
    frequency ratios.

    Args:
        original: flat dict of sensor/derived features for one window.
        noise_scale: base relative noise level (kept for interface
            compatibility; per-field scales below are fixed constants).

    Returns:
        A new dict; ``original`` is not modified.
    """
    augmented = original.copy()

    # Timestamp: jitter by at most ±200 ms to preserve temporal continuity.
    if "timestamp_utc" in augmented and augmented["timestamp_utc"]:
        try:
            base_time = datetime.fromisoformat(augmented["timestamp_utc"].replace("Z", "+00:00"))
            time_delta = timedelta(milliseconds=random.randint(-200, 200))
            augmented["timestamp_utc"] = (base_time + time_delta).isoformat()
        except (ValueError, TypeError, AttributeError):
            pass  # leave an unparseable timestamp untouched

    # Window bookkeeping: small shifts; end stays pinned to start + 2000 ms
    # (matches window_size_ms).
    if "window_id" in augmented:
        augmented["window_id"] = augmented["window_id"] + random.randint(-1, 1)
    if "window_start_ms" in augmented:
        augmented["window_start_ms"] = augmented["window_start_ms"] + random.randint(-50, 50)
    # BUGFIX: only derive window_end_ms when window_start_ms exists; the
    # original indexed augmented["window_start_ms"] unconditionally and
    # raised KeyError for rows that had an end but no start.
    if "window_end_ms" in augmented and "window_start_ms" in augmented:
        augmented["window_end_ms"] = augmented["window_start_ms"] + 2000

    # Accelerometer means: relative noise proportional to magnitude.
    for axis in ("acc_x_mean", "acc_y_mean", "acc_z_mean"):
        if augmented.get(axis) is not None:
            augmented[axis] = add_noise(augmented[axis], abs(augmented[axis]) * 0.1 + 0.01)

    # Gyroscope means: small absolute noise.
    for axis in ("gyro_x_mean", "gyro_y_mean", "gyro_z_mean"):
        if augmented.get(axis) is not None:
            augmented[axis] = add_noise(augmented[axis], 0.005)

    # Linear acceleration means.
    for axis in ("linacc_x_mean", "linacc_y_mean", "linacc_z_mean"):
        if augmented.get(axis) is not None:
            augmented[axis] = add_noise(augmented[axis], abs(augmented[axis]) * 0.1 + 0.01)

    # Gravity vector: perturb slightly, then renormalize magnitude to ~9.8
    # (physical constraint).
    gravity_fields = ("gravity_x_mean", "gravity_y_mean", "gravity_z_mean")
    if all(augmented.get(f) is not None for f in gravity_fields):
        gx = augmented["gravity_x_mean"] + random.uniform(-0.01, 0.01)
        gy = augmented["gravity_y_mean"] + random.uniform(-0.01, 0.01)
        gz = augmented["gravity_z_mean"] + random.uniform(-0.02, 0.02)
        g_mag = np.sqrt(gx ** 2 + gy ** 2 + gz ** 2)
        if g_mag > 0:
            scale = 9.8 / g_mag
            augmented["gravity_x_mean"] = round(gx * scale, 4)
            augmented["gravity_y_mean"] = round(gy * scale, 4)
            augmented["gravity_z_mean"] = round(gz * scale, 4)

    # Std-dev fields: ±10% relative noise, clamped to a plausible range.
    for field in ("acc_x_std", "acc_y_std", "acc_z_std",
                  "gyro_x_std", "gyro_y_std", "gyro_z_std"):
        if augmented.get(field) is not None:
            augmented[field] = bounded(add_noise(augmented[field], augmented[field] * 0.1), 0.01, 1.0)

    # RMS values: keep close to the original range. (The original version
    # also computed the mean-vector magnitudes here but never used them;
    # that dead computation is removed.)
    if augmented.get("rms_acc") is not None:
        rms_base = augmented["rms_acc"]
        augmented["rms_acc"] = bounded(add_noise(rms_base, rms_base * 0.1), 0.1, 2.0)
    if augmented.get("rms_gyro") is not None:
        rms_gyro_base = augmented["rms_gyro"]
        augmented["rms_gyro"] = bounded(add_noise(rms_gyro_base, rms_gyro_base * 0.1), 0.01, 0.5)

    # Mean frequencies: weak positive coupling with RMS.
    if augmented.get("mean_freq_acc") is not None:
        freq_factor = 1.0 + (augmented.get("rms_acc", 0) or 0) * 0.1
        augmented["mean_freq_acc"] = round(add_noise(augmented["mean_freq_acc"] * freq_factor, 1.0) / freq_factor, 2)
    if augmented.get("mean_freq_gyro") is not None:
        freq_factor = 1.0 + (augmented.get("rms_gyro", 0) or 0) * 0.2
        augmented["mean_freq_gyro"] = round(add_noise(augmented["mean_freq_gyro"] * freq_factor, 0.5) / freq_factor, 2)

    # Entropy: small noise, clamped.
    if augmented.get("entropy_acc") is not None:
        augmented["entropy_acc"] = bounded(add_noise(augmented["entropy_acc"], 0.02), 0.1, 1.0)
    if augmented.get("entropy_gyro") is not None:
        augmented["entropy_gyro"] = bounded(add_noise(augmented["entropy_gyro"], 0.02), 0.1, 1.0)

    # Jerk (rate of change of acceleration).
    if augmented.get("jerk_mean") is not None:
        augmented["jerk_mean"] = add_noise(augmented["jerk_mean"], 0.01)
    if augmented.get("jerk_std") is not None:
        augmented["jerk_std"] = bounded(add_noise(augmented["jerk_std"], 0.005), 0.01, 0.2)

    # Stability index: inversely related to entropy (high entropy => low stability).
    if augmented.get("stability_index") is not None:
        entropy_avg = ((augmented.get("entropy_acc", 0.5) or 0.5) + (augmented.get("entropy_gyro", 0.5) or 0.5)) / 2
        stability_base = 1.0 - entropy_avg * 0.3  # entropy-based estimate
        augmented["stability_index"] = bounded(add_noise(stability_base, 0.02), 0.4, 0.99)

    # Fatigue: higher RMS and lower frequency push fatigue up.
    if augmented.get("fatigue") is not None:
        rms_factor = (augmented.get("rms_acc", 0) or 0) / (augmented.get("rms_base", 1.0) or 1.0)
        freq_factor = (augmented.get("mean_freq_acc", 40) or 40) / (augmented.get("freq_base", 40) or 40)
        fatigue_delta = (rms_factor - 1.0) * 0.05 - (freq_factor - 1.0) * 0.03 + random.uniform(-0.03, 0.03)
        augmented["fatigue"] = bounded(augmented["fatigue"] + fatigue_delta, 0.05, 0.95)
        augmented["fatigue_level"] = 0 if augmented["fatigue"] < 0.3 else 1 if augmented["fatigue"] < 0.6 else 2

    # Previous fatigue tracks current fatigue for continuity.
    if augmented.get("fatigue_prev") is not None:
        if augmented.get("fatigue") is not None:
            # Previous fatigue sits slightly below (or equal to) the current one.
            augmented["fatigue_prev"] = bounded(augmented["fatigue"] - random.uniform(0, 0.1), 0.05, 0.95)
        else:
            augmented["fatigue_prev"] = bounded(add_noise(augmented["fatigue_prev"], 0.02), 0.05, 0.95)

    # user_emb: tiny per-dimension noise (JSON-encoded lists are decoded first).
    if augmented.get("user_emb") is not None:
        emb_list = augmented["user_emb"]
        if isinstance(emb_list, str):
            try:
                emb_list = json.loads(emb_list)
            except (ValueError, TypeError):
                pass  # non-JSON string: the isinstance check below skips it
        if isinstance(emb_list, list) and len(emb_list) > 0:
            augmented["user_emb"] = [round(v + random.uniform(-0.01, 0.01), 4) for v in emb_list]

    # overlap_rate: small perturbation inside its plausible range.
    if augmented.get("overlap_rate") is not None:
        augmented["overlap_rate"] = bounded(add_noise(augmented["overlap_rate"], 0.02), 0.3, 0.7)

    # quality_flag: flip with 5% probability.
    if "quality_flag" in augmented and random.random() < 0.05:
        augmented["quality_flag"] = 0 if augmented["quality_flag"] == 1 else 1

    # session_id: shift the trailing numeric suffix slightly.
    if augmented.get("session_id"):
        parts = augmented["session_id"].split("_")
        if len(parts) > 1:
            try:
                session_num = int(parts[-1])
            except (ValueError, TypeError):
                pass  # non-numeric suffix: keep the id as-is
            else:
                augmented["session_id"] = "_".join(parts[:-1]) + "_" + str(session_num + random.randint(-5, 5))

    return augmented
208
 
209
 
210
def augment_user_data(df: pd.DataFrame, target_count: int) -> pd.DataFrame:
    """Grow one user's frame to *target_count* rows by augmenting random rows.

    Returns the frame unchanged when empty, truncated when it already has
    enough rows, otherwise the original rows followed by augmented copies.
    """
    current_count = len(df)
    if current_count == 0:
        return df
    if current_count >= target_count:
        # Already enough data: keep only the first target_count rows.
        return df.head(target_count)

    generated = []
    for _ in range(target_count - current_count):
        # Pick a random source row, then perturb it with a random noise level.
        source = df.iloc[random.randint(0, current_count - 1)].to_dict()
        noise_scale = random.uniform(0.05, 0.15)
        generated.append(augment_record(source, noise_scale))

    # Original rows first, synthetic rows appended after.
    return pd.concat([df, pd.DataFrame(generated)], ignore_index=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
 
243
  def main():
@@ -270,6 +424,11 @@ def main():
270
  else:
271
  user_id = filename_no_ext
272
 
 
 
 
 
 
273
  # ๊ฐœ๋ณ„ ํŒŒ์ผ์„ pandas๋กœ ์ง์ ‘ ๋กœ๋“œ
274
  from huggingface_hub import hf_hub_download
275
  import tempfile
@@ -296,9 +455,13 @@ def main():
296
  print(f"โŒ ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์™„์ „ ์‹คํŒจ: {e3}")
297
  return
298
 
299
- # ์œ ํšจํ•œ ์‚ฌ์šฉ์ž๋งŒ ํ•„ํ„ฐ๋ง (๋ฐ์ดํ„ฐ๊ฐ€ ์žˆ๋Š” ์‚ฌ์šฉ์ž๋งŒ)
300
  valid_users = {}
301
  for user_id in existing.keys():
 
 
 
 
302
  try:
303
  user_data = existing[user_id]
304
  if len(user_data) > 0:
@@ -346,10 +509,18 @@ def main():
346
  continue
347
 
348
  try:
349
- # ์ฐธ์กฐ ๋ฐ์ดํ„ฐ๋ฅผ ์ฆํญํ•˜์—ฌ ์ƒˆ๋กœ์šด ์‚ฌ์šฉ์ž ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
350
- new_user_df = augment_user_data(reference_df, RECORDS_PER_USER)
 
 
 
 
 
351
  new_user_datasets[new_user_id] = Dataset.from_pandas(new_user_df, preserve_index=False)
352
- print(f"๐Ÿ“ˆ {new_user_id}: {RECORDS_PER_USER} ๋ ˆ์ฝ”๋“œ ์ƒ์„ฑ (์ฐธ์กฐ: {reference_user_id})")
 
 
 
353
  except Exception as e:
354
  print(f"โŒ {new_user_id}: ์ƒ์„ฑ ์‹คํŒจ ({e}), ๊ฑด๋„ˆ๋œ€")
355
  continue
@@ -358,6 +529,38 @@ def main():
358
  print("โŒ ์ƒˆ๋กœ์šด ์‚ฌ์šฉ์ž ๋ฐ์ดํ„ฐ๊ฐ€ ์ƒ์„ฑ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
359
  return
360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  # ๊ธฐ์กด ๋ฐ์ดํ„ฐ์…‹์— ์ƒˆ๋กœ์šด ์‚ฌ์šฉ์ž ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
362
  final_datasets = {}
363
  # ๊ธฐ์กด ์‚ฌ์šฉ์ž ๋ฐ์ดํ„ฐ ์œ ์ง€
@@ -374,6 +577,36 @@ def main():
374
  print(f"๐Ÿ“Š ์ „์ฒด ๋ฐ์ดํ„ฐ์…‹ ์ด ๋ ˆ์ฝ”๋“œ ์ˆ˜: {total_records}")
375
  print(f"๐Ÿ“Š ์ƒˆ๋กœ์šด parquet ํŒŒ์ผ ์ˆ˜: {len(new_user_datasets)}๊ฐœ")
376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  print(f"๐Ÿ“ค Hugging Face Hub์— ์—…๋กœ๋“œ ์ค‘: {repo_id}")
378
  final_dict.push_to_hub(repo_id, token=token, private=True)
379
  print("โœ… ์—…๋กœ๋“œ ์™„๋ฃŒ")
 
22
  return value
23
 
24
 
25
def jitter(value: float, scale: float = 0.02) -> float:
    """Multiply *value* by a random factor drawn from [1 - scale, 1 + scale].

    ``None`` passes through unchanged so optional sensor fields stay optional.
    """
    if value is None:
        return None
    factor = 1 + random.uniform(-scale, scale)
    return value * factor
30
 
31
 
32
def jitter_abs(value: float, amount: float) -> float:
    """Add uniform noise in [-amount, +amount] to *value* (``None`` passes through)."""
    if value is None:
        return None
    delta = random.uniform(-amount, amount)
    return value + delta
37
 
38
 
39
def augment_sensor_vector(x: float, y: float, z: float, noise: float = 0.02) -> tuple:
    """Augment a 3-axis sensor reading in a physically plausible way.

    All three axes are scaled by one shared random factor (preserving the
    vector's relative orientation), then each axis receives a small
    independent offset modelling sensor jitter. Any ``None`` component
    short-circuits and the input is returned untouched.
    """
    if x is None or y is None or z is None:
        return (x, y, z)
    shared_scale = 1 + random.uniform(-noise, noise)
    return tuple(
        round(axis * shared_scale + random.uniform(-0.01, 0.01), 4)
        for axis in (x, y, z)
    )
52
+
53
+
54
def compute_rms(x: float, y: float, z: float, base_noise: float = 0.02) -> float:
    """Recompute an RMS-style magnitude from the 3-axis means, with mild noise.

    Returns ``None`` when any component is missing.
    """
    if x is None or y is None or z is None:
        return None
    magnitude = np.sqrt(x ** 2 + y ** 2 + z ** 2)
    noisy = magnitude * (1 + random.uniform(-base_noise, base_noise))
    return round(noisy, 4)
60
+
61
+
62
def augment_record_strict(row: dict) -> dict:
    """Augment one sensor record while preserving physical constraints.

    Correlations maintained:
      * the gravity vector's magnitude is renormalized to ~9.8,
      * RMS values are recomputed from the perturbed 3-axis means,
      * std fields scale with the RMS change,
      * entropy rises with RMS and stability_index moves inversely to entropy,
      * fatigue shifts with the RMS/frequency ratios.

    Returns a new dict; *row* is not modified.
    """
    new = row.copy()

    # Timestamp: jitter by at most ±150 ms.
    if "timestamp_utc" in row and isinstance(row["timestamp_utc"], str):
        try:
            t = datetime.fromisoformat(row["timestamp_utc"].replace("Z", "+00:00"))
            new["timestamp_utc"] = (t + timedelta(milliseconds=random.randint(-150, 150))).isoformat()
        except ValueError:
            pass  # keep an unparseable timestamp as-is

    # Window bookkeeping.
    if "window_id" in row and row["window_id"] is not None:
        new["window_id"] = int(row["window_id"] + random.randint(-1, 1))
    if "window_start_ms" in row and row["window_start_ms"] is not None:
        new["window_start_ms"] = row["window_start_ms"] + random.randint(-50, 50)
    if "window_end_ms" in row and row["window_end_ms"] is not None:
        # BUGFIX: the original indexed new["window_start_ms"] unconditionally,
        # raising KeyError for rows that had an end but no start.
        window_start = new.get("window_start_ms")
        if window_start is not None:
            new["window_end_ms"] = window_start + 2000  # matches window_size_ms

    # --- Accelerometer mean ---
    if all(row.get(f) is not None for f in ("acc_x_mean", "acc_y_mean", "acc_z_mean")):
        new["acc_x_mean"], new["acc_y_mean"], new["acc_z_mean"] = augment_sensor_vector(
            row["acc_x_mean"], row["acc_y_mean"], row["acc_z_mean"], noise=0.03
        )

    # --- Gyro mean ---
    if all(row.get(f) is not None for f in ("gyro_x_mean", "gyro_y_mean", "gyro_z_mean")):
        new["gyro_x_mean"], new["gyro_y_mean"], new["gyro_z_mean"] = augment_sensor_vector(
            row["gyro_x_mean"], row["gyro_y_mean"], row["gyro_z_mean"], noise=0.03
        )

    # --- Linear accel mean ---
    if all(row.get(f) is not None for f in ("linacc_x_mean", "linacc_y_mean", "linacc_z_mean")):
        new["linacc_x_mean"], new["linacc_y_mean"], new["linacc_z_mean"] = augment_sensor_vector(
            row["linacc_x_mean"], row["linacc_y_mean"], row["linacc_z_mean"], noise=0.03
        )

    # --- Gravity vector (physical constraint: magnitude ~9.8) ---
    if all(row.get(f) is not None for f in ("gravity_x_mean", "gravity_y_mean", "gravity_z_mean")):
        gx, gy, gz = augment_sensor_vector(
            row["gravity_x_mean"], row["gravity_y_mean"], row["gravity_z_mean"], noise=0.01
        )
        g_mag = np.sqrt(gx**2 + gy**2 + gz**2)
        if g_mag > 0:
            scale = 9.8 / g_mag
            new["gravity_x_mean"] = round(gx * scale, 4)
            new["gravity_y_mean"] = round(gy * scale, 4)
            new["gravity_z_mean"] = round(gz * scale, 4)

    # --- Recompute RMS from the (possibly perturbed) sensor means ---
    if all(new.get(f) is not None for f in ("acc_x_mean", "acc_y_mean", "acc_z_mean")):
        new["rms_acc"] = compute_rms(new["acc_x_mean"], new["acc_y_mean"], new["acc_z_mean"], base_noise=0.03)
    elif row.get("rms_acc") is not None:
        new["rms_acc"] = jitter(row["rms_acc"], 0.03)

    if all(new.get(f) is not None for f in ("gyro_x_mean", "gyro_y_mean", "gyro_z_mean")):
        new["rms_gyro"] = compute_rms(new["gyro_x_mean"], new["gyro_y_mean"], new["gyro_z_mean"], base_noise=0.03)
    elif row.get("rms_gyro") is not None:
        new["rms_gyro"] = jitter(row["rms_gyro"], 0.03)

    # --- std values scale with the RMS change ---
    if new.get("rms_acc") is not None and row.get("rms_acc") is not None and row["rms_acc"] > 0:
        rms_ratio = new["rms_acc"] / row["rms_acc"]
        for col in ("acc_x_std", "acc_y_std", "acc_z_std"):
            if row.get(col) is not None:
                new[col] = max(0.01, row[col] * rms_ratio * jitter(1, 0.1))

    if new.get("rms_gyro") is not None and row.get("rms_gyro") is not None and row["rms_gyro"] > 0:
        rms_ratio = new["rms_gyro"] / row["rms_gyro"]
        for col in ("gyro_x_std", "gyro_y_std", "gyro_z_std"):
            if row.get(col) is not None:
                new[col] = max(0.001, row[col] * rms_ratio * jitter(1, 0.1))

    # --- frequency (weak positive correlation with RMS) ---
    if row.get("mean_freq_acc") is not None and new.get("rms_acc") is not None:
        new["mean_freq_acc"] = round(jitter_abs(row["mean_freq_acc"], new["rms_acc"] * 0.3), 2)
    elif row.get("mean_freq_acc") is not None:
        new["mean_freq_acc"] = round(jitter(row["mean_freq_acc"], 0.02), 2)

    if row.get("mean_freq_gyro") is not None and new.get("rms_gyro") is not None:
        new["mean_freq_gyro"] = round(jitter_abs(row["mean_freq_gyro"], new["rms_gyro"] * 0.3), 2)
    elif row.get("mean_freq_gyro") is not None:
        new["mean_freq_gyro"] = round(jitter(row["mean_freq_gyro"], 0.02), 2)

    # --- entropy: increases when RMS increases ---
    if row.get("entropy_acc") is not None and new.get("rms_acc") is not None and row.get("rms_acc") is not None and row["rms_acc"] > 0:
        new["entropy_acc"] = min(1.0, max(0.05, row["entropy_acc"] * (new["rms_acc"] / row["rms_acc"]) * jitter(1, 0.1)))
    elif row.get("entropy_acc") is not None:
        new["entropy_acc"] = min(1.0, max(0.05, jitter(row["entropy_acc"], 0.02)))

    if row.get("entropy_gyro") is not None and new.get("rms_gyro") is not None and row.get("rms_gyro") is not None and row["rms_gyro"] > 0:
        new["entropy_gyro"] = min(1.0, max(0.05, row["entropy_gyro"] * (new["rms_gyro"] / row["rms_gyro"]) * jitter(1, 0.1)))
    elif row.get("entropy_gyro") is not None:
        new["entropy_gyro"] = min(1.0, max(0.05, jitter(row["entropy_gyro"], 0.02)))

    # --- jerk follows the accelerometer std when available ---
    if row.get("jerk_mean") is not None:
        if row.get("acc_x_std") is not None:
            new["jerk_mean"] = round(jitter_abs(row["jerk_mean"], row["acc_x_std"] * 0.3), 4)
        else:
            new["jerk_mean"] = round(jitter(row["jerk_mean"], 0.02), 4)

    if row.get("jerk_std") is not None:
        if row.get("acc_x_std") is not None:
            new["jerk_std"] = max(0.001, round(jitter_abs(row["jerk_std"], row["acc_x_std"] * 0.1), 4))
        else:
            new["jerk_std"] = max(0.001, round(jitter(row["jerk_std"], 0.01), 4))

    # --- stability index (inverse to entropy) ---
    entropy_avg = 0.5  # neutral default when no entropy is available
    if new.get("entropy_acc") is not None and new.get("entropy_gyro") is not None:
        entropy_avg = (new["entropy_acc"] + new["entropy_gyro"]) / 2
    elif new.get("entropy_acc") is not None:
        entropy_avg = new["entropy_acc"]
    elif new.get("entropy_gyro") is not None:
        entropy_avg = new["entropy_gyro"]

    new["stability_index"] = round(max(0.4, min(0.99, 1 - entropy_avg * 0.3)), 4)

    # --- fatigue model (RMS / frequency based) ---
    # Temporal continuity is applied later by augment_user_data, which may
    # overwrite the value set here.
    if row.get("fatigue") is not None:
        if new.get("rms_acc") is not None and row.get("rms_acc") is not None and row["rms_acc"] > 0.1:
            rms_factor = new["rms_acc"] / row["rms_acc"]
        else:
            rms_factor = 1.0

        if new.get("mean_freq_acc") is not None and row.get("mean_freq_acc") is not None and row["mean_freq_acc"] > 1:
            freq_factor = row["mean_freq_acc"] / new["mean_freq_acc"]
        else:
            freq_factor = 1.0

        fatigue_delta = rms_factor * 0.05 - freq_factor * 0.03
        new["fatigue"] = min(0.95, max(0.05, row["fatigue"] + fatigue_delta + random.uniform(-0.02, 0.02)))
        new["fatigue_level"] = 0 if new["fatigue"] < 0.3 else 1 if new["fatigue"] < 0.6 else 2
    else:
        # No fatigue in the source row: fall back to a low default.
        new["fatigue"] = 0.1
        new["fatigue_level"] = 0

    # fatigue_prev is finalized by augment_user_data.
    if row.get("fatigue_prev") is not None:
        new["fatigue_prev"] = row["fatigue_prev"]
    else:
        new["fatigue_prev"] = 0.05

    # --- baseline values are preserved verbatim ---
    if "rms_base" in row:
        new["rms_base"] = row["rms_base"]
    if "freq_base" in row:
        new["freq_base"] = row["freq_base"]

    # --- user_emb: identity feature, NEVER perturbed ---
    if "user_emb" in row:
        new["user_emb"] = row["user_emb"]

    # --- other fields ---
    if row.get("overlap_rate") is not None:
        new["overlap_rate"] = max(0.3, min(0.7, jitter(row["overlap_rate"], 0.02)))

    if "window_size_ms" in row:
        new["window_size_ms"] = row["window_size_ms"]

    if "quality_flag" in row:
        if random.random() < 0.05:  # flip with 5% probability
            new["quality_flag"] = 0 if row["quality_flag"] == 1 else 1
        else:
            new["quality_flag"] = row["quality_flag"]

    # session_id: nudge the trailing numeric suffix slightly.
    if row.get("session_id"):
        parts = str(row["session_id"]).split("_")
        if len(parts) > 1:
            try:
                session_num = int(parts[-1])
            except ValueError:
                new["session_id"] = row["session_id"]  # non-numeric suffix: keep as-is
            else:
                new["session_id"] = "_".join(parts[:-1]) + "_" + str(session_num + random.randint(-5, 5))
        else:
            new["session_id"] = row["session_id"]

    return new
252
 
253
 
254
def augment_user_data(df: pd.DataFrame, target_count: int, new_user_id: str = None) -> pd.DataFrame:
    """Grow per-user data to *target_count* rows, keeping temporal continuity.

    Args:
        df: reference rows for one user.
        target_count: desired number of rows.
        new_user_id: when given (or when ``df`` is empty), rows are generated
            as a brand-new user: window ids restart at 1, timestamps are laid
            out contiguously and fatigue evolves gradually.

    Returns:
        A DataFrame with exactly ``target_count`` generated rows for a new
        user, or the original rows plus appended continuations for an
        existing user.
    """
    if len(df) >= target_count:
        return df.head(target_count)

    need = target_count - len(df)

    # A "new user" is signalled by an explicit id or an empty reference frame.
    is_new_user = new_user_id is not None or len(df) == 0

    window_interval = 2000  # ms; matches window_size_ms

    if is_new_user and len(df) > 0:
        # New users always get exactly target_count generated rows,
        # regardless of the reference frame's length.
        base_row = df.iloc[0].to_dict()

        # Anchor the synthetic timeline at the reference start time.
        if "timestamp_utc" in base_row and base_row["timestamp_utc"]:
            try:
                base_time = datetime.fromisoformat(str(base_row["timestamp_utc"]).replace("Z", "+00:00"))
            except ValueError:
                base_time = datetime.now(timezone.utc)
        else:
            base_time = datetime.now(timezone.utc)

        base_window_id = 1      # new users start window ids at 1
        base_window_start = 0   # and window_start_ms at 0
        prev_fatigue = base_row.get("fatigue", 0.1) if base_row.get("fatigue") is not None else 0.1

        new_rows = []
        for i in range(target_count):
            # Sample a reference record and perturb it.
            sample = df.iloc[random.randint(0, len(df) - 1)].to_dict()
            new_row = augment_record_strict(sample)

            if new_user_id:
                new_row["user_id"] = new_user_id

            # Contiguous window layout.
            new_row["window_id"] = base_window_id + i
            new_row["window_start_ms"] = base_window_start + i * window_interval
            new_row["window_end_ms"] = new_row["window_start_ms"] + window_interval

            # Contiguous timestamps.
            new_row["timestamp_utc"] = (base_time + timedelta(milliseconds=i * window_interval)).isoformat()

            # Fatigue continuity: the previous record's fatigue feeds the next.
            if i > 0:
                new_row["fatigue_prev"] = prev_fatigue
            else:
                # First record starts slightly below the reference fatigue.
                new_row["fatigue_prev"] = max(0.05, prev_fatigue - random.uniform(0, 0.05))

            if "fatigue" in new_row and new_row["fatigue"] is not None:
                # Fatigue tends to creep upward over time, with small noise.
                fatigue_base = new_row.get("fatigue_prev", prev_fatigue)
                fatigue_increase = random.uniform(0, 0.02)
                new_row["fatigue"] = min(0.95, max(0.05, fatigue_base + fatigue_increase + random.uniform(-0.01, 0.01)))
                new_row["fatigue_level"] = 0 if new_row["fatigue"] < 0.3 else 1 if new_row["fatigue"] < 0.6 else 2
                prev_fatigue = new_row["fatigue"]

            # Fresh sessions for a fresh user: one session per 10 records.
            if "session_id" in new_row:
                new_row["session_id"] = f"session_{i // 10 + 1:03d}"

            # Only set measure_date when the reference schema has it.
            if "measure_date" in sample:
                try:
                    measure_time = datetime.fromisoformat(new_row["timestamp_utc"].replace("Z", "+00:00"))
                    new_row["measure_date"] = measure_time.strftime("%Y-%m-%d")
                except ValueError:
                    new_row["measure_date"] = base_time.strftime("%Y-%m-%d")

            new_rows.append(new_row)

        return pd.DataFrame(new_rows)

    # Existing user: append `need` rows continuing from the last record.
    new_rows = []
    last_row = df.iloc[-1].to_dict()

    # Timeline continues from the last record's timestamp.
    if "timestamp_utc" in last_row and last_row["timestamp_utc"]:
        try:
            last_time = datetime.fromisoformat(str(last_row["timestamp_utc"]).replace("Z", "+00:00"))
        except ValueError:
            last_time = datetime.now(timezone.utc)
    else:
        last_time = datetime.now(timezone.utc)

    last_window_id = last_row.get("window_id", 0) if last_row.get("window_id") is not None else 0
    last_window_start = last_row.get("window_end_ms", 0) if last_row.get("window_end_ms") is not None else 0
    prev_fatigue = last_row.get("fatigue", 0.1) if last_row.get("fatigue") is not None else 0.1

    for i in range(need):
        # Sample a reference record and perturb it.
        sample = df.iloc[random.randint(0, len(df) - 1)].to_dict()
        new_row = augment_record_strict(sample)

        # Windows continue where the existing data stopped.
        new_row["window_id"] = last_window_id + i + 1
        new_row["window_start_ms"] = last_window_start + i * window_interval
        new_row["window_end_ms"] = new_row["window_start_ms"] + window_interval

        # Contiguous timestamps after the last record.
        new_row["timestamp_utc"] = (last_time + timedelta(milliseconds=(i + 1) * window_interval)).isoformat()

        # Fatigue continuity.
        new_row["fatigue_prev"] = prev_fatigue
        if "fatigue" in new_row and new_row["fatigue"] is not None:
            fatigue_increase = random.uniform(0, 0.02)  # gradual increase over time
            new_row["fatigue"] = min(0.95, max(0.05, prev_fatigue + fatigue_increase + random.uniform(-0.01, 0.01)))
            new_row["fatigue_level"] = 0 if new_row["fatigue"] < 0.3 else 1 if new_row["fatigue"] < 0.6 else 2
            prev_fatigue = new_row["fatigue"]

        # Only set measure_date when the reference schema has it.
        if "measure_date" in sample:
            try:
                measure_time = datetime.fromisoformat(new_row["timestamp_utc"].replace("Z", "+00:00"))
                new_row["measure_date"] = measure_time.strftime("%Y-%m-%d")
            except ValueError:
                new_row["measure_date"] = last_time.strftime("%Y-%m-%d")

        new_rows.append(new_row)

    return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
395
 
396
 
397
  def main():
 
424
  else:
425
  user_id = filename_no_ext
426
 
427
+ # local_user๋กœ ์‹œ์ž‘ํ•˜๋Š” ํŒŒ์ผ์€ ์ œ์™ธ
428
+ if user_id.startswith("local_user"):
429
+ print(f"โญ๏ธ {user_id}: local_user๋กœ ์‹œ์ž‘ํ•˜๋Š” ํŒŒ์ผ์€ ์ œ์™ธ")
430
+ continue
431
+
432
  # ๊ฐœ๋ณ„ ํŒŒ์ผ์„ pandas๋กœ ์ง์ ‘ ๋กœ๋“œ
433
  from huggingface_hub import hf_hub_download
434
  import tempfile
 
455
  print(f"โŒ ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ ์™„์ „ ์‹คํŒจ: {e3}")
456
  return
457
 
458
+ # ์œ ํšจํ•œ ์‚ฌ์šฉ์ž๋งŒ ํ•„ํ„ฐ๋ง (๋ฐ์ดํ„ฐ๊ฐ€ ์žˆ๋Š” ์‚ฌ์šฉ์ž๋งŒ, local_user ์ œ์™ธ)
459
  valid_users = {}
460
  for user_id in existing.keys():
461
+ # local_user๋กœ ์‹œ์ž‘ํ•˜๋Š” ์‚ฌ์šฉ์ž๋Š” ์ œ์™ธ
462
+ if user_id.startswith("local_user"):
463
+ print(f"โญ๏ธ {user_id}: local_user๋กœ ์‹œ์ž‘ํ•˜๋Š” ์‚ฌ์šฉ์ž๋Š” ์ œ์™ธ")
464
+ continue
465
  try:
466
  user_data = existing[user_id]
467
  if len(user_data) > 0:
 
509
  continue
510
 
511
  try:
512
+ # ์ฐธ์กฐ ๋ฐ์ดํ„ฐ๋ฅผ ์ฆํญํ•˜์—ฌ ์ƒˆ๋กœ์šด ์‚ฌ์šฉ์ž ๋ฐ์ดํ„ฐ ์ƒ์„ฑ (์ƒˆ ์‚ฌ์šฉ์ž ID ์ „๋‹ฌ)
513
+ new_user_df = augment_user_data(reference_df, RECORDS_PER_USER, new_user_id=new_user_id)
514
+ # user_id ์ปฌ๋Ÿผ์ด ์—†์œผ๋ฉด ์ถ”๊ฐ€
515
+ if "user_id" not in new_user_df.columns:
516
+ new_user_df["user_id"] = new_user_id
517
+ else:
518
+ new_user_df["user_id"] = new_user_id
519
  new_user_datasets[new_user_id] = Dataset.from_pandas(new_user_df, preserve_index=False)
520
+ actual_count = len(new_user_df)
521
+ print(f"๐Ÿ“ˆ {new_user_id}: {actual_count} ๋ ˆ์ฝ”๋“œ ์ƒ์„ฑ (์ฐธ์กฐ: {reference_user_id}, ๋ชฉํ‘œ: {RECORDS_PER_USER})")
522
+ if actual_count != RECORDS_PER_USER:
523
+ print(f" โš ๏ธ ๊ฒฝ๊ณ : ์ƒ์„ฑ๋œ ๋ ˆ์ฝ”๋“œ ์ˆ˜({actual_count})๊ฐ€ ๋ชฉํ‘œ({RECORDS_PER_USER})์™€ ๋‹ค๋ฆ…๋‹ˆ๋‹ค!")
524
  except Exception as e:
525
  print(f"โŒ {new_user_id}: ์ƒ์„ฑ ์‹คํŒจ ({e}), ๊ฑด๋„ˆ๋œ€")
526
  continue
 
529
  print("โŒ ์ƒˆ๋กœ์šด ์‚ฌ์šฉ์ž ๋ฐ์ดํ„ฐ๊ฐ€ ์ƒ์„ฑ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
530
  return
531
 
532
+ # ๊ธฐ์กด ๋ฐ์ดํ„ฐ์˜ ์Šคํ‚ค๋งˆ ํ™•์ธ (์ฒซ ๋ฒˆ์งธ ์‚ฌ์šฉ์ž ๋ฐ์ดํ„ฐ ๊ธฐ์ค€)
533
+ print("๐Ÿ”ง ๊ธฐ์กด ๋ฐ์ดํ„ฐ ์Šคํ‚ค๋งˆ ํ™•์ธ ์ค‘...")
534
+ reference_user_id = list(valid_users.keys())[0]
535
+ reference_df = valid_users[reference_user_id].to_pandas()
536
+ existing_columns = set(reference_df.columns)
537
+ print(f" ๐Ÿ“‹ ๊ธฐ์กด ๋ฐ์ดํ„ฐ ์ปฌ๋Ÿผ ์ˆ˜: {len(existing_columns)}")
538
+ print(f" ๐Ÿ“‹ ๊ธฐ์กด ๋ฐ์ดํ„ฐ ์ปฌ๋Ÿผ: {sorted(existing_columns)}")
539
+
540
+ # ์ƒˆ๋กœ์šด ์‚ฌ์šฉ์ž ๋ฐ์ดํ„ฐ๋ฅผ ๊ธฐ์กด ์Šคํ‚ค๋งˆ์— ๋งž์ถค
541
+ print("๐Ÿ”ง ์ƒˆ๋กœ์šด ์‚ฌ์šฉ์ž ๋ฐ์ดํ„ฐ๋ฅผ ๊ธฐ์กด ์Šคํ‚ค๋งˆ์— ๋งž์ถ”๋Š” ์ค‘...")
542
+ for user_id in new_user_datasets.keys():
543
+ df = new_user_datasets[user_id].to_pandas()
544
+
545
+ # ๊ธฐ์กด์— ์—†๋Š” ์ปฌ๋Ÿผ ์ œ๊ฑฐ
546
+ columns_to_remove = set(df.columns) - existing_columns
547
+ if columns_to_remove:
548
+ df = df.drop(columns=list(columns_to_remove))
549
+ print(f" โš ๏ธ {user_id}: ๋ถˆํ•„์š”ํ•œ ์ปฌ๋Ÿผ ์ œ๊ฑฐ: {columns_to_remove}")
550
+
551
+ # ๊ธฐ์กด์— ์žˆ๋Š”๋ฐ ์—†๋Š” ์ปฌ๋Ÿผ ์ถ”๊ฐ€ (None์œผ๋กœ)
552
+ columns_to_add = existing_columns - set(df.columns)
553
+ if columns_to_add:
554
+ for col in columns_to_add:
555
+ df[col] = None
556
+ print(f" โž• {user_id}: ๋ˆ„๋ฝ๋œ ์ปฌ๋Ÿผ ์ถ”๊ฐ€: {columns_to_add}")
557
+
558
+ # ์ปฌ๋Ÿผ ์ˆœ์„œ๋ฅผ ๊ธฐ์กด ๋ฐ์ดํ„ฐ์™€ ๋™์ผํ•˜๊ฒŒ ๋งž์ถค
559
+ df = df[list(reference_df.columns)]
560
+
561
+ new_user_datasets[user_id] = Dataset.from_pandas(df, preserve_index=False)
562
+ print(f" โœ… {user_id}: ์Šคํ‚ค๋งˆ ์ •๊ทœํ™” ์™„๋ฃŒ")
563
+
564
  # ๊ธฐ์กด ๋ฐ์ดํ„ฐ์…‹์— ์ƒˆ๋กœ์šด ์‚ฌ์šฉ์ž ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
565
  final_datasets = {}
566
  # ๊ธฐ์กด ์‚ฌ์šฉ์ž ๋ฐ์ดํ„ฐ ์œ ์ง€
 
577
  print(f"๐Ÿ“Š ์ „์ฒด ๋ฐ์ดํ„ฐ์…‹ ์ด ๋ ˆ์ฝ”๋“œ ์ˆ˜: {total_records}")
578
  print(f"๐Ÿ“Š ์ƒˆ๋กœ์šด parquet ํŒŒ์ผ ์ˆ˜: {len(new_user_datasets)}๊ฐœ")
579
 
580
+ # local_user๋กœ ์‹œ์ž‘ํ•˜๋Š” ํŒŒ์ผ ์‚ญ์ œ
581
+ print("๐Ÿ—‘๏ธ local_user๋กœ ์‹œ์ž‘ํ•˜๋Š” ํŒŒ์ผ ์‚ญ์ œ ์ค‘...")
582
+ try:
583
+ files_to_delete = []
584
+ for file_path in parquet_files:
585
+ filename = file_path.split("/")[-1] if "/" in file_path else file_path
586
+ filename_no_ext = filename.replace(".parquet", "")
587
+ # -00000-of-00001 ๋ถ€๋ถ„์ด ์žˆ์œผ๋ฉด ์ œ๊ฑฐ
588
+ if "-" in filename_no_ext:
589
+ user_id = filename_no_ext.split("-")[0]
590
+ else:
591
+ user_id = filename_no_ext
592
+
593
+ if user_id.startswith("local_user"):
594
+ files_to_delete.append(file_path)
595
+
596
+ for file_path in files_to_delete:
597
+ try:
598
+ api.delete_file(path_in_repo=file_path, repo_id=repo_id, repo_type="dataset", token=token)
599
+ print(f" โœ… ์‚ญ์ œ: {file_path}")
600
+ except Exception as e:
601
+ print(f" โš ๏ธ ์‚ญ์ œ ์‹คํŒจ ({file_path}): {str(e)[:100]}")
602
+
603
+ if files_to_delete:
604
+ print(f"๐Ÿ—‘๏ธ {len(files_to_delete)}๊ฐœ ํŒŒ์ผ ์‚ญ์ œ ์™„๋ฃŒ")
605
+ else:
606
+ print("โ„น๏ธ ์‚ญ์ œํ•  local_user ํŒŒ์ผ์ด ์—†์Šต๋‹ˆ๋‹ค")
607
+ except Exception as e:
608
+ print(f"โš ๏ธ ํŒŒ์ผ ์‚ญ์ œ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)[:100]}")
609
+
610
  print(f"๐Ÿ“ค Hugging Face Hub์— ์—…๋กœ๋“œ ์ค‘: {repo_id}")
611
  final_dict.push_to_hub(repo_id, token=token, private=True)
612
  print("โœ… ์—…๋กœ๋“œ ์™„๋ฃŒ")