Merry99 committed on
Commit
ead1bf0
·
1 Parent(s): 5726b96

fix normalizing windows

Browse files
Files changed (1) hide show
  1. app.py +40 -24
app.py CHANGED
@@ -250,6 +250,20 @@ async def upload_batch_dataset(payload: BatchUploadPayload):
250
  existing = load_dataset(hf_repo_id, token=hf_token)
251
  all_splits = list(existing.keys())
252
  print(f"๐Ÿ“‚ ๊ธฐ์กด splits: {all_splits}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  except Exception:
254
  existing = DatasetDict()
255
  print("๐Ÿ“‚ ๊ธฐ์กด repo ์—†์Œ โ†’ ์ƒˆ๋กœ ์ƒ์„ฑ")
@@ -279,11 +293,12 @@ async def upload_batch_dataset(payload: BatchUploadPayload):
279
  if user_id in existing:
280
  # ๊ธฐ์กด ๋ฐ์ดํ„ฐ ์ •๊ทœํ™” ๋ฐ ๋ณ‘ํ•ฉ
281
  old_df = existing[user_id].to_pandas()
 
282
  old_df["windows"] = old_df["windows"].apply(
283
- lambda w: [str(v) for v in w] if isinstance(w, list) else []
284
  )
285
  merged_df = pd.concat([old_df, new_df], ignore_index=True)
286
- existing[user_id] = Dataset.from_pandas(merged_df)
287
  print(f"๐Ÿ“Š {user_id}: ๊ธฐ์กด ๋ฐ์ดํ„ฐ์™€ ๋ณ‘ํ•ฉ ({len(old_df)} + {len(new_df)} = {len(merged_df)}๊ฐœ ๋ ˆ์ฝ”๋“œ)")
288
  else:
289
  existing[user_id] = new_dataset
@@ -326,35 +341,36 @@ async def upload_batch_dataset(payload: BatchUploadPayload):
326
  raise HTTPException(status_code=500, detail=f"๋ฐฐ์น˜ ํ‘ธ์‹œ ์‹คํŒจ: {str(e)}")
327
 
328
  def normalize_windows(record):
329
- w = record.get("windows")
330
  result = []
331
 
332
  if isinstance(w, list):
333
  for item in w:
334
  if isinstance(item, dict):
335
- result.extend([str(v) for v in item.values() if v is not None])
336
- elif item is not None:
 
 
 
337
  result.append(str(item))
338
  elif isinstance(w, dict):
339
- result = [str(v) for v in w.values() if v is not None]
340
-
341
- record["windows"] = [x for x in result if x not in ("", "None")]
 
 
 
 
 
 
 
342
  return record
343
 
344
  def df_to_dataset(df):
345
- import pyarrow as pa
346
- schema = pa.schema([
347
- ("session_id", pa.string()),
348
- ("measure_date", pa.string()),
349
- ("rms", pa.float64()),
350
- ("freq", pa.float64()),
351
- ("fatigue", pa.float64()),
352
- ("mode", pa.string()),
353
- ("window_count", pa.int64()),
354
- ("windows", pa.list_(pa.string())),
355
- ("measurement_count", pa.int64()),
356
- ("batch_date", pa.string()),
357
- ("batch_size", pa.int64()),
358
- ("timestamp", pa.string())
359
- ])
360
- return Dataset.from_pandas(df, schema=schema)
 
250
  existing = load_dataset(hf_repo_id, token=hf_token)
251
  all_splits = list(existing.keys())
252
  print(f"๐Ÿ“‚ ๊ธฐ์กด splits: {all_splits}")
253
+
254
+ # ๊ธฐ์กด ๋ฐ์ดํ„ฐ๋ฅผ ์™„์ „ํžˆ ์ƒˆ๋กœ ์ƒ์„ฑ (์Šคํ‚ค๋งˆ ํ†ต์ผ)
255
+ new_existing = DatasetDict()
256
+ for user_id in existing.keys():
257
+ df = existing[user_id].to_pandas()
258
+ # windows ํ•„๋“œ๋ฅผ ๋ฌธ์ž์—ด ๋ฆฌ์ŠคํŠธ๋กœ ๊ฐ•์ œ ๋ณ€ํ™˜
259
+ df["windows"] = df["windows"].apply(
260
+ lambda w: [str(v) for v in w] if isinstance(w, list) and len(w) > 0 else []
261
+ )
262
+ # ๋ชจ๋“  ๋ฐ์ดํ„ฐ๋ฅผ ์ƒˆ๋กœ ์ƒ์„ฑํ•˜์—ฌ ์Šคํ‚ค๋งˆ ํ†ต์ผ
263
+ new_existing[user_id] = df_to_dataset(df)
264
+ print(f"๐Ÿ”ง {user_id}: ๊ธฐ์กด ๋ฐ์ดํ„ฐ ์žฌ์ƒ์„ฑ ์™„๋ฃŒ")
265
+ existing = new_existing
266
+
267
  except Exception:
268
  existing = DatasetDict()
269
  print("๐Ÿ“‚ ๊ธฐ์กด repo ์—†์Œ โ†’ ์ƒˆ๋กœ ์ƒ์„ฑ")
 
293
  if user_id in existing:
294
  # ๊ธฐ์กด ๋ฐ์ดํ„ฐ ์ •๊ทœํ™” ๋ฐ ๋ณ‘ํ•ฉ
295
  old_df = existing[user_id].to_pandas()
296
+ # ๊ธฐ์กด windows ๋ฐ์ดํ„ฐ๋ฅผ ๋ฌธ์ž์—ด ๋ฆฌ์ŠคํŠธ๋กœ ์ •๊ทœํ™”
297
  old_df["windows"] = old_df["windows"].apply(
298
+ lambda w: [str(v) for v in w] if isinstance(w, list) and len(w) > 0 else []
299
  )
300
  merged_df = pd.concat([old_df, new_df], ignore_index=True)
301
+ existing[user_id] = df_to_dataset(merged_df)
302
  print(f"๐Ÿ“Š {user_id}: ๊ธฐ์กด ๋ฐ์ดํ„ฐ์™€ ๋ณ‘ํ•ฉ ({len(old_df)} + {len(new_df)} = {len(merged_df)}๊ฐœ ๋ ˆ์ฝ”๋“œ)")
303
  else:
304
  existing[user_id] = new_dataset
 
341
  raise HTTPException(status_code=500, detail=f"๋ฐฐ์น˜ ํ‘ธ์‹œ ์‹คํŒจ: {str(e)}")
342
 
343
def normalize_windows(record):
    """Coerce the "windows" field of *record* into a flat list of strings.

    A list contributes each item (dict items contribute their values);
    a top-level dict contributes its values; any other type yields an
    empty list. ``None`` and blank/whitespace-only values are dropped.
    Mutates *record* in place and returns it.
    """
    raw = record.get("windows", [])

    def _keep(value):
        # Drop None and anything that stringifies to empty/whitespace.
        return value is not None and str(value).strip()

    flattened = []
    if isinstance(raw, list):
        for entry in raw:
            if isinstance(entry, dict):
                # Flatten dict entries by taking their values as strings.
                flattened.extend(str(v) for v in entry.values() if _keep(v))
            elif _keep(entry):
                flattened.append(str(entry))
    elif isinstance(raw, dict):
        flattened = [str(v) for v in raw.values() if _keep(v)]
    # Any other type (str, number, None, ...) leaves flattened == [].

    record["windows"] = flattened
    print(f"๐Ÿ” Windows ์ •๊ทœํ™”: {raw} โ†’ {flattened}")
    return record
368
 
369
def df_to_dataset(df):
    """Convert a DataFrame to a HF Dataset, normalizing the "windows" column.

    Bug fixed: ``Dataset.to_pandas()`` round-trips list columns as
    ``numpy.ndarray`` objects, so the previous ``isinstance(x, list)``
    check was False for data loaded back from the hub and silently wiped
    every stored window to ``[]``. Accept any list-like value (list,
    tuple, ndarray) and coerce its elements to ``str`` so the inferred
    Arrow schema is consistently ``list<string>`` across splits, which
    is the point of this normalization.

    Args:
        df: pandas DataFrame; may or may not contain a "windows" column.
    Returns:
        datasets.Dataset built from the normalized DataFrame.
    """
    if "windows" in df.columns:
        def _as_str_list(value):
            # list/tuple/ndarray -> list[str]; scalars/None/NaN -> []
            if isinstance(value, (list, tuple)) or hasattr(value, "tolist"):
                return [str(v) for v in value]
            return []

        df["windows"] = df["windows"].apply(_as_str_list)
    return Dataset.from_pandas(df)