Spaces:
Sleeping
Sleeping
fix normalizing windows
Browse files
app.py
CHANGED
|
@@ -250,6 +250,20 @@ async def upload_batch_dataset(payload: BatchUploadPayload):
|
|
| 250 |
existing = load_dataset(hf_repo_id, token=hf_token)
|
| 251 |
all_splits = list(existing.keys())
|
| 252 |
print(f"๐ ๊ธฐ์กด splits: {all_splits}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
except Exception:
|
| 254 |
existing = DatasetDict()
|
| 255 |
print("๐ ๊ธฐ์กด repo ์์ โ ์๋ก ์์ฑ")
|
|
@@ -279,11 +293,12 @@ async def upload_batch_dataset(payload: BatchUploadPayload):
|
|
| 279 |
if user_id in existing:
|
| 280 |
# 기존 데이터 정규화 및 병합
|
| 281 |
old_df = existing[user_id].to_pandas()
|
|
|
|
| 282 |
old_df["windows"] = old_df["windows"].apply(
|
| 283 |
-
lambda w: [str(v) for v in w] if isinstance(w, list) else []
|
| 284 |
)
|
| 285 |
merged_df = pd.concat([old_df, new_df], ignore_index=True)
|
| 286 |
-
existing[user_id] =
|
| 287 |
print(f"๐ {user_id}: ๊ธฐ์กด ๋ฐ์ดํฐ์ ๋ณํฉ ({len(old_df)} + {len(new_df)} = {len(merged_df)}๊ฐ ๋ ์ฝ๋)")
|
| 288 |
else:
|
| 289 |
existing[user_id] = new_dataset
|
|
@@ -326,35 +341,36 @@ async def upload_batch_dataset(payload: BatchUploadPayload):
|
|
| 326 |
raise HTTPException(status_code=500, detail=f"๋ฐฐ์น ํธ์ ์คํจ: {str(e)}")
|
| 327 |
|
| 328 |
def normalize_windows(record):
|
| 329 |
-
w = record.get("windows")
|
| 330 |
result = []
|
| 331 |
|
| 332 |
if isinstance(w, list):
|
| 333 |
for item in w:
|
| 334 |
if isinstance(item, dict):
|
| 335 |
-
|
| 336 |
-
|
|
|
|
|
|
|
|
|
|
| 337 |
result.append(str(item))
|
| 338 |
elif isinstance(w, dict):
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
return record
|
| 343 |
|
| 344 |
def df_to_dataset(df):
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
("mode", pa.string()),
|
| 353 |
-
("window_count", pa.int64()),
|
| 354 |
-
("windows", pa.list_(pa.string())),
|
| 355 |
-
("measurement_count", pa.int64()),
|
| 356 |
-
("batch_date", pa.string()),
|
| 357 |
-
("batch_size", pa.int64()),
|
| 358 |
-
("timestamp", pa.string())
|
| 359 |
-
])
|
| 360 |
-
return Dataset.from_pandas(df, schema=schema)
|
|
|
|
| 250 |
existing = load_dataset(hf_repo_id, token=hf_token)
|
| 251 |
all_splits = list(existing.keys())
|
| 252 |
print(f"๐ ๊ธฐ์กด splits: {all_splits}")
|
| 253 |
+
|
| 254 |
+
# 기존 데이터를 완전히 새로 생성 (스키마 통일)
|
| 255 |
+
new_existing = DatasetDict()
|
| 256 |
+
for user_id in existing.keys():
|
| 257 |
+
df = existing[user_id].to_pandas()
|
| 258 |
+
# windows 필드를 문자열 리스트로 강제 변환
|
| 259 |
+
df["windows"] = df["windows"].apply(
|
| 260 |
+
lambda w: [str(v) for v in w] if isinstance(w, list) and len(w) > 0 else []
|
| 261 |
+
)
|
| 262 |
+
# 모든 데이터를 새로 생성하여 스키마 통일
|
| 263 |
+
new_existing[user_id] = df_to_dataset(df)
|
| 264 |
+
print(f"๐ง {user_id}: ๊ธฐ์กด ๋ฐ์ดํฐ ์ฌ์์ฑ ์๋ฃ")
|
| 265 |
+
existing = new_existing
|
| 266 |
+
|
| 267 |
except Exception:
|
| 268 |
existing = DatasetDict()
|
| 269 |
print("๐ ๊ธฐ์กด repo ์์ โ ์๋ก ์์ฑ")
|
|
|
|
| 293 |
if user_id in existing:
|
| 294 |
# 기존 데이터 정규화 및 병합
|
| 295 |
old_df = existing[user_id].to_pandas()
|
| 296 |
+
# 기존 windows 데이터를 문자열 리스트로 정규화
|
| 297 |
old_df["windows"] = old_df["windows"].apply(
|
| 298 |
+
lambda w: [str(v) for v in w] if isinstance(w, list) and len(w) > 0 else []
|
| 299 |
)
|
| 300 |
merged_df = pd.concat([old_df, new_df], ignore_index=True)
|
| 301 |
+
existing[user_id] = df_to_dataset(merged_df)
|
| 302 |
print(f"๐ {user_id}: ๊ธฐ์กด ๋ฐ์ดํฐ์ ๋ณํฉ ({len(old_df)} + {len(new_df)} = {len(merged_df)}๊ฐ ๋ ์ฝ๋)")
|
| 303 |
else:
|
| 304 |
existing[user_id] = new_dataset
|
|
|
|
| 341 |
raise HTTPException(status_code=500, detail=f"๋ฐฐ์น ํธ์ ์คํจ: {str(e)}")
|
| 342 |
|
| 343 |
def normalize_windows(record):
    """Flatten ``record["windows"]`` into a list of non-blank strings.

    The record is modified in place (its ``"windows"`` key is overwritten)
    and the same object is returned.

    Accepted shapes of ``record["windows"]``:

    * list or tuple -- every element is stringified; an element that is a
      dict contributes each of its values instead of itself.
    * dict -- each value is stringified.
    * missing or any other type -- treated as no windows (empty list).

    ``None`` and values whose string form is empty or whitespace-only are
    dropped. Kept values are stored as ``str(value)`` without stripping.

    Args:
        record: Mapping-like object supporting ``.get`` and item assignment.

    Returns:
        The same ``record`` with ``record["windows"]`` set to a list of str.
    """

    def _is_meaningful(value):
        # Single filter shared by every branch: skip None and blank strings.
        return value is not None and bool(str(value).strip())

    w = record.get("windows", [])

    if isinstance(w, dict):
        candidates = list(w.values())
    elif isinstance(w, (list, tuple)):
        # Tuples are accepted as well as lists so that an equivalent
        # sequence is not silently discarded as "other type".
        candidates = []
        for item in w:
            if isinstance(item, dict):
                # A dict element is replaced by its values.
                candidates.extend(item.values())
            else:
                candidates.append(item)
    else:
        candidates = []

    result = [str(value) for value in candidates if _is_meaningful(value)]

    record["windows"] = result
    print(f"๐ Windows ์ ๊ทํ: {w} โ {result}")
    return record
|
| 368 |
|
| 369 |
def df_to_dataset(df):
    """Convert a DataFrame to a ``Dataset``, coercing ``windows`` cells to lists.

    If the frame has a ``windows`` column, every cell is turned into a plain
    Python list before conversion:

    * list -- kept as is.
    * tuple -- converted with ``list(...)``.
    * array-like exposing ``.tolist()`` -- converted via ``.tolist()`` when
      that yields a list. Callers in this file pass frames obtained from
      ``Dataset.to_pandas()``; list columns there are expected to arrive as
      numpy arrays, which a bare ``isinstance(x, list)`` check would replace
      with ``[]`` and lose the data.
      NOTE(review): confirm the ndarray behavior against the installed
      ``datasets``/``pyarrow`` versions.
    * anything else (``None``, NaN, scalars, strings) -- replaced by ``[]``.

    The input frame is not modified; a new frame with the coerced column is
    built and converted.

    Args:
        df: pandas DataFrame, optionally containing a ``windows`` column.

    Returns:
        The result of ``Dataset.from_pandas`` on the coerced frame.
    """

    def _as_list(cell):
        if isinstance(cell, list):
            return cell
        if isinstance(cell, tuple):
            return list(cell)
        to_list = getattr(cell, "tolist", None)
        if callable(to_list):
            converted = to_list()
            # Scalar array types return a scalar from tolist(); those are
            # not window lists.
            if isinstance(converted, list):
                return converted
        return []

    if 'windows' in df.columns:
        # assign() builds a new frame, so the caller's DataFrame is untouched.
        df = df.assign(windows=df['windows'].apply(_as_list))
    return Dataset.from_pandas(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|