Merry99 committed on
Commit
ead1bf0
·
1 Parent(s): 5726b96

fix normalizing windows

Browse files
Files changed (1) hide show
  1. app.py +40 -24
app.py CHANGED
@@ -250,6 +250,20 @@ async def upload_batch_dataset(payload: BatchUploadPayload):
250
  existing = load_dataset(hf_repo_id, token=hf_token)
251
  all_splits = list(existing.keys())
252
  print(f"๐Ÿ“‚ ๊ธฐ์กด splits: {all_splits}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  except Exception:
254
  existing = DatasetDict()
255
  print("๐Ÿ“‚ ๊ธฐ์กด repo ์—†์Œ โ†’ ์ƒˆ๋กœ ์ƒ์„ฑ")
@@ -279,11 +293,12 @@ async def upload_batch_dataset(payload: BatchUploadPayload):
279
  if user_id in existing:
280
  # ๊ธฐ์กด ๋ฐ์ดํ„ฐ ์ •๊ทœํ™” ๋ฐ ๋ณ‘ํ•ฉ
281
  old_df = existing[user_id].to_pandas()
 
282
  old_df["windows"] = old_df["windows"].apply(
283
- lambda w: [str(v) for v in w] if isinstance(w, list) else []
284
  )
285
  merged_df = pd.concat([old_df, new_df], ignore_index=True)
286
- existing[user_id] = Dataset.from_pandas(merged_df)
287
  print(f"๐Ÿ“Š {user_id}: ๊ธฐ์กด ๋ฐ์ดํ„ฐ์™€ ๋ณ‘ํ•ฉ ({len(old_df)} + {len(new_df)} = {len(merged_df)}๊ฐœ ๋ ˆ์ฝ”๋“œ)")
288
  else:
289
  existing[user_id] = new_dataset
@@ -326,35 +341,36 @@ async def upload_batch_dataset(payload: BatchUploadPayload):
326
  raise HTTPException(status_code=500, detail=f"๋ฐฐ์น˜ ํ‘ธ์‹œ ์‹คํŒจ: {str(e)}")
327
 
328
  def normalize_windows(record):
329
- w = record.get("windows")
330
  result = []
331
 
332
  if isinstance(w, list):
333
  for item in w:
334
  if isinstance(item, dict):
335
- result.extend([str(v) for v in item.values() if v is not None])
336
- elif item is not None:
 
 
 
337
  result.append(str(item))
338
  elif isinstance(w, dict):
339
- result = [str(v) for v in w.values() if v is not None]
340
-
341
- record["windows"] = [x for x in result if x not in ("", "None")]
 
 
 
 
 
 
 
342
  return record
343
 
344
  def df_to_dataset(df):
345
- import pyarrow as pa
346
- schema = pa.schema([
347
- ("session_id", pa.string()),
348
- ("measure_date", pa.string()),
349
- ("rms", pa.float64()),
350
- ("freq", pa.float64()),
351
- ("fatigue", pa.float64()),
352
- ("mode", pa.string()),
353
- ("window_count", pa.int64()),
354
- ("windows", pa.list_(pa.string())),
355
- ("measurement_count", pa.int64()),
356
- ("batch_date", pa.string()),
357
- ("batch_size", pa.int64()),
358
- ("timestamp", pa.string())
359
- ])
360
- return Dataset.from_pandas(df, schema=schema)
 
250
  existing = load_dataset(hf_repo_id, token=hf_token)
251
  all_splits = list(existing.keys())
252
  print(f"๐Ÿ“‚ ๊ธฐ์กด splits: {all_splits}")
253
+
254
+ # ๊ธฐ์กด ๋ฐ์ดํ„ฐ๋ฅผ ์™„์ „ํžˆ ์ƒˆ๋กœ ์ƒ์„ฑ (์Šคํ‚ค๋งˆ ํ†ต์ผ)
255
+ new_existing = DatasetDict()
256
+ for user_id in existing.keys():
257
+ df = existing[user_id].to_pandas()
258
+ # windows ํ•„๋“œ๋ฅผ ๋ฌธ์ž์—ด ๋ฆฌ์ŠคํŠธ๋กœ ๊ฐ•์ œ ๋ณ€ํ™˜
259
+ df["windows"] = df["windows"].apply(
260
+ lambda w: [str(v) for v in w] if isinstance(w, list) and len(w) > 0 else []
261
+ )
262
+ # ๋ชจ๋“  ๋ฐ์ดํ„ฐ๋ฅผ ์ƒˆ๋กœ ์ƒ์„ฑํ•˜์—ฌ ์Šคํ‚ค๋งˆ ํ†ต์ผ
263
+ new_existing[user_id] = df_to_dataset(df)
264
+ print(f"๐Ÿ”ง {user_id}: ๊ธฐ์กด ๋ฐ์ดํ„ฐ ์žฌ์ƒ์„ฑ ์™„๋ฃŒ")
265
+ existing = new_existing
266
+
267
  except Exception:
268
  existing = DatasetDict()
269
  print("๐Ÿ“‚ ๊ธฐ์กด repo ์—†์Œ โ†’ ์ƒˆ๋กœ ์ƒ์„ฑ")
 
293
  if user_id in existing:
294
  # ๊ธฐ์กด ๋ฐ์ดํ„ฐ ์ •๊ทœํ™” ๋ฐ ๋ณ‘ํ•ฉ
295
  old_df = existing[user_id].to_pandas()
296
+ # ๊ธฐ์กด windows ๋ฐ์ดํ„ฐ๋ฅผ ๋ฌธ์ž์—ด ๋ฆฌ์ŠคํŠธ๋กœ ์ •๊ทœํ™”
297
  old_df["windows"] = old_df["windows"].apply(
298
+ lambda w: [str(v) for v in w] if isinstance(w, list) and len(w) > 0 else []
299
  )
300
  merged_df = pd.concat([old_df, new_df], ignore_index=True)
301
+ existing[user_id] = df_to_dataset(merged_df)
302
  print(f"๐Ÿ“Š {user_id}: ๊ธฐ์กด ๋ฐ์ดํ„ฐ์™€ ๋ณ‘ํ•ฉ ({len(old_df)} + {len(new_df)} = {len(merged_df)}๊ฐœ ๋ ˆ์ฝ”๋“œ)")
303
  else:
304
  existing[user_id] = new_dataset
 
341
  raise HTTPException(status_code=500, detail=f"๋ฐฐ์น˜ ํ‘ธ์‹œ ์‹คํŒจ: {str(e)}")
342
 
343
def normalize_windows(record):
    """Coerce the "windows" field of *record* into a flat list of strings.

    A list contributes each item (dict items contribute their values);
    a top-level dict contributes its values; any other type yields an
    empty list. ``None`` and blank/whitespace-only values are dropped.
    Mutates *record* in place and returns it.
    """
    raw = record.get("windows", [])

    def _keep(value):
        # Drop None and anything that stringifies to empty/whitespace.
        return value is not None and str(value).strip()

    flattened = []
    if isinstance(raw, list):
        for entry in raw:
            if isinstance(entry, dict):
                # Flatten dict entries by taking their values as strings.
                flattened.extend(str(v) for v in entry.values() if _keep(v))
            elif _keep(entry):
                flattened.append(str(entry))
    elif isinstance(raw, dict):
        flattened = [str(v) for v in raw.values() if _keep(v)]
    # Any other type (str, number, None, ...) leaves flattened == [].

    record["windows"] = flattened
    print(f"๐Ÿ” Windows ์ •๊ทœํ™”: {raw} โ†’ {flattened}")
    return record
368
 
369
def df_to_dataset(df):
    """Convert a DataFrame to a HF Dataset, normalizing the "windows" column.

    Bug fixed: ``Dataset.to_pandas()`` round-trips list columns as
    ``numpy.ndarray`` objects, so the previous ``isinstance(x, list)``
    check was False for data loaded back from the hub and silently wiped
    every stored window to ``[]``. Accept any list-like value (list,
    tuple, ndarray) and coerce its elements to ``str`` so the inferred
    Arrow schema is consistently ``list<string>`` across splits, which
    is the point of this normalization.

    Args:
        df: pandas DataFrame; may or may not contain a "windows" column.
    Returns:
        datasets.Dataset built from the normalized DataFrame.
    """
    if "windows" in df.columns:
        def _as_str_list(value):
            # list/tuple/ndarray -> list[str]; scalars/None/NaN -> []
            if isinstance(value, (list, tuple)) or hasattr(value, "tolist"):
                return [str(v) for v in value]
            return []

        df["windows"] = df["windows"].apply(_as_str_list)
    return Dataset.from_pandas(df)