Merry99 commited on
Commit
3c70e7c
·
1 Parent(s): b545869

add batch update logs

Browse files
Files changed (1) hide show
  1. app.py +137 -0
app.py CHANGED
@@ -66,6 +66,23 @@ class LogUploadPayload(BaseModel):
66
  window_count: int
67
  measurement_count: int
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  # ----- ์œ ํ‹ธ -----
71
  def clob_json(obj) -> str:
@@ -233,6 +250,126 @@ async def upload_logs(payload: LogUploadPayload):
233
  raise HTTPException(status_code=500, detail=f"๋กœ๊ทธ ์—…๋กœ๋“œ ์‹คํŒจ: {str(e)}")
234
 
235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
def df_to_dataset(df):
    """Convert a pandas DataFrame into a Hugging Face ``Dataset``."""
    dataset = Dataset.from_pandas(df)
    return dataset
 
66
  window_count: int
67
  measurement_count: int
68
 
69
class BatchLogItem(BaseModel):
    """One measurement log record inside a batch upload.

    Carries the same measurement fields as the single-record upload
    payload, plus ``user_id`` so that records from several users can be
    grouped within one batch request.
    """
    user_id: str
    session_id: str
    measure_date: str
    rms: float
    freq: float
    fatigue: float
    # Baseline values are optional; absent for first-time measurements.
    rms_base: Optional[float] = None
    freq_base: Optional[float] = None
    # NOTE(review): description says length=12, but the length is not
    # validated here — confirm whether a validator is needed.
    user_emb: Optional[List[float]] = Field(default=None, description="length=12")
    mode: str
    window_count: int
    measurement_count: int


class BatchLogsPayload(BaseModel):
    """Request body for ``/upload_batch_logs``: a list of log records."""
    batch_data: List[BatchLogItem]
86
 
87
  # ----- ์œ ํ‹ธ -----
88
  def clob_json(obj) -> str:
 
250
  raise HTTPException(status_code=500, detail=f"๋กœ๊ทธ ์—…๋กœ๋“œ ์‹คํŒจ: {str(e)}")
251
 
252
 
253
@app.post("/upload_batch_logs")
async def upload_batch_logs(payload: BatchLogsPayload):
    """Merge a batch of log records into the Hugging Face Hub dataset.

    Groups the incoming records by ``user_id`` (one dataset split per
    user), normalizes every existing split to a fixed target schema,
    appends the new rows, and pushes the whole DatasetDict in a single
    commit.

    Returns a summary dict with per-user results.
    Raises HTTP 500 when HF credentials are missing or the push fails.
    """
    try:
        hf_repo_id = os.getenv("HF_DATA_REPO_ID")
        hf_token = os.getenv("HF_DATA_TOKEN")
        if not hf_repo_id or not hf_token:
            raise HTTPException(status_code=500, detail="Hugging Face ์„ค์ •์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค (HF_DATA_REPO_ID, HF_DATA_TOKEN)")

        # Normalized column schema shared by all user splits.
        target_cols = [
            "session_id", "measure_date", "rms", "freq", "fatigue",
            "rms_base", "freq_base", "user_emb", "mode", "window_count",
            "measurement_count", "timestamp"
        ]

        # One timestamp for the whole batch: keeps records consistent and
        # avoids calling datetime.now() once per row.
        batch_ts = datetime.now().isoformat()

        # Load existing data; start a fresh DatasetDict when the repo
        # does not exist yet.
        try:
            existing = load_dataset(hf_repo_id, token=hf_token)
            print("๐Ÿ“‚ ๊ธฐ์กด DatasetDict ๋กœ๋“œ ์™„๋ฃŒ")
        except Exception:
            existing = DatasetDict()
            print("๐Ÿ“‚ ๊ธฐ์กด repo ์—†์Œ โ†’ ์ƒˆ๋กœ ์ƒ์„ฑ")

        # Normalize an existing split: parse user_emb, backfill timestamp,
        # add missing target columns and drop extras.
        def normalize_existing_df(df: pd.DataFrame) -> pd.DataFrame:
            import json as _json  # hoisted: was re-imported per cell inside apply()

            if "user_emb" in df.columns:
                # user_emb may have been stored as a JSON string; coerce to list.
                def _parse_emb(x):
                    if isinstance(x, list):
                        return x
                    if isinstance(x, str):
                        try:
                            v = _json.loads(x)
                            return v if isinstance(v, list) else []
                        except Exception:
                            return []
                    return []
                df["user_emb"] = df["user_emb"].apply(_parse_emb)

            # Backfill a timestamp column for legacy rows.
            if "timestamp" not in df.columns:
                df["timestamp"] = batch_ts

            # Conform to the target column set.
            for c in target_cols:
                if c not in df.columns:
                    df[c] = None
            df = df[target_cols]
            return df

        # Group payload records by user.
        user_groups: dict[str, list[dict]] = {}
        for item in payload.batch_data:
            rec = {
                "session_id": item.session_id,
                "measure_date": item.measure_date,
                "rms": item.rms,
                "freq": item.freq,
                "fatigue": item.fatigue,
                "rms_base": item.rms_base,
                "freq_base": item.freq_base,
                "user_emb": item.user_emb,
                "mode": item.mode,
                "window_count": item.window_count,
                "measurement_count": item.measurement_count,
                "timestamp": batch_ts,
            }
            user_groups.setdefault(item.user_id, []).append(rec)

        results = {}

        # Merge per user; one user's failure does not abort the others.
        for user_id, records in user_groups.items():
            try:
                new_df = pd.DataFrame(records)
                # Conform the new rows to the target schema as well.
                for c in target_cols:
                    if c not in new_df.columns:
                        new_df[c] = None
                new_df = new_df[target_cols]

                if user_id in existing:
                    old_df = normalize_existing_df(existing[user_id].to_pandas())
                    merged = pd.concat([old_df, new_df], ignore_index=True)
                    existing[user_id] = df_to_dataset(merged)
                    print(f"๐Ÿ“Š {user_id}: ๋ณ‘ํ•ฉ ({len(old_df)} + {len(new_df)} = {len(merged)})")
                else:
                    existing[user_id] = df_to_dataset(new_df)
                    print(f"๐Ÿ“Š {user_id}: ์‹ ๊ทœ ์ƒ์„ฑ ({len(new_df)})")

                results[user_id] = {"status": "success", "new_rows": len(records)}
            except Exception as e:
                print(f"โŒ {user_id} ์ฒ˜๋ฆฌ ์‹คํŒจ: {e}")
                results[user_id] = {"status": "failed", "error": str(e)}

        # Push the whole DatasetDict in one commit.
        try:
            existing.push_to_hub(hf_repo_id, token=hf_token, private=True)
            print(f"โœ… DatasetDict ํ‘ธ์‹œ ์™„๋ฃŒ: {len(existing)} users")
        except Exception as e:
            print(f"โŒ ์ „์ฒด ํ‘ธ์‹œ ์‹คํŒจ: {e}")
            raise HTTPException(status_code=500, detail=f"์ „์ฒด ํ‘ธ์‹œ ์‹คํŒจ: {str(e)}")

        return {
            "processed_users": len(user_groups),
            "total_rows": sum(len(v) for v in user_groups.values()),
            "results": results,
        }

    except HTTPException:
        # Re-raise API errors untouched (e.g. missing credentials, push failure).
        raise
    except Exception as e:
        print(f"โŒ ๋ฐฐ์น˜ ๋กœ๊ทธ ์—…๋กœ๋“œ ์‹คํŒจ: {e}")
        raise HTTPException(status_code=500, detail=f"๋ฐฐ์น˜ ๋กœ๊ทธ ์—…๋กœ๋“œ ์‹คํŒจ: {str(e)}")
372
+
373
def df_to_dataset(df):
    """Convert a pandas DataFrame into a Hugging Face ``Dataset``."""
    dataset = Dataset.from_pandas(df)
    return dataset