# Scrape residue from the hosting UI (commit header), preserved as comments
# so the module remains valid Python:
# Merry99's picture
# add method on /health
# b18773a
"""FastAPI μ•±: μˆ˜λ™ ν•™μŠ΅ 및 λͺ¨λΈ λ‹€μš΄λ‘œλ“œ/μ—…λ‘œλ“œ"""
from __future__ import annotations
import os
import threading
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import schedule
import lightgbm as lgb
import numpy as np
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
from huggingface_hub import HfApi
from pydantic import BaseModel, field_validator
from train_scheduler import TrainingScheduler
app = FastAPI(
    title="MuscleCare LightGBM Scheduler",
    description="MuscleCare-Train-AI Space와 λ™μΌν•œ APIλ₯Ό LightGBM λͺ¨λΈλ‘œ μ œκ³΅ν•©λ‹ˆλ‹€.",
)

# Owns training runs and the model-version manifest (see train_scheduler).
_scheduler = TrainingScheduler()

# Guards every read/write of the cached-model globals below.
_model_lock = threading.Lock()
# LightGBM booster currently serving /predict (None until first load).
_current_model: Optional[lgb.Booster] = None
# Filesystem path the current booster was loaded from.
_current_model_path: Optional[str] = None
# Manifest version number of the current booster, when known.
_current_model_version: Optional[int] = None
# time.time() of the last (re)load; compared against MODEL_CACHE_TIMEOUT.
_model_cache_timestamp: Optional[float] = None
MODEL_CACHE_TIMEOUT = 3600  # cached model considered stale after 1 hour
class TrainResponse(BaseModel):
    """Response payload shared by /trigger, /train and /update-model."""

    status: str
    new_data_count: int
    model_path: Optional[str] = None
    hub_url: Optional[str] = None
    model_version: Optional[int] = None
    message: str
    new_session_count: Optional[int] = None
class ResetStateResponse(BaseModel):
    """Response payload for /state/reset (non-production only)."""

    status: str
    state: Dict[str, Any]
class PredictRequest(BaseModel):
    """Sensor-feature payload for /predict.

    `rms_*` / `mean_freq_*` are current readings; `rms_base` / `freq_base`
    are baseline values used to form ratios in `_build_feature_vector`.
    """

    rms_acc: float
    rms_gyro: float
    mean_freq_acc: float
    mean_freq_gyro: float
    rms_base: float
    freq_base: float
    user_emb: List[float]  # fixed-length user embedding (exactly 12 values)

    @field_validator("user_emb")
    @classmethod
    def validate_user_emb(cls, v: List[float]) -> List[float]:
        """Reject embeddings that are not exactly 12 values long."""
        if len(v) != 12:
            raise ValueError("user_emb must contain exactly 12 values.")
        return v
class PredictResponse(BaseModel):
    """Response payload for /predict."""

    fatigue: float
    model_version: Optional[int]
def _schedule_background_job() -> None:
    """Register the weekly training job and start a daemon thread to run it.

    Clears any previously registered jobs first, then polls `schedule`
    once a minute from a background daemon thread.
    """
    schedule.clear()
    schedule.every().sunday.at(_scheduler.schedule_time).do(
        _scheduler.run_scheduled_training
    )

    def _poll_forever() -> None:
        while True:
            schedule.run_pending()
            time.sleep(60)

    threading.Thread(target=_poll_forever, daemon=True).start()
def _apply_training_result(result: Dict[str, Any]) -> None:
    """Install the freshly trained model described by *result*, if any.

    No-op unless the scheduler reports status == "trained"; load failures
    are logged and swallowed (best-effort).
    """
    if result.get("status") != "trained":
        return
    model_path = result.get("model_path")
    if model_path:
        try:
            _load_model_from_path(Path(model_path), result.get("model_version"))
        except Exception as exc:
            print(f"[Model] μƒˆ λͺ¨λΈ λ‘œλ“œ μ‹€νŒ¨: {exc}")
    else:
        print("[Model] ν•™μŠ΅ 결과에 model_pathκ°€ μ—†μ–΄ λͺ¨λΈμ„ λ‘œλ“œν•˜μ§€ λͺ»ν–ˆμŠ΅λ‹ˆλ‹€.")
def _load_model_from_path(path: Path, version: Optional[int] = None) -> None:
    """Load a LightGBM booster from *path* and make it the active model.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    global _current_model, _current_model_path, _current_model_version, _model_cache_timestamp
    if not path.exists():
        raise FileNotFoundError(f"λͺ¨λΈ νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {path}")
    loaded = lgb.Booster(model_file=str(path))
    # Swap all cache globals atomically under the lock.
    with _model_lock:
        _current_model = loaded
        _current_model_path = str(path)
        _current_model_version = version
        _model_cache_timestamp = time.time()
    print(f"[Model] Loaded LightGBM model from {path} (version={version})")
def _get_cached_model() -> Optional[lgb.Booster]:
    """Return the cached booster, or None when absent or expired.

    On expiry the booster reference is dropped so the next caller
    triggers a reload.
    """
    global _current_model, _model_cache_timestamp
    with _model_lock:
        model, stamp = _current_model, _model_cache_timestamp
        if model is None or stamp is None:
            return None
        if time.time() - stamp > MODEL_CACHE_TIMEOUT:
            print("[Model] λͺ¨λΈ μΊμ‹œ 만료, μž¬λ‘œλ“œ ν•„μš”")
            _current_model = None
            return None
        return model
def _maybe_load_latest_model() -> None:
    """Best-effort load of the newest model: manifest first, default path second.

    Never raises; every failure path only logs and returns.
    """
    try:
        manifest = _scheduler.get_model_versions()
        candidate_path: Optional[Path] = None
        candidate_version: Optional[int] = None
        if manifest:
            newest = manifest[-1]
            candidate_path = Path(newest["path"])
            candidate_version = newest.get("version")
        else:
            fallback = Path("models/lightgbm_model.txt")
            if fallback.exists():
                candidate_path = fallback
        if candidate_path and candidate_path.exists():
            try:
                _load_model_from_path(candidate_path, candidate_version)
                print(f"[Model] λͺ¨λΈ λ‘œλ“œ 성곡: {candidate_path}")
            except Exception as exc:
                print(f"[Model] λͺ¨λΈ λ‘œλ“œ μ‹€νŒ¨ (계속 μ§„ν–‰): {exc}")
        else:
            print("[Model] λ‘œλ“œν•  λͺ¨λΈμ΄ 아직 μ—†μŠ΅λ‹ˆλ‹€.")
    except Exception as exc:
        print(f"[Model] λͺ¨λΈ λ‘œλ“œ κ³Όμ •μ—μ„œ μ˜ˆμ™Έ λ°œμƒ: {exc}")
def _get_active_model() -> Tuple[lgb.Booster, Optional[int]]:
    """Return (booster, version) for serving, loading from disk if needed.

    Tries the in-memory cache first; otherwise resolves the newest manifest
    entry (or the default model file) and loads it.

    Raises:
        HTTPException: 503 when no model file exists or loading fails.
    """
    cached_model = _get_cached_model()
    if cached_model is not None:
        return cached_model, _current_model_version
    try:
        manifest = _scheduler.get_model_versions()
        target_entry = manifest[-1] if manifest else None
        if target_entry:
            path = Path(target_entry["path"])
            version = target_entry.get("version")
        else:
            # BUGFIX: `version` was previously unbound on this branch,
            # raising NameError instead of a clean 503.
            path = Path("models/lightgbm_model.txt")
            version = None
        if not path.exists():
            raise HTTPException(status_code=503, detail="λͺ¨λΈ νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
        _load_model_from_path(path, version)
        return _current_model, _current_model_version
    except HTTPException:
        # BUGFIX: do not re-wrap our own 503 inside another HTTPException.
        raise
    except Exception as exc:
        raise HTTPException(status_code=503, detail=f"λͺ¨λΈ λ‘œλ“œ μ‹€νŒ¨: {exc}") from exc
def _build_feature_vector(payload: PredictRequest) -> np.ndarray:
rms_base = payload.rms_base if payload.rms_base != 0 else 1e-6
freq_mean = (payload.mean_freq_acc + payload.mean_freq_gyro) / 2.0
if freq_mean == 0:
freq_mean = 1e-6
rms_ratio = ((payload.rms_acc + payload.rms_gyro) / 2.0) / rms_base
freq_ratio = payload.freq_base / freq_mean
feature_vector = [rms_ratio, freq_ratio, *payload.user_emb]
return np.asarray([feature_vector], dtype=np.float32)
@app.on_event("startup")
def on_startup() -> None:
print("[Startup] MuscleCare Space μ‹œμž‘ 쀑...")
try:
_schedule_background_job()
print("[Startup] μŠ€μΌ€μ€„λŸ¬ μ΄ˆκΈ°ν™” μ™„λ£Œ")
except Exception as exc:
print(f"[Startup] μŠ€μΌ€μ€„λŸ¬ μ΄ˆκΈ°ν™” μ‹€νŒ¨ (계속 μ§„ν–‰): {exc}")
# λͺ¨λΈ μ—…λ°μ΄νŠΈ μ‹œλ„ (μ΅œμ‹  λ°μ΄ν„°λ‘œ λͺ¨λΈ ν•™μŠ΅)
print("[Startup] λͺ¨λΈ μ—…λ°μ΄νŠΈ μ‹œλ„ 쀑...")
try:
# κΈ°μ‘΄ λͺ¨λΈ 확인
manifest = _scheduler.get_model_versions()
has_existing_model = len(manifest) > 0
print(f"[Startup] κΈ°μ‘΄ λͺ¨λΈ 쑴재: {has_existing_model}")
if not has_existing_model:
print("[Startup] κΈ°μ‘΄ λͺ¨λΈμ΄ μ—†μ–΄ 초기 ν•™μŠ΅μ„ μˆ˜ν–‰ν•©λ‹ˆλ‹€...")
result = _scheduler.run_scheduled_training()
if result.get("status") == "trained":
_apply_training_result(result)
print("[Startup] βœ… 초기 ν•™μŠ΅ μ™„λ£Œ")
else:
print(f"[Startup] ⚠️ 초기 ν•™μŠ΅ μ‹€νŒ¨: {result.get('message', 'μ•Œ 수 μ—†λŠ” 였λ₯˜')}")
else:
print("[Startup] κΈ°μ‘΄ λͺ¨λΈμ΄ μžˆμ–΄ μ—…λ°μ΄νŠΈλ₯Ό κ±΄λ„ˆλœλ‹ˆλ‹€ (ν•„μš”μ‹œ /trigger 호좜)")
except Exception as exc:
print(f"[Startup] λͺ¨λΈ μ—…λ°μ΄νŠΈ μ‹€νŒ¨ (계속 μ§„ν–‰): {exc}")
print("[Startup] MuscleCare Space μ‹œμž‘ μ™„λ£Œ")
@app.head("/health")
async def health_head():
return None # HEADλŠ” λ°”λ””κ°€ ν•„μš” μ—†μœΌλ―€λ‘œ None λ°˜ν™˜
@app.get("/health")
def health_check() -> dict:
"""
μ‹œμŠ€ν…œ ν—¬μŠ€μ²΄ν¬ API
- λͺ¨λΈ μƒνƒœ
- μ‹œμŠ€ν…œ λ¦¬μ†ŒμŠ€
- 파일 μƒνƒœ
- 졜근 ν•™μŠ΅ 정보
"""
import time
from pathlib import Path
# κΈ°λ³Έ μƒνƒœ
health_status = {
"status": "ok",
"timestamp": time.time(),
"environment": os.getenv("ENVIRONMENT", "development"),
"version": "1.0.0"
}
try:
# λͺ¨λΈ μƒνƒœ 확인
try:
cached_model = _get_cached_model()
health_status["model_loaded"] = cached_model is not None
if _model_cache_timestamp:
health_status["model_cache_age_seconds"] = int(time.time() - _model_cache_timestamp)
except Exception as e:
health_status["model_loaded"] = False
health_status["model_error"] = str(e)
# λͺ¨λΈ 파일 쑴재 μ—¬λΆ€
try:
model_files = list(Path("models").glob("*.txt"))
health_status["model_files_count"] = len(model_files)
if model_files:
latest_model = max(model_files, key=lambda x: x.stat().st_mtime)
health_status["latest_model_file"] = latest_model.name
except Exception as e:
health_status["model_files_count"] = 0
health_status["model_files_error"] = str(e)
# 둜그 파일 쑴재 μ—¬λΆ€
try:
log_files = list(Path("logs").glob("*.json"))
health_status["log_files_count"] = len(log_files)
except Exception as e:
health_status["log_files_count"] = 0
health_status["log_files_error"] = str(e)
# 졜근 ν•™μŠ΅ μƒνƒœ
try:
manifest = _scheduler.get_model_versions()
if manifest:
latest = manifest[-1]
health_status["latest_model_version"] = latest.get("version")
health_status["latest_training_time"] = latest.get("timestamp")
health_status["total_sessions_trained"] = sum(m.get("session_count", 0) for m in manifest)
else:
health_status["latest_model_version"] = None
health_status["total_sessions_trained"] = 0
except Exception as e:
health_status["training_status_error"] = str(e)
# API μ—”λ“œν¬μΈνŠΈ μƒνƒœ
endpoints_status = {
"predict": "available",
"trigger": "available",
"model": "available",
"update-model": "available",
"health": "available"
}
# ν™˜κ²½μ— λ”°λ₯Έ state_reset μƒνƒœ
if os.getenv("ENVIRONMENT") != "production":
endpoints_status["state_reset"] = "available"
else:
endpoints_status["state_reset"] = "disabled_in_production"
health_status["endpoints"] = endpoints_status
# μ‹œμŠ€ν…œ λ¦¬μ†ŒμŠ€ (간단 버전)
try:
# ν”„λ‘œμ„ΈμŠ€ 정보 (기본적인)
health_status["process_id"] = os.getpid()
health_status["working_directory"] = os.getcwd()
except Exception as e:
health_status["system_error"] = str(e)
except Exception as e:
health_status["status"] = "degraded"
health_status["error"] = str(e)
# μ—λŸ¬κ°€ λ°œμƒν•΄λ„ κΈ°λ³Έ μ •λ³΄λŠ” μœ μ§€
return health_status
@app.get("/")
def root() -> dict:
endpoints = {
"trigger": "/trigger",
"model": "/model",
"update-model": "/update-model",
"predict": "/predict",
"health": "/health"
}
return {
"message": "MuscleCare LightGBM Scheduler API",
"docs": "/docs",
"endpoints": endpoints,
"environment": os.getenv("ENVIRONMENT", "development"),
}
def _upload_to_hub(model_path: str) -> Optional[str]:
    """Push the model file (and manifest, if present) to the Hugging Face Hub.

    Returns the repository URL on success, or None when the required
    HF_HYBRID_MODEL_TOKEN / HF_HYBRID_MODEL_REPO_ID env vars are missing.

    Raises:
        HTTPException: 404 when the model file does not exist.
        Exception: re-raised after logging on any upload failure.
    """
    token = os.getenv("HF_HYBRID_MODEL_TOKEN")
    repo_id = os.getenv("HF_HYBRID_MODEL_REPO_ID")
    print(f"[Upload] Model Hub μ—…λ‘œλ“œ μ‹œλ„: {model_path}")
    if not token or not repo_id:
        print(f"[Upload] ν™˜κ²½λ³€μˆ˜ λˆ„λ½: TOKEN={'***' if token else 'None'}, REPO_ID={repo_id}")
        return None
    model_file = Path(model_path)
    if not model_file.exists():
        raise HTTPException(status_code=404, detail=f"λͺ¨λΈ νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {model_path}")
    try:
        print(f"[Upload] 리포지토리 생성/확인: {repo_id}")
        hub = HfApi(token=token)
        hub.create_repo(repo_id=repo_id, repo_type="model", private=False, exist_ok=True)
        print(f"[Upload] λͺ¨λΈ 파일 μ—…λ‘œλ“œ: {model_file.name}")
        hub.upload_file(
            path_or_fileobj=model_file,
            path_in_repo=model_file.name,
            repo_id=repo_id,
            repo_type="model",
            commit_message=f"LightGBM model upload ({model_file.name})",
        )
        # Also publish the version manifest when it exists locally.
        manifest_path = Path("logs/model_versions.json")
        if manifest_path.exists():
            print(f"[Upload] 메타데이터 파일 μ—…λ‘œλ“œ")
            hub.upload_file(
                path_or_fileobj=str(manifest_path),
                path_in_repo="model_versions.json",
                repo_id=repo_id,
                repo_type="model",
                commit_message="Update model manifest",
            )
        hub_url = f"https://huggingface.co/{repo_id}"
        print(f"[Upload] βœ… μ—…λ‘œλ“œ 성곡: {hub_url}")
        return hub_url
    except Exception as exc:
        print(f"[Upload] ❌ μ—…λ‘œλ“œ μ‹€νŒ¨: {exc}")
        raise
def _resolve_model_entry(version: Optional[int] = None) -> Dict[str, Any]:
    """Return the manifest entry for *version*, or the newest when None.

    Raises:
        HTTPException: 404 when no models exist or the version is unknown.
    """
    manifest = _scheduler.get_model_versions()
    if not manifest:
        raise HTTPException(status_code=404, detail="아직 ν•™μŠ΅λœ λͺ¨λΈμ΄ μ—†μŠ΅λ‹ˆλ‹€.")
    if version is None:
        return manifest[-1]
    match = next((e for e in manifest if e.get("version") == version), None)
    if match is None:
        raise HTTPException(
            status_code=404,
            detail=f"버전 {version} λͺ¨λΈμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.",
        )
    return match
@app.get("/model")
@app.get("/model/{version:int}")
def download_model(version: Optional[int] = None) -> FileResponse:
entry = _resolve_model_entry(version)
path = Path(entry["path"])
if not path.exists():
raise HTTPException(status_code=404, detail="λͺ¨λΈ νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
response = FileResponse(
path=path,
filename=entry["filename"],
media_type="application/octet-stream",
)
response.headers["X-Model-Version"] = str(entry["version"])
return response
@app.get("/download")
def download_latest_alias() -> FileResponse:
return download_model()
# ν”„λ‘œλ•μ…˜ ν™˜κ²½μ—μ„œλŠ” reset API λΉ„ν™œμ„±ν™”
environment = os.getenv("ENVIRONMENT", "development")
if environment != "production":
print(f"[Security] State reset API enabled (environment: {environment})")
@app.post("/state/reset", response_model=ResetStateResponse)
def reset_state() -> ResetStateResponse:
print("[Security] State reset requested")
state = _scheduler.reset_training_state()
return ResetStateResponse(status="reset", state=state)
else:
print(f"[Security] State reset API disabled in production environment")
@app.post("/trigger", response_model=TrainResponse)
def trigger_training(upload: bool = False) -> TrainResponse:
try:
result = _scheduler.run_scheduled_training()
except Exception as exc: # pragma: no cover
raise HTTPException(status_code=500, detail=f"ν•™μŠ΅ μ‹€ν–‰ 였λ₯˜: {exc}") from exc
message = "λͺ¨λΈ ν•™μŠ΅μ΄ μ™„λ£Œλ˜μ—ˆμŠ΅λ‹ˆλ‹€." if result["status"] == "trained" else "ν•™μŠ΅μ΄ κ±΄λ„ˆλ›°μ–΄μ‘ŒμŠ΅λ‹ˆλ‹€."
hub_url = None
model_version = result.get("model_version")
model_path = result.get("model_path")
if upload and model_path and result["status"] == "trained":
try:
hub_url = _upload_to_hub(model_path)
message = "λͺ¨λΈ ν•™μŠ΅ 및 Hugging Face μ—…λ‘œλ“œκ°€ μ™„λ£Œλ˜μ—ˆμŠ΅λ‹ˆλ‹€."
except HTTPException:
raise
except Exception as exc: # pragma: no cover
raise HTTPException(status_code=500, detail=f"Hugging Face μ—…λ‘œλ“œ μ‹€νŒ¨: {exc}") from exc
_apply_training_result(result)
return TrainResponse(
status=result["status"],
new_data_count=result.get("new_data_count", 0),
model_path=model_path,
hub_url=hub_url,
model_version=model_version,
message=message,
new_session_count=result.get("new_session_count"),
)
@app.post("/train", response_model=TrainResponse)
def trigger_training_alias(upload: bool = False) -> TrainResponse:
return trigger_training(upload=upload)
@app.post("/update-model", response_model=TrainResponse)
def update_model(force: bool = False) -> TrainResponse:
"""
λͺ¨λΈμ„ κ°•μ œλ‘œ μ—…λ°μ΄νŠΈν•©λ‹ˆλ‹€.
- force=true: κΈ°μ‘΄ λͺ¨λΈμ΄ μžˆμ–΄λ„ μ—…λ°μ΄νŠΈ
- force=false: μƒˆλ‘œμš΄ 데이터가 μžˆμ„ λ•Œλ§Œ μ—…λ°μ΄νŠΈ
"""
try:
print(f"[Update] λͺ¨λΈ μ—…λ°μ΄νŠΈ μš”μ²­ (force={force})")
if force:
# κ°•μ œ μ—…λ°μ΄νŠΈ: κΈ°μ‘΄ λͺ¨λΈ λ¬΄μ‹œν•˜κ³  μƒˆλ‘œ ν•™μŠ΅
print("[Update] κ°•μ œ μ—…λ°μ΄νŠΈ λͺ¨λ“œ")
# μž„μ‹œλ‘œ κΈ°μ‘΄ λͺ¨λΈμ„ λ°±μ—…
manifest = _scheduler.get_model_versions()
if manifest:
print(f"[Update] κΈ°μ‘΄ λͺ¨λΈ {len(manifest)}개 백업됨")
result = _scheduler.run_scheduled_training()
if result.get("status") == "trained":
_apply_training_result(result)
message = "βœ… λͺ¨λΈ μ—…λ°μ΄νŠΈ μ™„λ£Œ"
else:
message = f"⚠️ λͺ¨λΈ μ—…λ°μ΄νŠΈ κ±΄λ„ˆλœ€: {result.get('message', 'μƒˆλ‘œμš΄ 데이터 μ—†μŒ')}"
return TrainResponse(
status=result["status"],
new_data_count=result.get("new_data_count", 0),
model_path=result.get("model_path"),
hub_url=None, # μ—…λ°μ΄νŠΈ μ‹œμ—λŠ” Hub μ—…λ‘œλ“œ ν•˜μ§€ μ•ŠμŒ
model_version=result.get("model_version"),
message=message,
new_session_count=result.get("new_session_count"),
)
except Exception as exc:
raise HTTPException(status_code=500, detail=f"λͺ¨λΈ μ—…λ°μ΄νŠΈ μ‹€νŒ¨: {exc}") from exc
@app.post("/predict", response_model=PredictResponse)
def predict(payload: PredictRequest) -> PredictResponse:
booster, version = _get_active_model()
features = _build_feature_vector(payload)
prediction = booster.predict(features)[0]
return PredictResponse(fatigue=float(prediction), model_version=version)
__all__ = ["app"]