# lily_fast_api/lily_llm_api/services/model_service.py
# Origin: gbrabbit — auto commit at 25-2025-08 3:12:15, revision 0e9a45c
"""
Model service for Lily LLM API
"""
import logging
import os
import asyncio
import concurrent.futures
from typing import Optional
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Global variables — shared model state for the whole API process.
current_model = None  # 🔄 currently loaded model instance
current_profile = None  # 🔄 currently selected model profile
model_loaded = False  # 🔄 model load status flag
model = None  # raw model object (mirrors current_model; used by the unload path)
tokenizer = None  # tokenizer extracted from the processor after loading
processor = None  # processor returned by the profile's load_model()
# Thread pool used to run blocking model loads off the event loop.
executor = concurrent.futures.ThreadPoolExecutor()
def get_current_model():
    """Return the model instance that is currently loaded (None if nothing is loaded)."""
    global current_model
    return current_model
def get_current_profile():
    """Return the model profile that is currently selected (None if none chosen yet)."""
    global current_profile
    return current_profile
def is_model_loaded():
    """Report whether a model has finished loading successfully."""
    global model_loaded
    return model_loaded
async def load_model_async(model_id: str):
    """Asynchronously load a model by running the blocking load in the thread pool.

    Args:
        model_id: Identifier of the model profile to load.

    Raises:
        Whatever load_model_sync raises on failure (it re-raises after logging).
    """
    # get_running_loop() is the supported way to obtain the loop from inside a
    # coroutine; get_event_loop() is deprecated in this context since Python 3.10
    # and can bind the wrong/non-running loop.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(executor, load_model_sync, model_id)
def load_model_sync(model_id: str):
    """Synchronously load the model and its processor/tokenizer (final revision).

    Steps: unload any previously loaded model, resolve the profile for
    *model_id* and load it, force the model dtype at startup, publish the
    results through the module globals, then attempt LoRA setup. On failure,
    logs the full traceback, clears the loaded flag, and re-raises.
    """
    global model, tokenizer, processor, current_profile, current_model, model_loaded
    try:
        # Drop any previously loaded model before loading the new one.
        if model is not None:
            logger.info("🗑️ 기존 모델 언로드 중...")
            del model
            del tokenizer
            del processor
            model, tokenizer, processor = None, None, None
            import gc
            gc.collect()  # encourage immediate release of the old model's memory
            logger.info("✅ 기존 모델 언로드 완료")
        logger.info(f"📥 '{model_id}' 모델 로딩 시작...")
        from ..models import get_model_profile
        current_profile = get_model_profile(model_id)
        # load_model() now returns a (model, processor) pair.
        model, processor = current_profile.load_model()
        # 🔧 Force the dtype at server startup (avoids first-request latency).
        try:
            import torch as _torch
            # Choose the target dtype per device (default: CPU=float32, CUDA=bfloat16);
            # LILY_FORCE_DTYPE overrides either, else the per-device env var applies.
            if hasattr(model, 'device') and str(model.device) == 'cpu':
                desired = (os.getenv('LILY_FORCE_DTYPE') or os.getenv('LILY_CPU_DTYPE') or 'float32').lower()
                default_target = _torch.float32
            else:
                desired = (os.getenv('LILY_FORCE_DTYPE') or os.getenv('LILY_CUDA_DTYPE') or 'bfloat16').lower()
                default_target = _torch.bfloat16
            desired_map = {
                'float32': _torch.float32,
                'fp32': _torch.float32,
                'bfloat16': _torch.bfloat16,
                'bf16': _torch.bfloat16,
                'float16': _torch.float16,
                'fp16': _torch.float16,
            }
            # Unrecognized env values fall back to the device default.
            target_dtype = desired_map.get(desired, default_target)
            if hasattr(model, 'dtype') and model.dtype != target_dtype:
                logger.info(f"🔧 [SPEED][startup] dtype 적용: {model.dtype} -> {target_dtype}")
                model = model.to(target_dtype)
        except Exception as _dtype_e:
            # Best-effort: a dtype failure must not abort the whole load.
            logger.warning(f"⚠️ [startup] dtype 적용 실패: {_dtype_e}")
        # 🔄 Publish the model in the module globals (consumed by the LoRA path).
        current_model = model
        # Extract the tokenizer from the processor and store it globally.
        if hasattr(processor, 'tokenizer'):
            tokenizer = processor.tokenizer
        else:
            # The processor itself may also act as the tokenizer.
            tokenizer = processor
        logger.info(f"✅ '{current_profile.display_name}' 모델 로딩 완료!")
        # 🔄 Auto-load the default LoRA base model (via the shared helper).
        try:
            from lily_llm_core.lora_manager import get_lora_manager, lora_manager
            if lora_manager:
                from ..utils.lora_utils import setup_lora_for_model
                setup_lora_for_model(current_profile, lora_manager)
        except ImportError:
            # LoRA support is optional; continue without it.
            logger.warning("⚠️ LoRA 관리자 import 실패")
        model_loaded = True
    except Exception as e:
        logger.error(f"❌ load_model_sync 실패: {e}")
        import traceback
        logger.error(f"🔍 전체 에러: {traceback.format_exc()}")
        model_loaded = False
        raise
def shutdown_executor():
    """Shut down the module-level thread pool, blocking until pending work finishes."""
    global executor
    executor.shutdown(wait=True)