Spaces:

speako
/

cosyvoice2-server

Build error

App Files Files Community

cosyvoice2-server / app.py

parkjihye

Update app.py

5bd0edb verified 10 months ago

raw

history blame contribute delete

21.6 kB

	# app.py
	import html
	import os
	import subprocess
	import sys
	import tempfile
	import threading
	import traceback
	import uuid
	from typing import Optional

	import torchaudio
	from fastapi import FastAPI, Request, File, UploadFile, Form
	from fastapi.responses import HTMLResponse, FileResponse
	from modelscope import snapshot_download
	from pydantic import BaseModel

	# Create the module-wide lock object.
	# NOTE(review): nothing in the visible code acquires this lock — confirm
	# whether inference calls were meant to be serialized with it.
	tts_lock = threading.Lock()


	# ---------------- CosyVoice path setup ----------------
	# These directories are appended before the cosyvoice imports below;
	# presumably the cosyvoice package and its Matcha-TTS dependency live
	# there — verify against the container layout.
	sys.path.append('/app/model')
	sys.path.append('/app/model/third_party/Matcha-TTS')

	from cosyvoice.cli.cosyvoice import CosyVoice2
	from cosyvoice.utils.file_utils import load_wav

	# ---------------- Global state ----------------
	# Holds the CosyVoice2 instance once initialize_cosyvoice() has assigned it;
	# request handlers treat None as "model not loaded".
	cosyvoice_model = None

	# ---------------- 모델 초기화 함수 ----------------
	def initialize_cosyvoice(
	    model_path='/app/pretrained_models/CosyVoice2-0.5B',
	    ttsfrd_path='/app/pretrained_models/CosyVoice-ttsfrd',
	    cosyvoice_dir='/app/model/cosyvoice',
	):
	    """Load CosyVoice2 into the module-level ``cosyvoice_model``.

	    The process working directory is switched to ``cosyvoice_dir`` while the
	    model loads and is always switched back before returning — including on
	    the early "required path is missing" exits and on errors.

	    Args:
	        model_path: Directory of the pretrained CosyVoice2 model.
	        ttsfrd_path: Directory of the ttsfrd package; its ``resource``
	            sub-directory must exist for initialization to proceed.
	        cosyvoice_dir: Directory used as the working directory during load.

	    Returns:
	        True when the model was loaded; False when a required path is missing
	        or loading raised (the traceback is printed in that case).
	    """
	    global cosyvoice_model

	    resource_path = os.path.join(ttsfrd_path, 'resource')
	    # Stays None if even os.getcwd() fails, so the cleanup knows there is
	    # nothing to restore.
	    original_cwd = None

	    try:
	        print("=== CosyVoice2 모델 초기화 시작 ===")

	        # Switch the working directory to the cosyvoice module location.
	        original_cwd = os.getcwd()
	        print(f"작업 디렉토리 변경: {original_cwd} -> {cosyvoice_dir}")
	        os.chdir(cosyvoice_dir)

	        # Report which of the expected paths exist.
	        print(f"모델 경로 확인: {model_path}")
	        print(f"모델 경로 존재: {os.path.exists(model_path)}")

	        print(f"ttsfrd 경로 확인: {ttsfrd_path}")
	        print(f"ttsfrd 경로 존재: {os.path.exists(ttsfrd_path)}")

	        print(f"리소스 경로 확인: {resource_path}")
	        print(f"리소스 경로 존재: {os.path.exists(resource_path)}")

	        if os.path.exists(ttsfrd_path):
	            print("ttsfrd 디렉토리 내용:")
	            for item in os.listdir(ttsfrd_path):
	                item_path = os.path.join(ttsfrd_path, item)
	                print(f" {item} ({'dir' if os.path.isdir(item_path) else 'file'})")

	        if os.path.exists(resource_path):
	            print("resource 디렉토리 내용:")
	            for item in os.listdir(resource_path):
	                print(f" {item}")

	        if not os.path.exists(model_path):
	            print(f"❌ 모델 경로가 존재하지 않습니다: {model_path}")
	            return False

	        if not os.path.exists(resource_path):
	            print(f"❌ 리소스 경로가 존재하지 않습니다: {resource_path}")
	            return False

	        # Diagnostic only: print the resource path as seen relative to the
	        # current working directory.
	        expected_resource_path = os.path.join(os.getcwd(), '../../pretrained_models/CosyVoice-ttsfrd/resource')
	        normalized_path = os.path.normpath(expected_resource_path)
	        print(f"CosyVoice가 찾는 리소스 경로: {normalized_path}")
	        print(f"해당 경로 존재 여부: {os.path.exists(normalized_path)}")

	        # Load the model.
	        print("CosyVoice2 모델 로드 중...")
	        cosyvoice_model = CosyVoice2(
	            model_path,
	            load_jit=False,
	            load_trt=False,
	            fp16=False,
	        )

	        print("✅ CosyVoice2 모델 초기화 완료!")
	        return True

	    except Exception as e:
	        print(f"❌ 모델 초기화 실패: {str(e)}")
	        traceback.print_exc()
	        return False

	    finally:
	        # Restore the working directory on every exit path; previously the
	        # early "return False" branches left the process in cosyvoice_dir.
	        if original_cwd is not None:
	            try:
	                os.chdir(original_cwd)
	            except OSError as restore_error:
	                print(f"⚠️ 작업 디렉토리 복원 실패: {restore_error}")

	# ---------------- 서버 시작 시 모델 초기화 ----------------
	from contextlib import asynccontextmanager

	@asynccontextmanager
	async def lifespan(app: FastAPI):
	    """Run model initialization once at startup, then hand control to the app."""
	    startup_banner = "🚀 서버 시작 - 모델 초기화 중..."
	    print(startup_banner)
	    # The boolean result is not inspected here; request handlers check
	    # cosyvoice_model themselves.
	    initialize_cosyvoice()
	    yield

	# FastAPI 앱에 lifespan 적용
	# Application object; the lifespan handler passed here runs the model
	# initialization when the server starts.
	app = FastAPI(
	title="CosyVoice2 Korean TTS API",
	description="FastAPI + CosyVoice2 기반 한국어 음성 합성 서버",
	version="1.0.0",
	lifespan=lifespan
	)

	# ---------------- 입력/출력 모델 ----------------
	# Request payload of the /synthesize endpoint.
	#   text:        text to synthesize
	#   prompt_text: transcript of the uploaded prompt audio
	class TTSRequest(BaseModel):
	text: str
	prompt_text: str

	class TTSResponse(BaseModel):
	    """Result envelope returned by the /synthesize endpoint."""

	    # "success" or "error".
	    status: str
	    # Human-readable description of the outcome.
	    message: str
	    # Path of the generated file relative to /download/; absent on errors,
	    # so the field must be declared optional rather than a plain str.
	    audio_path: Optional[str] = None

	# ---------------- API: JSON POST ----------------
	@app.post("/synthesize", response_model=TTSResponse)
	async def synthesize_speech(request: TTSRequest, prompt_audio: UploadFile = File(...)):
	    """
	    Speech synthesis API.

	    - text: text to synthesize
	    - prompt_text: transcript of the prompt audio
	    - prompt_audio: prompt audio file (wav, mp3, flac, ...)

	    Returns a TTSResponse; on success ``audio_path`` is the path to pass to
	    the /download/ endpoint, on failure ``status`` is "error".
	    """
	    # NOTE(review): `request` is a JSON body model while `prompt_audio` is a
	    # multipart file upload. FastAPI documents that JSON body fields cannot
	    # be combined with File/Form parameters in one request — confirm that
	    # clients can actually call this route as declared.
	    if cosyvoice_model is None:
	        return TTSResponse(
	            status="error",
	            message="모델이 초기화되지 않았습니다. 서버 로그를 확인해주세요."
	        )

	    temp_path = None
	    try:
	        # Store the prompt audio in a temp file, keeping its extension.
	        # The upload may carry no filename at all, hence the `or ''`.
	        temp_file_extension = os.path.splitext(prompt_audio.filename or '')[1].lower()
	        if not temp_file_extension:
	            temp_file_extension = '.wav'  # default

	        with tempfile.NamedTemporaryFile(delete=False, suffix=temp_file_extension) as temp_file:
	            # Record the path first so cleanup also happens if the write fails.
	            temp_path = temp_file.name
	            temp_file.write(await prompt_audio.read())

	        # Load the prompt audio at 16 kHz.
	        try:
	            prompt_speech_16k = load_wav(temp_path, 16000)
	        except Exception as e:
	            print(f"load_wav 실패: {e}")
	            # Fallback: decode with librosa directly.
	            import librosa
	            import torch
	            audio_data, sr = librosa.load(temp_path, sr=16000)
	            prompt_speech_16k = torch.from_numpy(audio_data).unsqueeze(0)

	        # Run the synthesis.
	        results_generator = cosyvoice_model.inference_zero_shot(
	            request.text,
	            prompt_text=request.prompt_text,
	            prompt_speech_16k=prompt_speech_16k,
	            text_frontend=True
	        )

	        # Materialize the generator.
	        results = list(results_generator)

	        if not results:
	            return TTSResponse(
	                status="error",
	                message="음성 합성 결과가 비어있습니다."
	            )

	        # Save the result. A random name is used instead of hash(text):
	        # identical text with a different voice prompt would otherwise
	        # overwrite an earlier result.
	        output_dir = '/app/outputs'
	        os.makedirs(output_dir, exist_ok=True)
	        output_filename = f'output_{uuid.uuid4().hex}.wav'
	        output_path = os.path.join(output_dir, output_filename)

	        torchaudio.save(output_path, results[0]['tts_speech'], cosyvoice_model.sample_rate)

	        return TTSResponse(
	            status="success",
	            message="음성 합성이 완료되었습니다.",
	            audio_path=f'outputs/{output_filename}'
	        )

	    except Exception as e:
	        # Keep the full traceback in the server log; the client only gets
	        # the message.
	        traceback.print_exc()
	        return TTSResponse(
	            status="error",
	            message=f"음성 합성 중 오류가 발생했습니다: {str(e)}"
	        )

	    finally:
	        # Remove the temp prompt file on every exit path, not only on success.
	        if temp_path is not None:
	            try:
	                os.unlink(temp_path)
	            except OSError as cleanup_error:
	                print(f"임시 파일 삭제 실패: {cleanup_error}")

	# ---------------- 오디오 파일 다운로드 ----------------
	@app.get("/download/{filepath:path}")
	async def download_audio(filepath: str):
	    """Download a synthesized audio file.

	    ``filepath`` is taken relative to /app (the handlers hand out paths of
	    the form ``outputs/<name>.wav``). Only regular files that resolve to a
	    location inside /app/outputs are served; anything else — including
	    ``..`` segments, absolute paths and symlinks leading elsewhere — gets
	    the "not found" payload.
	    """
	    not_found = {"error": "파일을 찾을 수 없습니다."}

	    try:
	        output_root = os.path.realpath('/app/outputs')
	        # os.path.join drops '/app' when filepath is absolute, and '..' can
	        # climb out of it, so resolve first and then check containment.
	        full_path = os.path.realpath(os.path.join('/app', filepath))
	        inside_outputs = os.path.commonpath([output_root, full_path]) == output_root
	    except ValueError:
	        # e.g. an embedded NUL byte in the requested path
	        return not_found

	    if inside_outputs and os.path.isfile(full_path):
	        filename = os.path.basename(filepath)
	        return FileResponse(full_path, media_type="audio/wav", filename=filename)
	    return not_found

	# ---------------- HTML UI ----------------
	@app.get("/", response_class=HTMLResponse)
	async def main_ui():
	    # Static landing page: a multipart form that posts the prompt audio, the
	    # prompt transcript and the target text to /submit.
	    page_markup = """
	<html>
	<head>
	<title>CosyVoice2 Korean TTS</title>
	<meta charset="UTF-8">
	<style>
	body {
	font-family: 'Segoe UI', Arial, sans-serif;
	max-width: 900px;
	margin: auto;
	padding: 2rem;
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	min-height: 100vh;
	}
	.container {
	background-color: white;
	padding: 2.5rem;
	border-radius: 15px;
	box-shadow: 0 10px 30px rgba(0,0,0,0.2);
	}
	.header {
	text-align: center;
	margin-bottom: 2rem;
	}
	.header h1 {
	color: #333;
	margin-bottom: 0.5rem;
	}
	.header p {
	color: #666;
	font-size: 1.1rem;
	}
	.form-group {
	margin-bottom: 1.5rem;
	}
	label {
	display: block;
	margin-bottom: 0.5rem;
	font-weight: 600;
	color: #333;
	}
	input[type="text"], textarea {
	width: 100%;
	padding: 0.75rem;
	border: 2px solid #e0e0e0;
	border-radius: 8px;
	font-size: 1rem;
	box-sizing: border-box;
	transition: border-color 0.3s;
	}
	input[type="text"]:focus, textarea:focus {
	outline: none;
	border-color: #667eea;
	}
	input[type="file"] {
	width: 100%;
	padding: 0.75rem;
	border: 2px dashed #ccc;
	border-radius: 8px;
	background-color: #f9f9f9;
	box-sizing: border-box;
	transition: all 0.3s;
	}
	input[type="file"]:hover {
	border-color: #667eea;
	background-color: #f0f4ff;
	}
	input[type="submit"] {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	color: white;
	padding: 1rem 2rem;
	border: none;
	border-radius: 8px;
	cursor: pointer;
	font-size: 1.1rem;
	font-weight: 600;
	transition: transform 0.2s;
	width: 100%;
	}
	input[type="submit"]:hover {
	transform: translateY(-2px);
	}
	.info {
	background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%);
	padding: 1.5rem;
	border-radius: 10px;
	margin-bottom: 2rem;
	border-left: 4px solid #667eea;
	}
	.example {
	background-color: #f8f9fa;
	padding: 1rem;
	border-radius: 8px;
	margin-top: 0.5rem;
	border-left: 3px solid #28a745;
	}
	.example strong {
	color: #28a745;
	}
	</style>
	</head>
	<body>
	<div class="container">
	<div class="header">
	<h1>🎤 CosyVoice2 음성 합성기</h1>
	<p>한국어 텍스트를 자연스러운 음성으로 변환해보세요!</p>
	</div>

	<div class="info">
	<strong>📋 사용 방법:</strong><br>
	1. <strong>프롬프트 음성:</strong> 목소리 스타일의 기준이 될 음성 파일을 업로드하세요<br>
	2. <strong>프롬프트 텍스트:</strong> 업로드한 음성의 실제 내용을 입력하세요<br>
	3. <strong>합성할 텍스트:</strong> 새로 생성하고 싶은 음성의 텍스트를 입력하세요<br><br>
	<strong>지원 형식:</strong> WAV
	</div>

	<form action="/submit" method="post" enctype="multipart/form-data">
	<div class="form-group">
	<label for="prompt_audio">🎵 프롬프트 음성 파일:</label>
	<input type="file" id="prompt_audio" name="prompt_audio" accept=".wav" required>
	<div class="example">
	<strong>예시:</strong> "안녕하세요"라고 말하는 음성 파일
	</div>
	</div>

	<div class="form-group">
	<label for="prompt_text">📝 프롬프트 텍스트:</label>
	<input type="text" id="prompt_text" name="prompt_text"
	placeholder="업로드한 음성의 실제 내용"
	value="안녕하세요" required>
	<div class="example">
	<strong>예시:</strong> 안녕하세요 (업로드한 음성 파일의 실제 내용)
	</div>
	</div>

	<div class="form-group">
	<label for="text">🎯 합성할 텍스트:</label>
	<textarea id="text" name="text" rows="3"
	placeholder="새로 생성하고 싶은 음성의 텍스트를 입력하세요"
	required>공룡이 밤양갱을 몰래 먹고 도망쳤어요.</textarea>
	<div class="example">
	<strong>예시:</strong> 공룡이 밤양갱을 몰래 먹고 도망쳤어요.
	</div>
	</div>

	<input type="submit" value="🚀 음성 합성 시작">
	</form>
	</div>
	</body>
	</html>
	"""
	    return page_markup

	# ---------------- 결과 렌더링 ----------------
	@app.post("/submit", response_class=HTMLResponse)
	async def handle_form(
	    request: Request,
	    text: str = Form(...),
	    prompt_text: str = Form(...),
	    prompt_audio: UploadFile = File(...)
	):
	    """Handle the HTML form: synthesize speech and render a result page.

	    Args:
	        request: The incoming request (not used by the handler body).
	        text: Text to synthesize.
	        prompt_text: Transcript of the uploaded prompt audio.
	        prompt_audio: Prompt audio upload; only ``.wav`` filenames are accepted.

	    Returns:
	        An HTML page: the result page with an audio player and download link
	        on success, or an error page when the model is not loaded, the file
	        type is rejected, or synthesis raises.
	    """
	    temp_path = None
	    try:
	        if cosyvoice_model is None:
	            return """
	<html>
	<head><title>에러</title><meta charset="UTF-8"></head>
	<body style="font-family: Arial, sans-serif; max-width: 600px; margin: auto; padding: 2rem;">
	<h1>❌ 모델 초기화 오류</h1>
	<p>CosyVoice2 모델이 아직 초기화되지 않았습니다.</p>
	<p>서버 로그를 확인하고 잠시 후 다시 시도해주세요.</p>
	<br>
	<a href="/" style="color: #667eea; text-decoration: none;">← 돌아가기</a>
	</body>
	</html>
	"""

	        # Validate the file type.
	        if not prompt_audio.filename.lower().endswith('.wav'):
	            return """
	<html>
	<head><title>에러</title><meta charset="UTF-8"></head>
	<body style="font-family: Arial, sans-serif; max-width: 600px; margin: auto; padding: 2rem;">
	<h1>❌ 파일 형식 오류</h1>
	<p>WAV 파일만 지원됩니다.</p>
	<p><strong>지원 형식:</strong> WAV</p>
	<br>
	<a href="/" style="color: #667eea; text-decoration: none;">← 돌아가기</a>
	</body>
	</html>
	"""

	        # Store the prompt audio in a temp file.
	        temp_file_extension = os.path.splitext(prompt_audio.filename)[1].lower()
	        if not temp_file_extension:
	            temp_file_extension = '.wav'  # default

	        with tempfile.NamedTemporaryFile(delete=False, suffix=temp_file_extension) as temp_file:
	            # Record the path first so cleanup also happens if the write fails.
	            temp_path = temp_file.name
	            temp_file.write(await prompt_audio.read())

	        print(f"업로드된 파일: {prompt_audio.filename}")
	        print(f"임시 파일 경로: {temp_path}")
	        print(f"파일 크기: {os.path.getsize(temp_path)} bytes")

	        # Load the prompt audio at 16 kHz, with a librosa fallback.
	        try:
	            prompt_speech_16k = load_wav(temp_path, 16000)
	            print(f"오디오 로드 성공: shape={prompt_speech_16k.shape}")
	        except Exception as e:
	            print(f"load_wav 실패: {e}")
	            # Fallback: decode with librosa directly.
	            import librosa
	            import torch
	            audio_data, sr = librosa.load(temp_path, sr=16000)
	            prompt_speech_16k = torch.from_numpy(audio_data).unsqueeze(0)
	            print(f"librosa fallback 성공: shape={prompt_speech_16k.shape}")

	        # Run the synthesis.
	        print(f"음성 합성 시작: text='{text}', prompt_text='{prompt_text}'")
	        results_generator = cosyvoice_model.inference_zero_shot(
	            text,
	            prompt_text=prompt_text,
	            prompt_speech_16k=prompt_speech_16k,
	            text_frontend=True
	        )

	        # Materialize the generator.
	        results = list(results_generator)
	        print(f"음성 합성 완료! 결과 개수: {len(results)}")

	        if not results:
	            raise Exception("음성 합성 결과가 비어있습니다.")

	        # Save the result. A random name is used instead of hash(text):
	        # identical text with a different voice prompt would otherwise
	        # overwrite an earlier result.
	        output_dir = '/app/outputs'
	        os.makedirs(output_dir, exist_ok=True)
	        output_filename = f'korean_tts_output_{uuid.uuid4().hex}.wav'
	        output_path = os.path.join(output_dir, output_filename)

	        torchaudio.save(output_path, results[0]['tts_speech'], cosyvoice_model.sample_rate)
	        print(f"오디오 파일 저장 완료: {output_path}")

	        # Relative path used by the /download/ links.
	        download_filename = f'outputs/{output_filename}'

	    except Exception as e:
	        error_details = traceback.format_exc()
	        # Keep the traceback in the server log as well.
	        print(error_details)
	        # Exception text can echo user input, so escape it before embedding
	        # it in the page.
	        safe_message = html.escape(str(e))
	        safe_details = html.escape(error_details)
	        return f"""
	<html>
	<head><title>에러</title><meta charset="UTF-8"></head>
	<body style="font-family: Arial, sans-serif; max-width: 700px; margin: auto; padding: 2rem;">
	<h1>❌ 서버 오류 발생</h1>
	<p><strong>오류 메시지:</strong></p>
	<pre style="background-color: #f8f9fa; padding: 1rem; border-radius: 5px; overflow-x: auto;">{safe_message}</pre>
	<hr>
	<details>
	<summary><strong>에러 상세 (클릭하여 펼치기)</strong></summary>
	<pre style="background-color: #f8f9fa; padding: 1rem; border-radius: 5px; overflow-x: auto;">{safe_details}</pre>
	</details>
	<br>
	<a href="/" style="color: #667eea; text-decoration: none;">← 돌아가기</a>
	</body>
	</html>
	"""

	    finally:
	        # Remove the temp prompt file on every exit path, not only on success.
	        if temp_path is not None:
	            try:
	                os.unlink(temp_path)
	            except OSError as cleanup_error:
	                print(f"임시 파일 삭제 실패: {cleanup_error}")

	    # All three values come straight from the client; escape them so they
	    # cannot inject markup or script into the result page.
	    safe_filename = html.escape(prompt_audio.filename)
	    safe_prompt_text = html.escape(prompt_text)
	    safe_text = html.escape(text)

	    return f"""
	<html>
	<head><title>합성 결과</title><meta charset="UTF-8"></head>
	<body style="font-family: Arial, sans-serif; max-width: 700px; margin: auto; padding: 2rem;">
	<h1>✅ 음성 합성 완료!</h1>

	<div style="background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%); padding: 1.5rem; border-radius: 10px; margin: 1.5rem 0;">
	<h3>📋 입력 정보</h3>
	<p><strong>프롬프트 음성:</strong> {safe_filename}</p>
	<p><strong>프롬프트 텍스트:</strong> {safe_prompt_text}</p>
	<p><strong>합성할 텍스트:</strong> {safe_text}</p>
	</div>

	<div style="background-color: #f8f9fa; padding: 1.5rem; border-radius: 10px; border-left: 4px solid #28a745;">
	<h3>🎵 합성된 음성</h3>
	<audio controls style="width: 100%; margin: 1rem 0;">
	<source src="/download/{download_filename}" type="audio/wav">
	브라우저가 오디오를 지원하지 않습니다.
	</audio>
	<br>
	<a href="/download/{download_filename}"
	style="background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
	color: white; padding: 0.75rem 1.5rem; text-decoration: none;
	border-radius: 8px; display: inline-block; margin-top: 1rem;">
	📥 파일 다운로드
	</a>
	</div>

	<br>
	<a href="/" style="color: #667eea; text-decoration: none; font-size: 1.1rem;">← 다시 시도하기</a>
	</body>
	</html>
	"""

	# ---------------- 헬스 체크 ----------------
	@app.get("/health")
	async def health_check():
	    # Health probe: reports whether the module-level model has been set.
	    model_ready = cosyvoice_model is not None
	    if model_ready:
	        state = "ok"
	    else:
	        state = "initializing"
	    return {
	        "status": state,
	        "model_loaded": model_ready,
	        "description": "CosyVoice2 Korean TTS Server",
	    }

	if __name__ == "__main__":
	    # uvicorn is imported here, so it is only needed when the file is run
	    # directly as a script.
	    import uvicorn

	    bind_host, bind_port = "0.0.0.0", 7860
	    uvicorn.run(app, host=bind_host, port=bind_port)