Spaces:

speako
/

wav2vec2-server

Sleeping

File size: 8,482 Bytes

3fe0be9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82307da
 
 
3fe0be9
 
 
 
82307da
3fe0be9
82307da
3fe0be9
82307da
3fe0be9
 
82307da
 
23ecab6
3fe0be9
 
82307da
3fe0be9
82307da
3fe0be9
 
 
82307da
3fe0be9

import html
import os
import tempfile
import traceback

import yaml
from fastapi import FastAPI, Request, File, UploadFile, Form
from fastapi.responses import HTMLResponse
from pydantic import BaseModel

from model.wav2vec2 import Wav2Vec2

# ---------------- Load configuration ----------------
# Decode the YAML explicitly as UTF-8 so parsing does not depend on the
# host's locale default encoding.
with open("config/wav2vec2.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# ---------------- Initialize model ----------------
# Constructed once at import time from the loaded configuration.
wav2vec2_model = Wav2Vec2(config)

# ---------------- FastAPI app ----------------
app = FastAPI(
    title="Korean Speech Recognition API",
    description="FastAPI + Wav2Vec2 기반 한국어 음성 인식 서버",
    version="1.0.0"
)

# ---------------- Response model ----------------
# JSON body returned by the /transcribe endpoint (used as its response_model).
class TranscriptionResponse(BaseModel):
    # Recognized text; the /transcribe handler sets it to "" when reporting an error.
    transcription: str
    # "success", or a message starting with "error: " that describes the failure.
    status: str

# ---------------- API: file upload (POST) ----------------
@app.post("/transcribe", response_model=TranscriptionResponse)
async def transcribe_audio(file: UploadFile = File(...)):
    """Run speech recognition on an uploaded audio file.

    Failures are reported through the ``status`` field of the response body
    (a message prefixed with ``"error: "``) instead of being raised.
    """
    # The upload's filename is optional; normalise a missing one to "" so it is
    # rejected as an unsupported format instead of failing on ``None.lower()``.
    filename = file.filename or ""

    # Validate the file format by extension.
    if not filename.lower().endswith(('.wav', '.mp3', '.flac', '.m4a')):
        return TranscriptionResponse(
            transcription="",
            status="error: 지원되지 않는 파일 형식입니다. wav, mp3, flac, m4a 파일만 지원됩니다."
        )

    try:
        # Read the whole upload into memory.
        audio_bytes = await file.read()

        # Run speech recognition; the filename is forwarded to the model wrapper.
        result = wav2vec2_model.transcribe_from_bytes(audio_bytes, filename)

        # Built inside the try so a response-validation failure is also
        # reported as an error status.
        return TranscriptionResponse(
            transcription=result,
            status="success"
        )

    except Exception as e:
        # Request boundary: convert any failure into an error response.
        return TranscriptionResponse(
            transcription="",
            status=f"error: {str(e)}"
        )

# ---------------- HTML UI ----------------
@app.get("/", response_class=HTMLResponse)
async def main_ui():
    """Serve the static upload page; its form posts the chosen audio file to /submit."""
    return """
    <html>
        <head>
            <title>Korean Speech Recognition</title>
            <meta charset="UTF-8">
            <style>
                body {
                    font-family: Arial, sans-serif;
                    max-width: 800px;
                    margin: auto;
                    padding: 2rem;
                    background-color: #f5f5f5;
                }
                .container {
                    background-color: white;
                    padding: 2rem;
                    border-radius: 10px;
                    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
                }
                .form-group {
                    margin-bottom: 1.5rem;
                }
                label {
                    display: block;
                    margin-bottom: 0.5rem;
                    font-weight: bold;
                    color: #333;
                }
                input[type="file"] {
                    padding: 0.5rem;
                    border: 2px dashed #ccc;
                    border-radius: 5px;
                    width: 100%;
                    box-sizing: border-box;
                }
                input[type="submit"] {
                    background-color: #007bff;
                    color: white;
                    padding: 1rem 2rem;
                    border: none;
                    border-radius: 5px;
                    cursor: pointer;
                    font-size: 1rem;
                }
                input[type="submit"]:hover {
                    background-color: #0056b3;
                }
                .info {
                    background-color: #e7f3ff;
                    padding: 1rem;
                    border-radius: 5px;
                    margin-bottom: 1rem;
                    border-left: 4px solid #007bff;
                }
            </style>
        </head>
        <body>
            <div class="container">
                <h1>🎤 한국어 음성 인식</h1>
                <div class="info">
                    <strong>지원 형식:</strong> WAV, MP3, FLAC, M4A<br>
                    <strong>모델:</strong> Wav2Vec2 Korean Fine-tuned
                </div>
                
                <form action="/submit" method="post" enctype="multipart/form-data">
                    <div class="form-group">
                        <label for="audio_file">🎵 오디오 파일 선택:</label>
                        <input type="file" id="audio_file" name="audio_file" accept=".wav,.mp3,.flac,.m4a" required>
                    </div>
                    
                    <input type="submit" value="음성 인식 실행">
                </form>
            </div>
        </body>
    </html>
    """

# ---------------- Result rendering ----------------
@app.post("/submit", response_class=HTMLResponse)
async def handle_form(request: Request, audio_file: UploadFile = File(...)):
    """Handle the upload form and render the transcription as an HTML page.

    Args:
        request: Incoming request; not used, kept so the signature is unchanged.
        audio_file: Audio file submitted through the form on the main page.

    Returns:
        An HTML document: the result page on success, or an error page when
        the file extension is unsupported or processing raises.
    """
    # The upload's filename is optional; a missing one is treated as an
    # unsupported format rather than surfacing as a server error.
    filename = audio_file.filename or ""

    try:
        # Validate the file format by extension.
        if not filename.lower().endswith(('.wav', '.mp3', '.flac', '.m4a')):
            return """
            <html>
                <head><title>에러</title><meta charset="UTF-8"></head>
                <body style="font-family: Arial, sans-serif; max-width: 600px; margin: auto; padding: 2rem;">
                    <h1>❌ 파일 형식 오류</h1>
                    <p>지원되지 않는 파일 형식입니다.</p>
                    <p><strong>지원 형식:</strong> WAV, MP3, FLAC, M4A</p>
                    <br>
                    <a href="/" style="color: #007bff; text-decoration: none;">← 돌아가기</a>
                </body>
            </html>
            """

        # Read the whole upload into memory.
        audio_bytes = await audio_file.read()

        # Run speech recognition; the filename is forwarded to the model wrapper.
        result = wav2vec2_model.transcribe_from_bytes(audio_bytes, filename)

    except Exception as e:
        # Exception text and traceback can contain markup characters (and
        # echoed user input), so escape both before embedding them in HTML.
        # NOTE(review): returning the full traceback to the client discloses
        # server internals; consider logging it server-side instead.
        safe_message = html.escape(str(e))
        safe_trace = html.escape(traceback.format_exc())
        return f"""
        <html>
            <head><title>에러</title><meta charset="UTF-8"></head>
            <body style="font-family: Arial, sans-serif; max-width: 600px; margin: auto; padding: 2rem;">
                <h1>❌ 서버 오류 발생</h1>
                <p><strong>오류 메시지:</strong></p>
                <pre style="background-color: #f8f9fa; padding: 1rem; border-radius: 5px; overflow-x: auto;">{safe_message}</pre>
                <hr>
                <details>
                    <summary><strong>에러 상세 (클릭하여 펼치기)</strong></summary>
                    <pre style="background-color: #f8f9fa; padding: 1rem; border-radius: 5px; overflow-x: auto;">{safe_trace}</pre>
                </details>
                <br>
                <a href="/" style="color: #007bff; text-decoration: none;">← 돌아가기</a>
            </body>
        </html>
        """

    # The filename is client-controlled and the transcription is arbitrary
    # text; escape both so they cannot inject markup into the page.
    safe_name = html.escape(filename)
    safe_text = html.escape(str(result))
    return f"""
    <html>
        <head><title>결과</title><meta charset="UTF-8"></head>
        <body style="font-family: Arial, sans-serif; max-width: 600px; margin: auto; padding: 2rem;">
            <h1>✅ 음성 인식 결과</h1>
            <div style="background-color: #f8f9fa; padding: 1rem; border-radius: 5px; margin: 1rem 0;">
                <p><strong>업로드된 파일:</strong> {safe_name}</p>
                <p><strong>파일 크기:</strong> {len(audio_bytes):,} bytes</p>
            </div>
            <hr>
            <h2>🎯 인식된 텍스트:</h2>
            <div style="background-color: #e7f3ff; padding: 1.5rem; border-radius: 5px; border-left: 4px solid #007bff;">
                <pre style="font-size: 1.1rem; margin: 0; white-space: pre-wrap; word-wrap: break-word;">{safe_text}</pre>
            </div>
            <br>
            <a href="/" style="color: #007bff; text-decoration: none;">← 다시 시도하기</a>
        </body>
    </html>
    """

# ---------------- Health check ----------------
@app.get("/health")
async def health_check():
    """Return a fixed "ok" status plus the model settings from the loaded config."""
    model_cfg = config["model"]
    payload = {"status": "ok"}
    payload["model"] = model_cfg["id"]
    payload["device"] = model_cfg["device"]
    payload["sampling_rate"] = model_cfg["sampling_rate"]
    return payload

# ---------------- Model info ----------------
@app.get("/info")
async def model_info():
    """Describe the configured model and the audio formats the server accepts."""
    settings = config["model"]
    return dict(
        model_id=settings["id"],
        device=settings["device"],
        sampling_rate=settings["sampling_rate"],
        supported_formats=["wav", "mp3", "flac", "m4a"],
        description="Korean Speech Recognition using Wav2Vec2",
    )