Osi30 committed on
Commit
1caa8b9
·
1 Parent(s): bc8dd96

Add application file

Browse files
Files changed (3) hide show
  1. Dockerfile +25 -0
  2. main.py +92 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# NOTE: slim-buster is end-of-life (Debian 10 repos moved to archive.debian.org),
# which breaks `apt-get update` on fresh builds — use the bullseye-based image.
FROM python:3.9-slim-bullseye

# System library required by the soundfile (libsndfile) Python package.
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Create and switch to a non-root user, per Hugging Face Spaces security policy.
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:${PATH}"

WORKDIR /app

# Install Python dependencies first so this layer is cached across code-only edits.
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy the application code.
COPY --chown=user . .

# Hugging Face Spaces routes traffic to port 7860 by default.
EXPOSE 7860

# Run the FastAPI app with uvicorn on the Spaces port.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from transformers import VitsModel, AutoTokenizer
4
+ import torch
5
+ import soundfile as sf
6
+ import io
7
+ import base64
8
+ import os
9
+
10
# FastAPI application instance; title/description appear in the auto-generated
# OpenAPI docs at /docs.
app = FastAPI(
    title="MMS-TTS Vietnamese API",
    description="A simple API for Vietnamese Text-to-Speech using facebook/mms-tts-vie.",
)
14
+
15
class TTSRequest(BaseModel):
    """Request payload accepted by the /synthesize_speech endpoint."""

    text: str
    # mms-tts-vie is single-speaker; field kept for potential multi-speaker models.
    speaker_id: int = 0
    # Optional speech-speed multiplier.
    speed_factor: float = 1.0
20
+
21
# Model and tokenizer are loaded once at startup and shared across requests,
# avoiding a costly reload on every call.
model = None
tokenizer = None

@app.on_event("startup")
async def startup_event():
    """Load the MMS-TTS model and tokenizer when the application boots.

    On failure the globals are left as None so endpoints can respond with
    503 instead of the process crashing at import time.
    """
    global model, tokenizer
    print("Loading MMS-TTS model 'facebook/mms-tts-vie'...")
    try:
        model = VitsModel.from_pretrained("facebook/mms-tts-vie")
        tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-vie")
    except Exception as exc:
        # Surface the full traceback in the server logs for debugging.
        import traceback
        traceback.print_exc()
        print(f"ERROR: Failed to load MMS-TTS model or tokenizer on startup: {exc}")
        model = None  # Ensure a clearly unloaded state on failure.
        tokenizer = None
    else:
        print("MMS-TTS model and tokenizer loaded successfully.")
45
+
46
@app.get("/")
async def read_root():
    """Health check: reports liveness and whether the TTS model is loaded."""
    loaded = model is not None
    return {"message": "MMS-TTS Vietnamese API is running!", "model_loaded": loaded}
50
+
51
+
52
@app.post("/synthesize_speech")
async def synthesize_speech(request: TTSRequest):
    """Synthesize speech for ``request.text``.

    Returns the audio as a Base64-encoded WAV byte string. NOTE(review):
    ``request.speaker_id`` and ``request.speed_factor`` are accepted but not
    applied here — mms-tts-vie is single-speaker; confirm before relying on them.
    """
    if model is None or tokenizer is None:
        raise HTTPException(
            status_code=503,
            detail="TTS model is not loaded. Please try again later or check server logs."
        )

    try:
        # Tokenize the Vietnamese input text for the VITS model.
        encoded_input = tokenizer(request.text, return_tensors="pt")

        # Inference only — no gradients needed.
        with torch.no_grad():
            waveform = model(**encoded_input).waveform

        # Serialize to WAV in memory; the sampling rate must come from the
        # model config to match the generated audio.
        wav_buffer = io.BytesIO()
        sf.write(
            wav_buffer,
            waveform.numpy().squeeze(),
            model.config.sampling_rate,
            format='WAV',
        )

        # Base64-encode the WAV bytes for a JSON-friendly response.
        audio_base64 = base64.b64encode(wav_buffer.getvalue()).decode('utf-8')
        return {"audio_base64": audio_base64}

    except Exception as e:
        # Full traceback to the console for debugging, generic 500 to the client.
        import traceback
        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"An error occurred during speech synthesis: {e}"
        )
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# CPU-only PyTorch wheels — much smaller than the default CUDA builds.
--extra-index-url https://download.pytorch.org/whl/cpu
torch
transformers
fastapi
uvicorn
pydantic
# soundfile needs the libsndfile1 system package (installed in the Dockerfile).
soundfile