vidhi0405 committed on
Commit
855c74b
·
0 Parent(s):

Initial deployment

Browse files
Files changed (8) hide show
  1. .dockerignore +9 -0
  2. .gitattributes +36 -0
  3. .gitignore +5 -0
  4. Dockerfile +23 -0
  5. README.md +15 -0
  6. app.py +299 -0
  7. model.py +69 -0
  8. requirements.txt +9 -0
.dockerignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ .env
7
+ .venv/
8
+ venv/
9
+ *.log
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ hint.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .env
4
+ .venv/
5
+ .venv310/
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10

WORKDIR /code

# Copy only the requirements first so the dependency layer stays cached
# when application sources change.
COPY ./requirements.txt /code/requirements.txt

# Install system dependencies for soundfile
RUN apt-get update && apt-get install -y libsndfile1 && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# The application runs from the user's home directory, so the sources are
# copied exactly once, owned by that user (no extra root-owned copy in /code).
WORKDIR $HOME/app
COPY --chown=user . $HOME/app

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: tts Text To Speech
3
+ emoji: 🌍
4
+ colorFrom: yellow
5
+ colorTo: pink
6
+ sdk: docker
7
+ sdk_version: 5.25.2
8
+ python_version: 3.10.0
9
+ app_file: app.py
10
+ pinned: false
11
+ short_description: Text-to-speech (TTS) with Next-gen Kaldi
12
+ license: apache-2.0
13
+ ---
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import base64
4
+ import io
5
+ import os
6
+ import time
7
+ import uuid
8
+ from datetime import datetime
9
+ from functools import lru_cache
10
+ from typing import Optional
11
+
12
+ import gradio as gr
13
+ import pymongo
14
+ import soundfile as sf
15
+ from bson.binary import Binary
16
+ from bson.objectid import ObjectId
17
+ from dotenv import load_dotenv
18
+ from fastapi import Body, FastAPI, Form, Request, Response
19
+ from pydantic import BaseModel
20
+
21
+ from model import ENGLISH_REPO_ID, get_pretrained_model
22
+
23
+ load_dotenv()
24
+
25
+ MONGO_URI = os.getenv("MONGO_URI", "").strip()
26
+ MONGO_DB_NAME = os.getenv("MONGO_DB_NAME", "image_to_speech").strip()
27
+ MONGO_COLLECTION = os.getenv("MONGO_COLLECTION", "audio").strip()
28
+ MONGO_CAPTIONS_COLLECTION = os.getenv("MONGO_CAPTIONS_COLLECTION", "captions").strip()
29
+
30
+
31
def log(msg: str) -> None:
    """Print ``msg`` to stdout, prefixed with the current local time.

    The prefix has the form ``YYYY-MM-DD HH:MM:SS.ffffff`` and is separated
    from the message by ``": "``.
    """
    stamp = format(datetime.now(), "%Y-%m-%d %H:%M:%S.%f")
    print(stamp, msg, sep=": ")
34
+
35
+
36
def build_html_output(msg: str, style: str = "result_item_success") -> str:
    """Wrap ``msg`` in the HTML snippet shown in the UI's info panel.

    Args:
        msg: Text or HTML fragment to display; it is inserted verbatim.
        style: Extra CSS class placed next to ``result_item`` on the inner
            ``div`` (for example ``result_item_error``).

    Returns:
        The HTML markup as a string.
    """
    pieces = (
        "",
        "<div class='result'>",
        f"<div class='result_item {style}'>",
        f"{msg}",
        "</div>",
        "</div>",
        "",
    )
    return "\n".join(pieces)
44
+
45
+
46
@lru_cache(maxsize=1)
def _get_mongo_client():
    """Return the shared MongoDB client, creating it on the first call.

    The successful result is memoized by ``lru_cache`` so later calls reuse
    the same client object.

    Raises:
        ValueError: If ``MONGO_URI`` is empty.
    """
    if MONGO_URI:
        return pymongo.MongoClient(MONGO_URI)
    raise ValueError("MONGO_URI is missing in .env")
51
+
52
+
53
def _get_mongo_collection():
    """Return the collection (``MONGO_COLLECTION``) holding audio documents."""
    database = _get_mongo_client()[MONGO_DB_NAME]
    return database[MONGO_COLLECTION]
56
+
57
+
58
def _get_captions_collection():
    """Return the collection (``MONGO_CAPTIONS_COLLECTION``) holding captions."""
    database = _get_mongo_client()[MONGO_DB_NAME]
    return database[MONGO_CAPTIONS_COLLECTION]
61
+
62
+
63
def _as_opus_bytes(samples, sample_rate: int) -> bytes:
    """Encode ``samples`` as Opus in an Ogg container and return the bytes.

    Args:
        samples: Audio samples accepted by ``soundfile.write``.
        sample_rate: Sample rate passed to the encoder.
    """
    with io.BytesIO() as sink:
        sf.write(sink, samples, samplerate=sample_rate, format="OGG", subtype="OPUS")
        return sink.getvalue()
67
+
68
+
69
def _save_audio_to_db(
    samples,
    sample_rate: int,
    caption_id: Optional[str] = None,
    caption: Optional[str] = None,
) -> dict:
    """Encode audio as Opus, store it in MongoDB and return its metadata.

    Args:
        samples: Generated audio samples.
        sample_rate: Sample rate of ``samples``.
        caption_id: Optional identifier of the caption the audio was made
            from; stored only when truthy.
        caption: Optional source text; stored only when truthy.

    Returns:
        A dict with ``audio_file_id`` (the inserted document's ``_id`` as a
        string), ``audio_id`` (a fresh UUID4 string), ``sample_rate``,
        ``duration_seconds``, ``caption_id`` and ``caption``.
    """
    # Local import: the module imports only ``datetime`` from ``datetime``.
    from datetime import timezone

    audio_id = str(uuid.uuid4())
    duration = float(len(samples) / sample_rate)
    rate = int(sample_rate)
    opus_bytes = _as_opus_bytes(samples, sample_rate)

    doc = {
        "audio_id": audio_id,
        "audio_file": Binary(opus_bytes),
        "sample_rate": rate,
        "duration_seconds": duration,
        "audio_format": "opus",
        # Timezone-aware UTC timestamp; ``datetime.utcnow()`` is naive and
        # deprecated since Python 3.12.
        "created_at": datetime.now(timezone.utc),
    }
    if caption_id:
        doc["caption_id"] = caption_id
    if caption:
        doc["caption"] = caption

    inserted = _get_mongo_collection().insert_one(doc)
    return {
        "audio_file_id": str(inserted.inserted_id),
        "audio_id": audio_id,
        "sample_rate": rate,
        "duration_seconds": duration,
        "caption_id": caption_id,
        "caption": caption,
    }
100
+
101
+
102
def _generate_audio_from_text(
    text: str,
    sid: int,
    speed: float,
    caption_id: Optional[str] = None,
) -> dict:
    """Synthesize ``text`` and persist the result via ``_save_audio_to_db``.

    Args:
        text: Text to synthesize; also stored as the caption.
        sid: Speaker ID passed to the TTS engine.
        speed: Speed passed to ``get_pretrained_model``.
        caption_id: Optional caption identifier stored with the audio.

    Returns:
        The metadata dict returned by ``_save_audio_to_db``.

    Raises:
        ValueError: If the engine produced no samples.
    """
    engine = get_pretrained_model(ENGLISH_REPO_ID, speed)
    generated = engine.generate(text, sid=sid)
    if not len(generated.samples):
        raise ValueError("No audio was generated.")
    return _save_audio_to_db(
        generated.samples,
        generated.sample_rate,
        caption_id=caption_id,
        caption=text,
    )
118
+
119
+
120
def process(text: str, sid: str, speed: float):
    """Gradio callback: synthesize ``text`` and report timing information.

    Args:
        text: Input text from the UI (at most 4000 characters).
        sid: Speaker ID as typed into the textbox; must parse as an integer.
        speed: Speed passed to ``get_pretrained_model``.

    Returns:
        A pair ``(audio, html)``. ``audio`` is ``(sample_rate, samples)`` on
        success and ``None`` when the input is rejected; ``html`` is the
        status panel built by ``build_html_output``.

    Raises:
        ValueError: If the engine produced no samples.
    """
    max_len = 4000
    log(f"Input text {len(text)}: {text[:max_len]}. sid: {sid}, speed: {speed}")
    if len(text) > max_len:
        info = (
            "Text is too long for this demo. Please keep it under 4000 characters."
        )
        return None, build_html_output(info, style="result_item_error")

    # Reject blank input up front instead of failing inside the engine.
    if not text.strip():
        return None, build_html_output(
            "Please enter some text to synthesize.",
            style="result_item_error",
        )

    # The speaker ID arrives as free text; report a bad value in the UI
    # instead of letting int() raise.
    try:
        speaker_id = int(sid)
    except (TypeError, ValueError):
        return None, build_html_output(
            "Speaker ID must be an integer.",
            style="result_item_error",
        )

    tts = get_pretrained_model(ENGLISH_REPO_ID, speed)

    start = time.time()
    audio = tts.generate(text, sid=speaker_id)
    end = time.time()

    if len(audio.samples) == 0:
        raise ValueError("No audio was generated.")

    duration = len(audio.samples) / audio.sample_rate
    elapsed = end - start
    rtf = elapsed / duration  # real-time factor: processing time per second of audio
    info = (
        f"Wave duration: {duration:.3f}s<br/>"
        f"Processing time: {elapsed:.3f}s<br/>"
        f"RTF: {elapsed:.3f}/{duration:.3f} = {rtf:.3f}<br/>"
    )

    saved = _save_audio_to_db(audio.samples, audio.sample_rate)
    info += f"Audio ID: {saved['audio_id']}<br/>Saved to MongoDB"
    return (audio.sample_rate, audio.samples), build_html_output(info)
150
+
151
+
152
# Styles for the status panel markup produced by build_html_output().
css = """
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
"""


# Gradio UI: text, speaker ID and speed inputs wired to process(), which
# fills the audio player and the HTML info panel.
with gr.Blocks(css=css) as demo:
    # A Markdown ATX heading needs a space after "#"; without it the text is
    # rendered literally instead of as a title.
    gr.Markdown("# Text-to-Speech")
    gr.Markdown(f"Model: `{ENGLISH_REPO_ID}`")

    input_text = gr.Textbox(
        label="Input text (English)",
        lines=3,
        placeholder="Type English text here...",
    )
    input_sid = gr.Textbox(
        label="Speaker ID",
        lines=1,
        max_lines=1,
        value="0",
        placeholder="0 to 10",
    )
    input_speed = gr.Slider(
        minimum=0.1,
        maximum=10,
        value=1,
        step=0.1,
        label="Speed",
    )

    input_button = gr.Button("Submit")
    output_audio = gr.Audio(label="Output")
    output_info = gr.HTML(label="Info")

    input_button.click(
        process,
        inputs=[input_text, input_sid, input_speed],
        outputs=[output_audio, output_info],
    )
193
+
194
+
195
class AudioByIdRequest(BaseModel):
    """Request body model for ``POST /audio/by-id``."""

    # Identifier to resolve; the endpoint tries it as an audio ID, a document
    # ``_id``, and a caption ID.
    audio_id: str
    # Speaker ID, used only when audio has to be synthesized from a caption.
    sid: Optional[int] = 0
    # Speed, used only when audio has to be synthesized from a caption.
    speed: Optional[float] = 1.0


# FastAPI application that carries the REST endpoints defined below.
api = FastAPI(title="Text-to-Speech API")
202
+
203
+
204
+ def _api_response(succes: bool, messase: str, data):
205
+ return {"succes": succes, "messase": messase, "data": data}
206
+
207
+
208
def _find_audio_doc(identifier: str):
    """Look up an audio document by ``audio_id``, then by MongoDB ``_id``.

    Args:
        identifier: Value to match against the ``audio_id`` field; if nothing
            matches and it is a valid ``ObjectId`` string, it is also tried
            as the document ``_id``.

    Returns:
        The matching document, or ``None`` when nothing is found.
    """
    by_audio_id = _get_mongo_collection().find_one({"audio_id": identifier})
    if by_audio_id:
        return by_audio_id
    if not ObjectId.is_valid(identifier):
        return None
    return _get_mongo_collection().find_one({"_id": ObjectId(identifier)})
215
+
216
+
217
def _lookup_or_generate_audio(identifier: str, sid, speed):
    """Resolve ``identifier`` to an audio document, synthesizing if needed.

    Lookup order: an existing audio document (by ``audio_id`` or ``_id``);
    then, when ``identifier`` is a valid ``ObjectId`` string, an audio
    document whose ``caption_id`` matches; finally a caption document with
    that ``_id``, whose text is synthesized and stored.

    Returns:
        The audio document, or ``None`` when nothing could be found or
        generated. Generation errors are logged, not raised.
    """
    doc = _find_audio_doc(identifier)
    if doc or not ObjectId.is_valid(identifier):
        return doc

    doc = _get_mongo_collection().find_one({"caption_id": identifier})
    if doc:
        return doc

    caption_doc = _get_captions_collection().find_one({"_id": ObjectId(identifier)})
    if not caption_doc:
        return None
    # ``or ""`` keeps a stored null caption from being synthesized as the
    # literal text "None".
    caption_text = str(caption_doc.get("caption") or "").strip()
    if not caption_text:
        return None

    try:
        saved = _generate_audio_from_text(
            caption_text,
            sid=sid,
            speed=speed,
            caption_id=identifier,
        )
        return _find_audio_doc(saved["audio_id"])
    except Exception as e:
        # Best effort: the caller reports "Audio not found" on failure.
        log(f"Error generating audio from caption {identifier}: {e}")
        return None


@api.post("/audio/by-id")
async def get_audio_by_id(
    request: Request,
    payload: Optional[AudioByIdRequest] = Body(default=None),
    audio_id: Optional[str] = Form(default=None),
    sid: Optional[int] = Form(default=0),
    speed: Optional[float] = Form(default=1.0),
):
    """Return metadata and a streaming URL for an audio clip.

    Accepts the identifier either as form fields (``audio_id``, ``sid``,
    ``speed``) or as a JSON object with the same keys. If no audio exists
    for the identifier but it names a caption, audio is synthesized from the
    caption text and stored first.

    Returns:
        The ``_api_response`` envelope. On success ``data`` holds
        ``audio_id``, ``audio_url``, ``sample_rate``, ``duration_seconds``
        and ``caption``; on failure ``data`` is ``None``.
    """
    import asyncio

    # Because Form parameters are declared, FastAPI parses the body as form
    # data, so a JSON body never populates ``payload``. Read JSON explicitly
    # when no form field arrived.
    content_type = request.headers.get("content-type", "").lower()
    if payload is None and audio_id is None and "json" in content_type:
        try:
            raw = await request.json()
        except ValueError:
            return _api_response(False, "Request body is not valid JSON", None)
        if isinstance(raw, dict) and raw.get("audio_id") is not None:
            try:
                payload = AudioByIdRequest(**raw)
            except ValueError:  # pydantic's ValidationError is a ValueError
                return _api_response(False, "Invalid request body", None)

    resolved_audio_id = audio_id or (payload.audio_id if payload else None)
    resolved_sid = payload.sid if payload and payload.sid is not None else sid
    resolved_speed = payload.speed if payload and payload.speed is not None else speed

    if not resolved_audio_id:
        return _api_response(False, "audio_id is required", None)

    # Database access and synthesis are blocking; run them in a worker thread
    # so the event loop stays responsive.
    doc = await asyncio.to_thread(
        _lookup_or_generate_audio,
        resolved_audio_id,
        resolved_sid,
        resolved_speed,
    )

    if not doc:
        return _api_response(False, "Audio not found", None)

    # ``or b""`` also covers a stored null, which bytes() would reject.
    audio_bytes = bytes(doc.get("audio_file") or b"")
    if not audio_bytes:
        return _api_response(False, "Document found but audio_file is missing", None)

    resolved_id = str(doc.get("audio_id") or doc.get("_id"))
    audio_url = str(request.base_url) + f"audio/{resolved_id}.opus"

    return _api_response(
        True,
        "Audio fetched successfully",
        {
            "audio_id": resolved_id,
            "audio_url": audio_url,
            "sample_rate": int(doc.get("sample_rate", 0)),
            "duration_seconds": float(doc.get("duration_seconds", 0.0)),
            "caption": doc.get("caption"),
        },
    )
272
+
273
+
274
@api.get("/audio/{audio_id}.opus")
def stream_audio(audio_id: str):
    """Serve the stored Opus audio for ``audio_id``.

    Returns:
        A 404 response when no document matches or it has no audio bytes;
        otherwise an ``audio/ogg`` response carrying the stored bytes with
        an inline ``Content-Disposition`` and a one-year ``Cache-Control``.
    """
    doc = _find_audio_doc(audio_id)
    audio_bytes = bytes(doc.get("audio_file", b"")) if doc else b""
    if not audio_bytes:
        return Response(status_code=404)

    resolved_id = str(doc.get("audio_id") or doc.get("_id"))
    response_headers = {
        "Content-Disposition": f'inline; filename="{resolved_id}.opus"',
        "Cache-Control": "public, max-age=31536000",
    }
    return Response(
        content=audio_bytes,
        media_type="audio/ogg",
        headers=response_headers,
    )
293
+
294
+
295
# Mount the Gradio UI at "/" on the FastAPI application. ``app`` is the ASGI
# object an ASGI server loads (for example ``uvicorn app:app``).
app = gr.mount_gradio_app(api, demo, path="/")


if __name__ == "__main__":
    # NOTE(review): this launches the Gradio Blocks object directly rather
    # than ``app``; the routes registered on ``api`` are presumably not
    # served in this mode - confirm before relying on ``python app.py``.
    demo.launch()
model.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+ from pathlib import Path
3
+
4
+ import sherpa_onnx
5
+ from huggingface_hub import hf_hub_download, snapshot_download
6
+
7
+ ENGLISH_REPO_ID = "vidhi0405/TextToSpeech"
8
+
9
+
10
+ def _normalize_repo_id(repo_id: str) -> str:
11
+ v = repo_id.strip()
12
+ if v.startswith("https://huggingface.co/"):
13
+ v = v.removeprefix("https://huggingface.co/").strip("/")
14
+ return v
15
+
16
+
17
def _get_file(repo_id: str, filename: str, subfolder: str) -> str:
    """Fetch one file from a Hugging Face repo via ``hf_hub_download``.

    Args:
        repo_id: Repository to download from.
        filename: Name of the file inside ``subfolder``.
        subfolder: Folder within the repository that holds the file.

    Returns:
        The value returned by ``hf_hub_download``.
    """
    location = {
        "repo_id": repo_id,
        "filename": filename,
        "subfolder": subfolder,
    }
    return hf_hub_download(**location)
23
+
24
+
25
@lru_cache(maxsize=2)
def get_pretrained_model(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Download the Kokoro model files and build an offline TTS engine.

    Results are memoized per ``(repo_id, speed)`` pair; at most two engines
    are kept.

    Args:
        repo_id: Repo ID or ``https://huggingface.co/...`` URL; must resolve
            to ``ENGLISH_REPO_ID``.
        speed: Speaking speed; the model's ``length_scale`` is ``1 / speed``.
            Must be greater than zero.

    Returns:
        A configured ``sherpa_onnx.OfflineTts`` instance.

    Raises:
        ValueError: If the repo is unsupported or ``speed`` is not positive.
    """
    source_repo = _normalize_repo_id(repo_id)
    if source_repo != ENGLISH_REPO_ID:
        raise ValueError(f"Unsupported repo_id: {repo_id}. Use {ENGLISH_REPO_ID}")

    # ``length_scale`` is 1 / speed: zero would raise ZeroDivisionError and a
    # negative value would give a meaningless scale. ``not speed > 0`` also
    # rejects NaN.
    if not speed > 0:
        raise ValueError(f"speed must be greater than 0, got {speed!r}")

    model_dir = "kokoro-en-v0_19"
    model, tokens, voices = (
        _get_file(repo_id=source_repo, filename=name, subfolder=model_dir)
        for name in ("model.onnx", "tokens.txt", "voices.bin")
    )

    # The espeak-ng data is a directory tree, so fetch it as a filtered
    # snapshot rather than file by file.
    root_dir = snapshot_download(
        repo_id=source_repo,
        allow_patterns=[f"{model_dir}/espeak-ng-data/*"],
    )
    data_dir = str(Path(root_dir) / model_dir / "espeak-ng-data")

    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
                model=model,
                voices=voices,
                tokens=tokens,
                data_dir=data_dir,
                length_scale=1.0 / speed,
            ),
            provider="cpu",
            debug=True,
            num_threads=2,
        ),
        max_num_sentences=1,
    )
    return sherpa_onnx.OfflineTts(tts_config)
69
+
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ gradio
4
+ pymongo
5
+ soundfile
6
+ python-dotenv
7
+ pydantic
8
+ sherpa-onnx
9
+ huggingface_hub