Spaces:
Sleeping
Sleeping
[NOTICKET] feat: endpoint stt and tts
Browse files- main.py +35 -3
- pyproject.toml +1 -0
- src/stt/deepgram_rest.py +32 -0
- uv.lock +11 -0
main.py
CHANGED
|
@@ -1,10 +1,13 @@
|
|
| 1 |
import json
|
| 2 |
import logging
|
| 3 |
import uvicorn
|
| 4 |
-
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
| 5 |
-
from fastapi.responses import JSONResponse
|
|
|
|
| 6 |
from src.pipeline import EchoPipeline
|
| 7 |
-
from src.config import DEEPGRAM_API_KEY, CARTESIA_API_KEY, CARTESIA_VOICE_ID
|
|
|
|
|
|
|
| 8 |
|
| 9 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
|
| 10 |
logger = logging.getLogger(__name__)
|
|
@@ -32,6 +35,35 @@ async def health() -> JSONResponse:
|
|
| 32 |
return JSONResponse(status_code=200 if all_ready else 503, content=body)
|
| 33 |
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
@app.websocket("/ws/voice")
|
| 36 |
async def voice_ws(ws: WebSocket) -> None:
|
| 37 |
await ws.accept()
|
|
|
|
| 1 |
import json
|
| 2 |
import logging
|
| 3 |
import uvicorn
|
| 4 |
+
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, File, UploadFile, HTTPException
|
| 5 |
+
from fastapi.responses import JSONResponse, StreamingResponse
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
from src.pipeline import EchoPipeline
|
| 8 |
+
from src.config import DEEPGRAM_API_KEY, CARTESIA_API_KEY, CARTESIA_VOICE_ID, SAMPLE_RATE
|
| 9 |
+
from src.stt.deepgram_rest import transcribe_audio
|
| 10 |
+
from src.tts.cartesia_client import synthesize_stream
|
| 11 |
|
| 12 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
|
| 13 |
logger = logging.getLogger(__name__)
|
|
|
|
| 35 |
return JSONResponse(status_code=200 if all_ready else 503, content=body)
|
| 36 |
|
| 37 |
|
| 38 |
+
class TTSRequest(BaseModel):
    """Request body accepted by the ``POST /tts`` endpoint."""

    # Text to synthesize; the endpoint rejects values that are blank after stripping.
    text: str
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@app.post("/stt")
async def speech_to_text(audio: UploadFile = File(...)) -> JSONResponse:
    """Transcribe an uploaded audio file and return the result as JSON.

    Args:
        audio: Multipart file upload holding the audio to transcribe.

    Returns:
        A JSON response whose content is the value returned by
        ``transcribe_audio``.

    Raises:
        HTTPException: 400 when the uploaded file is empty; 502 when the
            transcription call raises.
    """
    data = await audio.read()
    if not data:
        raise HTTPException(status_code=400, detail="Audio file is empty.")

    # Fall back to WAV when the upload carries no content type.
    mimetype = audio.content_type or "audio/wav"

    try:
        result = await transcribe_audio(data, mimetype=mimetype)
    except Exception as exc:
        # Endpoint boundary: keep the traceback in the logs and report a
        # gateway-style failure to the client instead of an opaque 500.
        logger.exception("STT transcription failed")
        raise HTTPException(
            status_code=502,
            detail="Transcription service failed.",
        ) from exc

    return JSONResponse(content=result)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@app.post("/tts")
async def text_to_speech(req: TTSRequest) -> StreamingResponse:
    """Stream synthesized speech for the text carried in the request body.

    Args:
        req: Parsed request body; ``req.text`` is the text to synthesize.

    Returns:
        A streaming response fed by ``synthesize_stream(req.text)``, served
        as ``audio/pcm`` with ``X-Sample-Rate``, ``X-Encoding`` and
        ``X-Channels`` headers describing the stream.

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only.
    """
    has_content = bool(req.text.strip())
    if not has_content:
        raise HTTPException(status_code=400, detail="text must not be empty.")

    # The original (unstripped) text is what gets synthesized.
    audio_stream = synthesize_stream(req.text)
    audio_headers = {
        "X-Sample-Rate": str(SAMPLE_RATE),
        "X-Encoding": "pcm_s16le",
        "X-Channels": "1",
    }
    return StreamingResponse(
        audio_stream,
        media_type="audio/pcm",
        headers=audio_headers,
    )
|
| 65 |
+
|
| 66 |
+
|
| 67 |
@app.websocket("/ws/voice")
|
| 68 |
async def voice_ws(ws: WebSocket) -> None:
|
| 69 |
await ws.accept()
|
pyproject.toml
CHANGED
|
@@ -11,6 +11,7 @@ dependencies = [
|
|
| 11 |
"assemblyai==0.33.0",
|
| 12 |
"cartesia==1.3.1",
|
| 13 |
"deepgram-sdk>=6.1.1",
|
|
|
|
| 14 |
]
|
| 15 |
|
| 16 |
[project.optional-dependencies]
|
|
|
|
| 11 |
"assemblyai==0.33.0",
|
| 12 |
"cartesia==1.3.1",
|
| 13 |
"deepgram-sdk>=6.1.1",
|
| 14 |
+
"python-multipart>=0.0.26",
|
| 15 |
]
|
| 16 |
|
| 17 |
[project.optional-dependencies]
|
src/stt/deepgram_rest.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from deepgram import AsyncDeepgramClient
|
| 3 |
+
from src.config import DEEPGRAM_API_KEY, DEEPGRAM_LANGUAGE
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger(__name__)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
async def transcribe_audio(data: bytes, mimetype: str = "audio/wav") -> dict:
    """Transcribe a complete audio payload with Deepgram's pre-recorded API.

    Args:
        data: Raw bytes of the audio file to transcribe.
        mimetype: MIME type of the audio. NOTE(review): this value is
            accepted but not forwarded to Deepgram anywhere in this
            function -- confirm whether it should be sent with the request.

    Returns:
        A dict with three keys:
            ``"text"``: transcript of the first alternative of the first
                channel, or ``""`` when the response carries none.
            ``"language"``: the configured ``DEEPGRAM_LANGUAGE``.
            ``"duration"``: ``response.metadata.duration`` (seconds), or
                ``None`` when the response exposes no such value.
    """
    client = AsyncDeepgramClient(api_key=DEEPGRAM_API_KEY)
    response = await client.listen.v1.media.transcribe_file(
        request=data,
        model="nova-2",
        language=DEEPGRAM_LANGUAGE,
        smart_format=True,
    )

    try:
        transcript = response.results.channels[0].alternatives[0].transcript
    except (AttributeError, IndexError, TypeError):
        # TypeError covers ``channels`` / ``alternatives`` being None, which
        # fails on subscripting rather than on attribute access.
        transcript = ""
    # A present-but-None transcript would break the slice in the log call
    # below and leak ``None`` into the "text" field; normalize it to "".
    transcript = transcript or ""

    # Missing metadata (absent attribute or None) yields a None duration.
    duration = getattr(getattr(response, "metadata", None), "duration", None)

    logger.info("STT transcript (%ss): %s", duration, transcript[:80])
    return {"text": transcript, "language": DEEPGRAM_LANGUAGE, "duration": duration}
|
uv.lock
CHANGED
|
@@ -876,6 +876,15 @@ wheels = [
|
|
| 876 |
{ url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863, upload-time = "2024-01-23T06:32:58.246Z" },
|
| 877 |
]
|
| 878 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 879 |
[[package]]
|
| 880 |
name = "pyyaml"
|
| 881 |
version = "6.0.3"
|
|
@@ -1070,6 +1079,7 @@ dependencies = [
|
|
| 1070 |
{ name = "fastapi" },
|
| 1071 |
{ name = "httpx" },
|
| 1072 |
{ name = "python-dotenv" },
|
|
|
|
| 1073 |
{ name = "uvicorn", extra = ["standard"] },
|
| 1074 |
{ name = "websockets" },
|
| 1075 |
]
|
|
@@ -1087,6 +1097,7 @@ requires-dist = [
|
|
| 1087 |
{ name = "fastapi", specifier = "==0.115.0" },
|
| 1088 |
{ name = "httpx", specifier = "==0.27.2" },
|
| 1089 |
{ name = "python-dotenv", specifier = "==1.0.1" },
|
|
|
|
| 1090 |
{ name = "uvicorn", extras = ["standard"], specifier = "==0.30.6" },
|
| 1091 |
{ name = "websockets", specifier = "==13.1" },
|
| 1092 |
{ name = "websockets", marker = "extra == 'dev'", specifier = "==13.1" },
|
|
|
|
| 876 |
{ url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863, upload-time = "2024-01-23T06:32:58.246Z" },
|
| 877 |
]
|
| 878 |
|
| 879 |
+
[[package]]
|
| 880 |
+
name = "python-multipart"
|
| 881 |
+
version = "0.0.26"
|
| 882 |
+
source = { registry = "https://pypi.org/simple" }
|
| 883 |
+
sdist = { url = "https://files.pythonhosted.org/packages/88/71/b145a380824a960ebd60e1014256dbb7d2253f2316ff2d73dfd8928ec2c3/python_multipart-0.0.26.tar.gz", hash = "sha256:08fadc45918cd615e26846437f50c5d6d23304da32c341f289a617127b081f17", size = 43501, upload-time = "2026-04-10T14:09:59.473Z" }
|
| 884 |
+
wheels = [
|
| 885 |
+
{ url = "https://files.pythonhosted.org/packages/9a/22/f1925cdda983ab66fc8ec6ec8014b959262747e58bdca26a4e3d1da29d56/python_multipart-0.0.26-py3-none-any.whl", hash = "sha256:c0b169f8c4484c13b0dcf2ef0ec3a4adb255c4b7d18d8e420477d2b1dd03f185", size = 28847, upload-time = "2026-04-10T14:09:58.131Z" },
|
| 886 |
+
]
|
| 887 |
+
|
| 888 |
[[package]]
|
| 889 |
name = "pyyaml"
|
| 890 |
version = "6.0.3"
|
|
|
|
| 1079 |
{ name = "fastapi" },
|
| 1080 |
{ name = "httpx" },
|
| 1081 |
{ name = "python-dotenv" },
|
| 1082 |
+
{ name = "python-multipart" },
|
| 1083 |
{ name = "uvicorn", extra = ["standard"] },
|
| 1084 |
{ name = "websockets" },
|
| 1085 |
]
|
|
|
|
| 1097 |
{ name = "fastapi", specifier = "==0.115.0" },
|
| 1098 |
{ name = "httpx", specifier = "==0.27.2" },
|
| 1099 |
{ name = "python-dotenv", specifier = "==1.0.1" },
|
| 1100 |
+
{ name = "python-multipart", specifier = ">=0.0.26" },
|
| 1101 |
{ name = "uvicorn", extras = ["standard"], specifier = "==0.30.6" },
|
| 1102 |
{ name = "websockets", specifier = "==13.1" },
|
| 1103 |
{ name = "websockets", marker = "extra == 'dev'", specifier = "==13.1" },
|