ishaq101 committed on
Commit
aebb7d4
·
1 Parent(s): dd0dc33

[NOTICKET] feat: endpoint stt and tts

Browse files
Files changed (4) hide show
  1. main.py +35 -3
  2. pyproject.toml +1 -0
  3. src/stt/deepgram_rest.py +32 -0
  4. uv.lock +11 -0
main.py CHANGED
@@ -1,10 +1,13 @@
1
  import json
2
  import logging
3
  import uvicorn
4
- from fastapi import FastAPI, WebSocket, WebSocketDisconnect
5
- from fastapi.responses import JSONResponse
 
6
  from src.pipeline import EchoPipeline
7
- from src.config import DEEPGRAM_API_KEY, CARTESIA_API_KEY, CARTESIA_VOICE_ID
 
 
8
 
9
  logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
10
  logger = logging.getLogger(__name__)
@@ -32,6 +35,35 @@ async def health() -> JSONResponse:
32
  return JSONResponse(status_code=200 if all_ready else 503, content=body)
33
 
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  @app.websocket("/ws/voice")
36
  async def voice_ws(ws: WebSocket) -> None:
37
  await ws.accept()
 
1
  import json
2
  import logging
3
  import uvicorn
4
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect, File, UploadFile, HTTPException
5
+ from fastapi.responses import JSONResponse, StreamingResponse
6
+ from pydantic import BaseModel
7
  from src.pipeline import EchoPipeline
8
+ from src.config import DEEPGRAM_API_KEY, CARTESIA_API_KEY, CARTESIA_VOICE_ID, SAMPLE_RATE
9
+ from src.stt.deepgram_rest import transcribe_audio
10
+ from src.tts.cartesia_client import synthesize_stream
11
 
12
  logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
13
  logger = logging.getLogger(__name__)
 
35
  return JSONResponse(status_code=200 if all_ready else 503, content=body)
36
 
37
 
38
class TTSRequest(BaseModel):
    """JSON request body accepted by the POST /tts endpoint."""

    # Text to be synthesized into speech.
    text: str
40
+
41
+
42
@app.post("/stt")
async def speech_to_text(audio: UploadFile = File(...)) -> JSONResponse:
    """Transcribe an uploaded audio file and return the result as JSON.

    Args:
        audio: Multipart file upload containing the audio to transcribe.

    Returns:
        A JSON response whose content is the dict produced by
        ``transcribe_audio``.

    Raises:
        HTTPException: 400 when the uploaded file has no bytes; 502 when the
            transcription call raises.
    """
    data = await audio.read()
    if not data:
        raise HTTPException(status_code=400, detail="Audio file is empty.")

    # Clients do not always send a content type; default to WAV in that case.
    mimetype = audio.content_type or "audio/wav"

    try:
        result = await transcribe_audio(data, mimetype=mimetype)
    except Exception as exc:
        # Endpoint boundary: record the full traceback server-side and report
        # an upstream failure instead of leaking an unexplained 500.
        logger.exception("STT transcription failed")
        raise HTTPException(
            status_code=502,
            detail="Speech-to-text provider request failed.",
        ) from exc

    return JSONResponse(content=result)
50
+
51
+
52
@app.post("/tts")
async def text_to_speech(req: TTSRequest) -> StreamingResponse:
    """Stream synthesized speech for the text in the request body.

    Args:
        req: Request body carrying the text to synthesize.

    Returns:
        A streaming response with media type ``audio/pcm`` that wraps
        ``synthesize_stream(req.text)``. The response headers advertise the
        sample rate, encoding and channel count.
        NOTE(review): the advertised format is assumed to match what
        ``synthesize_stream`` actually yields - confirm against that helper.

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only.
    """
    if req.text.strip() == "":
        raise HTTPException(status_code=400, detail="text must not be empty.")

    audio_chunks = synthesize_stream(req.text)
    format_headers = {
        "X-Sample-Rate": str(SAMPLE_RATE),
        "X-Encoding": "pcm_s16le",
        "X-Channels": "1",
    }
    return StreamingResponse(
        audio_chunks,
        media_type="audio/pcm",
        headers=format_headers,
    )
65
+
66
+
67
  @app.websocket("/ws/voice")
68
  async def voice_ws(ws: WebSocket) -> None:
69
  await ws.accept()
pyproject.toml CHANGED
@@ -11,6 +11,7 @@ dependencies = [
11
  "assemblyai==0.33.0",
12
  "cartesia==1.3.1",
13
  "deepgram-sdk>=6.1.1",
 
14
  ]
15
 
16
  [project.optional-dependencies]
 
11
  "assemblyai==0.33.0",
12
  "cartesia==1.3.1",
13
  "deepgram-sdk>=6.1.1",
14
+ "python-multipart>=0.0.26",
15
  ]
16
 
17
  [project.optional-dependencies]
src/stt/deepgram_rest.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from deepgram import AsyncDeepgramClient
3
+ from src.config import DEEPGRAM_API_KEY, DEEPGRAM_LANGUAGE
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+
8
async def transcribe_audio(data: bytes, mimetype: str = "audio/wav") -> dict:
    """Transcribe a complete audio file using Deepgram's pre-recorded API.

    Args:
        data: Raw bytes of the audio file to transcribe.
        mimetype: MIME type of the audio as reported by the caller.
            NOTE(review): this value is currently not forwarded to Deepgram;
            it is kept for interface compatibility - confirm whether the SDK
            call should receive it.

    Returns:
        A dict with three keys:
            "text": the transcript of the first alternative of the first
                channel, or "" when the response carries none.
            "language": the configured ``DEEPGRAM_LANGUAGE``.
            "duration": ``response.metadata.duration`` when present,
                otherwise ``None``.
    """
    client = AsyncDeepgramClient(api_key=DEEPGRAM_API_KEY)
    response = await client.listen.v1.media.transcribe_file(
        request=data,
        model="nova-2",
        language=DEEPGRAM_LANGUAGE,
        smart_format=True,
    )

    try:
        transcript = response.results.channels[0].alternatives[0].transcript
    except (AttributeError, IndexError, TypeError):
        # TypeError covers subscripting a None container (e.g. channels is
        # None), which would otherwise escape this best-effort extraction.
        transcript = ""
    # A None transcript would break the slice in the log call below and put
    # a null into the "text" field; normalize it to an empty string.
    transcript = transcript or ""

    # Missing metadata (absent attribute or None) yields a None duration.
    duration = getattr(getattr(response, "metadata", None), "duration", None)

    logger.info("STT transcript (%ss): %s", duration, transcript[:80])
    return {"text": transcript, "language": DEEPGRAM_LANGUAGE, "duration": duration}
uv.lock CHANGED
@@ -876,6 +876,15 @@ wheels = [
876
  { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863, upload-time = "2024-01-23T06:32:58.246Z" },
877
  ]
878
 
 
 
 
 
 
 
 
 
 
879
  [[package]]
880
  name = "pyyaml"
881
  version = "6.0.3"
@@ -1070,6 +1079,7 @@ dependencies = [
1070
  { name = "fastapi" },
1071
  { name = "httpx" },
1072
  { name = "python-dotenv" },
 
1073
  { name = "uvicorn", extra = ["standard"] },
1074
  { name = "websockets" },
1075
  ]
@@ -1087,6 +1097,7 @@ requires-dist = [
1087
  { name = "fastapi", specifier = "==0.115.0" },
1088
  { name = "httpx", specifier = "==0.27.2" },
1089
  { name = "python-dotenv", specifier = "==1.0.1" },
 
1090
  { name = "uvicorn", extras = ["standard"], specifier = "==0.30.6" },
1091
  { name = "websockets", specifier = "==13.1" },
1092
  { name = "websockets", marker = "extra == 'dev'", specifier = "==13.1" },
 
876
  { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863, upload-time = "2024-01-23T06:32:58.246Z" },
877
  ]
878
 
879
+ [[package]]
880
+ name = "python-multipart"
881
+ version = "0.0.26"
882
+ source = { registry = "https://pypi.org/simple" }
883
+ sdist = { url = "https://files.pythonhosted.org/packages/88/71/b145a380824a960ebd60e1014256dbb7d2253f2316ff2d73dfd8928ec2c3/python_multipart-0.0.26.tar.gz", hash = "sha256:08fadc45918cd615e26846437f50c5d6d23304da32c341f289a617127b081f17", size = 43501, upload-time = "2026-04-10T14:09:59.473Z" }
884
+ wheels = [
885
+ { url = "https://files.pythonhosted.org/packages/9a/22/f1925cdda983ab66fc8ec6ec8014b959262747e58bdca26a4e3d1da29d56/python_multipart-0.0.26-py3-none-any.whl", hash = "sha256:c0b169f8c4484c13b0dcf2ef0ec3a4adb255c4b7d18d8e420477d2b1dd03f185", size = 28847, upload-time = "2026-04-10T14:09:58.131Z" },
886
+ ]
887
+
888
  [[package]]
889
  name = "pyyaml"
890
  version = "6.0.3"
 
1079
  { name = "fastapi" },
1080
  { name = "httpx" },
1081
  { name = "python-dotenv" },
1082
+ { name = "python-multipart" },
1083
  { name = "uvicorn", extra = ["standard"] },
1084
  { name = "websockets" },
1085
  ]
 
1097
  { name = "fastapi", specifier = "==0.115.0" },
1098
  { name = "httpx", specifier = "==0.27.2" },
1099
  { name = "python-dotenv", specifier = "==1.0.1" },
1100
+ { name = "python-multipart", specifier = ">=0.0.26" },
1101
  { name = "uvicorn", extras = ["standard"], specifier = "==0.30.6" },
1102
  { name = "websockets", specifier = "==13.1" },
1103
  { name = "websockets", marker = "extra == 'dev'", specifier = "==13.1" },