Spaces:

1337XCode
/

personabot-api

Running

App Files Community

GitHub Actions committed 29 days ago

Commit

4a9ec15

1 Parent(s): e651eb1

Deploy 92e36db

Browse files

Files changed (11) hide show

app/api/transcribe.py +5 -4
app/api/tts.py +10 -6
app/core/config.py +8 -0
app/main.py +2 -0
app/security/rate_limiter.py +6 -0
app/services/transcriber.py +16 -14
app/services/tts_client.py +41 -2
requirements.txt +3 -1
test_stream.py +28 -0
tests/test_speech_endpoints.py +8 -7
tests/test_transcriber_normalization.py +17 -3

app/api/transcribe.py CHANGED Viewed

@@ -4,6 +4,7 @@ from fastapi import APIRouter, Depends, File, Form, HTTPException, Request, Uplo
 from app.models.speech import TranscribeResponse
 from app.security.jwt_auth import verify_jwt
 router = APIRouter()
@@ -22,6 +23,7 @@ _ALLOWED_AUDIO_TYPES: frozenset[str] = frozenset(
 @router.post("")
 async def transcribe_endpoint(
     request: Request,
     audio: Annotated[UploadFile, File(...)],
@@ -44,14 +46,13 @@ async def transcribe_endpoint(
             detail="Unsupported audio format.",
         )
-    audio_bytes = await audio.read()
-    if not audio_bytes:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="Audio file is empty.",
         )
-    if len(audio_bytes) > settings.TRANSCRIBE_MAX_UPLOAD_BYTES:
         raise HTTPException(
             status_code=status.HTTP_413_CONTENT_TOO_LARGE,
             detail="Audio file exceeds maximum allowed size.",
@@ -67,7 +68,7 @@ async def transcribe_endpoint(
     transcript = await transcriber.transcribe(
         filename=audio.filename or "audio.webm",
         content_type=content_type,
-        audio_bytes=audio_bytes,
         language=language_code,
     )

 from app.models.speech import TranscribeResponse
 from app.security.jwt_auth import verify_jwt
+from app.security.rate_limiter import transcribe_rate_limit
 router = APIRouter()
 @router.post("")
+@transcribe_rate_limit()
 async def transcribe_endpoint(
     request: Request,
     audio: Annotated[UploadFile, File(...)],
             detail="Unsupported audio format.",
         )
+    if audio.size is None or audio.size == 0:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="Audio file is empty.",
         )
+    if audio.size > settings.TRANSCRIBE_MAX_UPLOAD_BYTES:
         raise HTTPException(
             status_code=status.HTTP_413_CONTENT_TOO_LARGE,
             detail="Audio file exceeds maximum allowed size.",
     transcript = await transcriber.transcribe(
         filename=audio.filename or "audio.webm",
         content_type=content_type,
+        audio_file=audio.file,
         language=language_code,
     )

app/api/tts.py CHANGED Viewed

@@ -1,20 +1,22 @@
 from typing import Annotated
 from fastapi import APIRouter, Depends, HTTPException, Request, status
-from fastapi.responses import Response
 from app.models.speech import SynthesizeRequest
 from app.security.jwt_auth import verify_jwt
 router = APIRouter()
 @router.post("")
 async def synthesize_endpoint(
     request: Request,
     payload: SynthesizeRequest,
     _: Annotated[dict, Depends(verify_jwt)],
-) -> Response:
     tts_client = request.app.state.tts_client
     if not tts_client.is_configured:
         raise HTTPException(
@@ -22,8 +24,10 @@ async def synthesize_endpoint(
             detail="TTS service is not configured.",
         )
-    audio_bytes = await tts_client.synthesize(
-        payload.text.strip(),
-        voice=payload.voice.strip().lower(),
     )
-    return Response(content=audio_bytes, media_type="audio/wav")

 from typing import Annotated
 from fastapi import APIRouter, Depends, HTTPException, Request, status
+from fastapi.responses import StreamingResponse
 from app.models.speech import SynthesizeRequest
 from app.security.jwt_auth import verify_jwt
+from app.security.rate_limiter import tts_rate_limit
 router = APIRouter()
 @router.post("")
+@tts_rate_limit()
 async def synthesize_endpoint(
     request: Request,
     payload: SynthesizeRequest,
     _: Annotated[dict, Depends(verify_jwt)],
+) -> StreamingResponse:
     tts_client = request.app.state.tts_client
     if not tts_client.is_configured:
         raise HTTPException(
             detail="TTS service is not configured.",
         )
+    return StreamingResponse(
+        tts_client.synthesize_stream(
+            payload.text.strip(),
+            voice=payload.voice.strip().lower(),
+        ),
+        media_type="audio/wav"
     )

app/core/config.py CHANGED Viewed

@@ -65,6 +65,14 @@ class Settings(BaseSettings):
     # Speech-to-text upload constraints
     TRANSCRIBE_MAX_UPLOAD_BYTES: int = 2 * 1024 * 1024
     TRANSCRIBE_TIMEOUT_SECONDS: float = 25.0
     model_config = SettingsConfigDict(env_file=".env", extra="ignore")

     # Speech-to-text upload constraints
     TRANSCRIBE_MAX_UPLOAD_BYTES: int = 2 * 1024 * 1024
     TRANSCRIBE_TIMEOUT_SECONDS: float = 25.0
+    TRANSCRIBE_DEFAULT_LANGUAGE: str = "en"
+    TRANSCRIBE_REPLACEMENTS: dict[str, str] = {
+        r"\bwalk experience\b": "work experience",
+        r"\btext stack\b": "tech stack",
+        r"\bprofessional sitting\b": "professional setting",
+        r"\btech stocks\b": "tech stack",
+        r"\bwhat tech stack does he\s+used\b": "what tech stack does he use",
+    }
     model_config = SettingsConfigDict(env_file=".env", extra="ignore")

app/main.py CHANGED Viewed

@@ -144,6 +144,8 @@ async def lifespan(app: FastAPI):
         api_key=settings.GROQ_API_KEY or "",
         model=settings.GROQ_TRANSCRIBE_MODEL,
         timeout_seconds=settings.TRANSCRIBE_TIMEOUT_SECONDS,
     )
     app.state.tts_client = TTSClient(
         tts_space_url=settings.TTS_SPACE_URL,

         api_key=settings.GROQ_API_KEY or "",
         model=settings.GROQ_TRANSCRIBE_MODEL,
         timeout_seconds=settings.TRANSCRIBE_TIMEOUT_SECONDS,
+        default_language=settings.TRANSCRIBE_DEFAULT_LANGUAGE,
+        replacements=settings.TRANSCRIBE_REPLACEMENTS,
     )
     app.state.tts_client = TTSClient(
         tts_space_url=settings.TTS_SPACE_URL,

app/security/rate_limiter.py CHANGED Viewed

@@ -18,3 +18,9 @@ async def custom_rate_limit_handler(request: Request, exc: Exception) -> JSONRes
 # Decorator factory chat_rate_limit that applies 20/minute limit.
 def chat_rate_limit() -> Callable:
     return limiter.limit("20/minute")

 # Decorator factory chat_rate_limit that applies 20/minute limit.
 def chat_rate_limit() -> Callable:
     return limiter.limit("20/minute")
+def tts_rate_limit() -> Callable:
+    return limiter.limit("10/minute")
+def transcribe_rate_limit() -> Callable:
+    return limiter.limit("20/minute")

app/services/transcriber.py CHANGED Viewed

@@ -9,19 +9,12 @@ from app.core.exceptions import GenerationError
 _FILLER_PREFIX_RE = re.compile(r"^\s*(uh+|um+|erm+|like|you know|please|hey)\s+", re.IGNORECASE)
 _MULTISPACE_RE = re.compile(r"\s+")
-_TRANSCRIPT_REPLACEMENTS: tuple[tuple[re.Pattern[str], str], ...] = (
-    (re.compile(r"\bwalk experience\b", re.IGNORECASE), "work experience"),
-    (re.compile(r"\btext stack\b", re.IGNORECASE), "tech stack"),
-    (re.compile(r"\bprofessional sitting\b", re.IGNORECASE), "professional setting"),
-    (re.compile(r"\btech stocks\b", re.IGNORECASE), "tech stack"),
-    (re.compile(r"\bwhat tech stack does he\s+used\b", re.IGNORECASE), "what tech stack does he use"),
-)
-def _normalise_transcript_text(text: str) -> str:
     cleaned = text.strip()
     cleaned = _FILLER_PREFIX_RE.sub("", cleaned)
-    for pattern, replacement in _TRANSCRIPT_REPLACEMENTS:
         cleaned = pattern.sub(replacement, cleaned)
     cleaned = _MULTISPACE_RE.sub(" ", cleaned)
     return cleaned.strip()
@@ -33,10 +26,17 @@ class GroqTranscriber:
         api_key: str,
         model: str,
         timeout_seconds: float,
     ) -> None:
         self._client = AsyncGroq(api_key=api_key) if api_key else None
         self._model = model
         self._timeout_seconds = timeout_seconds
     @property
     def is_configured(self) -> bool:
@@ -51,26 +51,28 @@ class GroqTranscriber:
         self,
         filename: str,
         content_type: str,
-        audio_bytes: bytes,
         language: str | None = None,
     ) -> str:
         if not self._client:
             raise GenerationError("Transcriber is not configured with GROQ_API_KEY")
         async def _call() -> str:
             response = await self._client.audio.transcriptions.create(
-                file=(filename, audio_bytes, content_type),
                 model=self._model,
                 temperature=0,
-                language=language,
             )
             text = getattr(response, "text", None)
             if isinstance(text, str) and text.strip():
-                return _normalise_transcript_text(text)
             if isinstance(response, dict):
                 value = response.get("text")
                 if isinstance(value, str) and value.strip():
-                    return _normalise_transcript_text(value)
             raise GenerationError("Transcription response did not contain text")
         try:

 _FILLER_PREFIX_RE = re.compile(r"^\s*(uh+|um+|erm+|like|you know|please|hey)\s+", re.IGNORECASE)
 _MULTISPACE_RE = re.compile(r"\s+")
+def _normalise_transcript_text(text: str, replacements: tuple[tuple[re.Pattern[str], str], ...]) -> str:
     cleaned = text.strip()
     cleaned = _FILLER_PREFIX_RE.sub("", cleaned)
+    for pattern, replacement in replacements:
         cleaned = pattern.sub(replacement, cleaned)
     cleaned = _MULTISPACE_RE.sub(" ", cleaned)
     return cleaned.strip()
         api_key: str,
         model: str,
         timeout_seconds: float,
+        default_language: str = "en",
+        replacements: dict[str, str] | None = None,
     ) -> None:
         self._client = AsyncGroq(api_key=api_key) if api_key else None
         self._model = model
         self._timeout_seconds = timeout_seconds
+        self._default_language = default_language
+        self._replacements = tuple(
+            (re.compile(pattern, re.IGNORECASE), replacement)
+            for pattern, replacement in (replacements or {}).items()
+        )
     @property
     def is_configured(self) -> bool:
         self,
         filename: str,
         content_type: str,
+        audio_file,
         language: str | None = None,
     ) -> str:
         if not self._client:
             raise GenerationError("Transcriber is not configured with GROQ_API_KEY")
+        target_language = language if language else self._default_language
         async def _call() -> str:
             response = await self._client.audio.transcriptions.create(
+                file=(filename, audio_file, content_type),
                 model=self._model,
                 temperature=0,
+                language=target_language,
             )
             text = getattr(response, "text", None)
             if isinstance(text, str) and text.strip():
+                return _normalise_transcript_text(text, self._replacements)
             if isinstance(response, dict):
                 value = response.get("text")
                 if isinstance(value, str) and value.strip():
+                    return _normalise_transcript_text(value, self._replacements)
             raise GenerationError("Transcription response did not contain text")
         try:

app/services/tts_client.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import asyncio
 import httpx
 from app.core.exceptions import GenerationError
@@ -44,3 +42,44 @@ class TTSClient:
             raise
         except Exception as exc:
             raise GenerationError("TTS synthesis failed", context={"error": str(exc)}) from exc

 import asyncio
 import httpx
 from app.core.exceptions import GenerationError
             raise
         except Exception as exc:
             raise GenerationError("TTS synthesis failed", context={"error": str(exc)}) from exc
+    async def synthesize_stream(self, text: str, voice: str = "am_adam"):
+        text = text.strip()
+        if not text:
+            raise GenerationError("TTS request text is empty")
+        loop = asyncio.get_running_loop()
+        queue = asyncio.Queue()
+        def _worker():
+            try:
+                generator = self._pipeline(text, voice=voice, speed=1, split_pattern=r'\n+')
+                for gs, ps, audio in generator:
+                    if audio is not None:
+                        import numpy as np
+                        pcm_audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
+                        loop.call_soon_threadsafe(queue.put_nowait, pcm_audio)
+            except Exception as e:
+                loop.call_soon_threadsafe(queue.put_nowait, e)
+            finally:
+                loop.call_soon_threadsafe(queue.put_nowait, None)
+        import threading
+        thread = threading.Thread(target=_worker)
+        thread.start()
+        import struct
+        # 44-byte WAV header with 0xFFFFFFFF for sizes (streaming)
+        yield struct.pack('<4sI4s4sIHHIIHH4sI',
+            b'RIFF', 0xFFFFFFFF, b'WAVE',
+            b'fmt ', 16, 1, 1, 24000, 48000, 2, 16,
+            b'data', 0xFFFFFFFF
+        )
+        while True:
+            chunk = await queue.get()
+            if chunk is None:
+                break
+            if isinstance(chunk, Exception):
+                raise GenerationError("TTS synthesis stream failed", context={"error": str(chunk)}) from chunk
+            yield chunk

requirements.txt CHANGED Viewed

@@ -26,4 +26,6 @@ google-genai>=1.0.0
 # fastembed: powers BM25 sparse retrieval (Stage 2). Qdrant/bm25 vocabulary
 # downloads ~5 MB on first use then runs fully local — no GPU, no network at query time.
 fastembed>=0.3.6
-toon_format @ git+https://github.com/toon-format/toon-python.git

 # fastembed: powers BM25 sparse retrieval (Stage 2). Qdrant/bm25 vocabulary
 # downloads ~5 MB on first use then runs fully local — no GPU, no network at query time.
 fastembed>=0.3.6
+toon_format @ git+https://github.com/toon-format/toon-python.git
+kokoro>=0.9.0
+soundfile>=0.13.0

test_stream.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import asyncio
+from kokoro import KPipeline
+import numpy as np
+import struct
+import time
+async def main():
+    pipeline = KPipeline(lang_code='a')
+    generator = pipeline("Hello, this is a test of streaming audio from Kokoro.", voice="am_adam", speed=1, split_pattern=r'\n+')
+    with open("test_stream.wav", "wb") as f:
+        # Write WAV header
+        # chunk_size = 36 + data_size
+        header = struct.pack('<4sI4s4sIHHIIHH4sI',
+            b'RIFF', 0xFFFFFFFF, b'WAVE',
+            b'fmt ', 16, 1, 1, 24000, 48000, 2, 16,
+            b'data', 0xFFFFFFFF
+        )
+        f.write(header)
+        for gs, ps, audio in generator:
+            if audio is not None:
+                print("Got chunk:", len(audio))
+                pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
+                f.write(pcm)
+if __name__ == "__main__":
+    asyncio.run(main())

tests/test_speech_endpoints.py CHANGED Viewed

@@ -10,7 +10,7 @@ def test_transcribe_requires_auth(app_client):
 def test_transcribe_success(app_client, valid_token):
-    async def fake_transcribe(filename, content_type, audio_bytes, language=None):
         await asyncio.sleep(0)
         return "hello from voice"
@@ -46,13 +46,13 @@ def test_tts_requires_auth(app_client):
 def test_tts_success(app_client, valid_token):
     captured: dict[str, str] = {}
-    async def fake_synthesize(text, voice="am_adam"):
         await asyncio.sleep(0)
         captured["text"] = text
         captured["voice"] = voice
-        return b"RIFF....fake"
-    app_client.app.state.tts_client.synthesize = fake_synthesize
     response = app_client.post(
         "/tts",
@@ -62,6 +62,7 @@ def test_tts_success(app_client, valid_token):
     assert response.status_code == 200
     assert response.headers.get("content-type", "").startswith("audio/wav")
     assert response.content == b"RIFF....fake"
     assert captured["text"] == "Hello world"
     assert captured["voice"] == "am_adam"
@@ -70,12 +71,12 @@ def test_tts_success(app_client, valid_token):
 def test_tts_uses_provided_voice(app_client, valid_token):
     captured: dict[str, str] = {}
-    async def fake_synthesize(text, voice="am_adam"):
         await asyncio.sleep(0)
         captured["voice"] = voice
-        return b"RIFF....fake"
-    app_client.app.state.tts_client.synthesize = fake_synthesize
     response = app_client.post(
         "/tts",

 def test_transcribe_success(app_client, valid_token):
+    async def fake_transcribe(filename, content_type, audio_file, language=None):
         await asyncio.sleep(0)
         return "hello from voice"
 def test_tts_success(app_client, valid_token):
     captured: dict[str, str] = {}
+    async def fake_synthesize_stream(text, voice="am_adam"):
         await asyncio.sleep(0)
         captured["text"] = text
         captured["voice"] = voice
+        yield b"RIFF....fake"
+    app_client.app.state.tts_client.synthesize_stream = fake_synthesize_stream
     response = app_client.post(
         "/tts",
     assert response.status_code == 200
     assert response.headers.get("content-type", "").startswith("audio/wav")
+    # StreamingResponse returns chunks, so response.content concatenates them
     assert response.content == b"RIFF....fake"
     assert captured["text"] == "Hello world"
     assert captured["voice"] == "am_adam"
 def test_tts_uses_provided_voice(app_client, valid_token):
     captured: dict[str, str] = {}
+    async def fake_synthesize_stream(text, voice="am_adam"):
         await asyncio.sleep(0)
         captured["voice"] = voice
+        yield b"RIFF....fake"
+    app_client.app.state.tts_client.synthesize_stream = fake_synthesize_stream
     response = app_client.post(
         "/tts",

tests/test_transcriber_normalization.py CHANGED Viewed

@@ -1,15 +1,29 @@
 from app.services.transcriber import _normalise_transcript_text
 def test_normalise_walk_experience_to_work_experience() -> None:
     query = "uh what is his walk experience in a professional setting"
-    assert _normalise_transcript_text(query) == "what is his work experience in a professional setting"
 def test_normalise_text_stack_to_tech_stack() -> None:
-    assert _normalise_transcript_text("what text stack does he use") == "what tech stack does he use"
 def test_keeps_clean_transcript_unchanged() -> None:
     original = "What technologies and skills does he work with?"
-    assert _normalise_transcript_text(original) == original

+import re
+from app.core.config import get_settings
 from app.services.transcriber import _normalise_transcript_text
+def _get_test_replacements():
+    replacements = get_settings().TRANSCRIBE_REPLACEMENTS
+    return tuple(
+        (re.compile(pattern, re.IGNORECASE), replacement)
+        for pattern, replacement in replacements.items()
+    )
 def test_normalise_walk_experience_to_work_experience() -> None:
     query = "uh what is his walk experience in a professional setting"
+    replacements = _get_test_replacements()
+    assert _normalise_transcript_text(query, replacements) == "what is his work experience in a professional setting"
 def test_normalise_text_stack_to_tech_stack() -> None:
+    replacements = _get_test_replacements()
+    assert _normalise_transcript_text("what text stack does he use", replacements) == "what tech stack does he use"
 def test_keeps_clean_transcript_unchanged() -> None:
     original = "What technologies and skills does he work with?"
+    replacements = _get_test_replacements()
+    assert _normalise_transcript_text(original, replacements) == original