from __future__ import annotations import os import tempfile from fastapi import FastAPI, File, Form, HTTPException, UploadFile try: from .config import VoiceRuntimeConfig from .service import process_diarization_only, process_voice except ImportError: # HF flat-root execution fallback from config import VoiceRuntimeConfig from service import process_diarization_only, process_voice app = FastAPI(title="Voice Intelligence Module", version="1.0.0") _ALLOWED = {".wav", ".mp3", ".m4a", ".ogg", ".flac", ".aac", ".mp4", ".mov", ".mkv", ".webm"} def _save_upload_temp(upload: UploadFile) -> str: filename = upload.filename or "input_audio" ext = os.path.splitext(filename)[1].lower() if ext not in _ALLOWED: allowed = ", ".join(sorted(_ALLOWED)) raise HTTPException(status_code=400, detail=f"Unsupported input extension: {ext or 'unknown'}. Allowed: {allowed}") fd, tmp_path = tempfile.mkstemp(prefix="voice-intel-upload-", suffix=ext) os.close(fd) with open(tmp_path, "wb") as f: while True: chunk = upload.file.read(1024 * 1024) if not chunk: break f.write(chunk) return tmp_path @app.get("/health") def health() -> dict: return {"ok": True, "module": "voice-intelligence"} @app.post("/v1/voice/align/trimmed") def align_trimmed( file: UploadFile = File(...), language_hint: str = Form("auto"), silence_threshold_db: float = Form(-40.0), min_silence_sec: float = Form(0.30), keep_padding_sec: float = Form(0.05), analysis_window_ms: int = Form(10), diarization_enabled: str = Form("true"), diarization_min_speakers: int = Form(0), diarization_max_speakers: int = Form(0), ) -> dict: tmp_path = _save_upload_temp(file) config = VoiceRuntimeConfig.from_env() config.silence_threshold_db = silence_threshold_db config.min_silence_sec = min_silence_sec config.keep_padding_sec = keep_padding_sec config.analysis_window_ms = analysis_window_ms config.diarization_enabled = diarization_enabled.strip().lower() in ("1", "true", "yes", "on") config.diarization_min_speakers = diarization_min_speakers config.diarization_max_speakers = diarization_max_speakers try: response = process_voice( input_audio_path=tmp_path, config=config, language_hint=language_hint, trim_silence_enabled=True, include_audio_payload=True, minimal_output=False, ) response["endpoint"] = "/v1/voice/align/trimmed" response["description"] = "Returns word-level data and silence-removed WAV payload." return response except HTTPException: raise except Exception as exc: raise HTTPException(status_code=500, detail=f"trimmed endpoint failed: {exc}") from exc finally: try: os.remove(tmp_path) except OSError: pass @app.post("/v1/voice/align/raw") def align_raw( file: UploadFile = File(...), language_hint: str = Form("auto"), diarization_enabled: str = Form("true"), diarization_min_speakers: int = Form(0), diarization_max_speakers: int = Form(0), ) -> dict: tmp_path = _save_upload_temp(file) config = VoiceRuntimeConfig.from_env() config.diarization_enabled = diarization_enabled.strip().lower() in ("1", "true", "yes", "on") config.diarization_min_speakers = diarization_min_speakers config.diarization_max_speakers = diarization_max_speakers try: response = process_voice( input_audio_path=tmp_path, config=config, language_hint=language_hint, trim_silence_enabled=False, include_audio_payload=False, minimal_output=True, ) response["endpoint"] = "/v1/voice/align/raw" response["description"] = "Returns only required alignment data; input audio remains unchanged." return response except HTTPException: raise except Exception as exc: raise HTTPException(status_code=500, detail=f"raw endpoint failed: {exc}") from exc finally: try: os.remove(tmp_path) except OSError: pass @app.post("/v1/voice/diarization") def diarization_only( file: UploadFile = File(...), diarization_enabled: str = Form("true"), diarization_min_speakers: int = Form(0), diarization_max_speakers: int = Form(0), ) -> dict: tmp_path = _save_upload_temp(file) config = VoiceRuntimeConfig.from_env() config.diarization_enabled = diarization_enabled.strip().lower() in ("1", "true", "yes", "on") config.diarization_min_speakers = diarization_min_speakers config.diarization_max_speakers = diarization_max_speakers try: response = process_diarization_only( input_audio_path=tmp_path, config=config, ) response["endpoint"] = "/v1/voice/diarization" response["description"] = "Returns diarization only (speaker segments + summary), no transcription payload." return response except HTTPException: raise except Exception as exc: raise HTTPException(status_code=500, detail=f"diarization endpoint failed: {exc}") from exc finally: try: os.remove(tmp_path) except OSError: pass if __name__ == "__main__": import uvicorn port = int(os.environ.get("PORT", "8091")) uvicorn.run("core.voice_intelligence.api:app", host="0.0.0.0", port=port, reload=False)