"""FastAPI proxy for OpenAI Realtime ephemeral tokens and speech-to-text.

Endpoints:
  GET/POST /ephemeral  — mint a short-lived Realtime session token.
  POST     /transcribe — proxy multipart audio to OpenAI transcription.
  GET      /health     — liveness probe.
  *        /…          — catch-all that also mints a token (for clients
                         that strip paths), with CORS preflight handling.
"""
import os
from typing import Optional

import requests
from fastapi import FastAPI, File, Form, Request, Response, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

app = FastAPI()

# CORS: keep wide for dev; you can restrict origins later.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# rejected by browsers per the CORS spec (wildcard origin cannot be credentialed).
# Starlette tolerates it, but confirm whether credentials are actually needed;
# if so, list explicit origins instead of "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS", "HEAD"],
    allow_headers=["*"],
    expose_headers=["*"],
)

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
REALTIME_SESSION_URL = "https://api.openai.com/v1/realtime/sessions"
AUDIO_TRANSCRIBE_URL = "https://api.openai.com/v1/audio/transcriptions"

# defaults (you can tune these)
DEFAULT_REALTIME_MODEL = "gpt-realtime"
DEFAULT_VOICE = "verse"
# New STT defaults — fast + cheap; switch to gpt-4o-transcribe for peak accuracy
DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"


# ---------- helpers ----------
def _json_err(msg: str, code: int = 500) -> JSONResponse:
    """Return a ``{"error": msg}`` JSON response with permissive CORS headers."""
    return JSONResponse(
        status_code=code,
        content={"error": msg},
        headers={
            "Access-Control-Allow-Origin": "*",
            "Content-Type": "application/json",
        },
    )


def _auth_headers(beta_realtime: bool = False) -> dict:
    """Build Authorization headers for OpenAI calls.

    Args:
        beta_realtime: when True, add the ``OpenAI-Beta: realtime=v1`` header
            required for Realtime session creation.
    """
    h = {"Authorization": f"Bearer {OPENAI_API_KEY}"}
    if beta_realtime:
        # required for Realtime session creation
        h["OpenAI-Beta"] = "realtime=v1"
    return h


# ---------- health ----------
@app.get("/health")
@app.get("/health/")
def health():
    """Liveness probe; always returns ``{"status": "ok"}``."""
    return JSONResponse({"status": "ok"}, headers={"Access-Control-Allow-Origin": "*"})


# ---------- realtime ephemeral ----------
def mint_ephemeral(model: str = DEFAULT_REALTIME_MODEL, voice: str = DEFAULT_VOICE):
    """Create a Realtime session at OpenAI and return its JSON (ephemeral token).

    Returns a 500 JSON error if the API key is missing or the upstream call
    fails for any reason (network error, non-2xx status, bad JSON).
    """
    if not OPENAI_API_KEY:
        return _json_err("OPENAI_API_KEY not set in environment", 500)
    try:
        r = requests.post(
            REALTIME_SESSION_URL,
            headers={
                **_auth_headers(beta_realtime=True),
                "Content-Type": "application/json",
            },
            json={"model": model, "voice": voice},
            timeout=15,
        )
        r.raise_for_status()
        return JSONResponse(
            status_code=200,
            content=r.json(),
            headers={
                "Access-Control-Allow-Origin": "*",
                "Content-Type": "application/json",
            },
        )
    except Exception as e:
        # Boundary handler: surface any upstream failure as a JSON error
        # rather than a 500 traceback.
        return _json_err(str(e), 500)


@app.get("/ephemeral")
@app.get("/ephemeral/")
def ephemeral_get(model: str = DEFAULT_REALTIME_MODEL, voice: str = DEFAULT_VOICE):
    """GET variant: model/voice come from query params (with defaults)."""
    return mint_ephemeral(model, voice)


@app.post("/ephemeral")
@app.post("/ephemeral/")
async def ephemeral_post(request: Request):
    """POST variant: model/voice come from a JSON body; fall back to defaults
    when the body is absent or not valid JSON."""
    try:
        data = await request.json()
        model = data.get("model", DEFAULT_REALTIME_MODEL)
        voice = data.get("voice", DEFAULT_VOICE)
    except Exception:
        model, voice = DEFAULT_REALTIME_MODEL, DEFAULT_VOICE
    return mint_ephemeral(model, voice)


# Catch-all (helps when callers accidentally hit "/" with signed proxy params).
# Registered after the specific routes, so /health, /ephemeral and /transcribe
# still win on exact match.
@app.api_route("/", methods=["GET", "POST", "OPTIONS", "HEAD"])
@app.api_route("/{_path:path}", methods=["GET", "POST", "OPTIONS", "HEAD"])
async def catch_all(request: Request, _path: str = ""):
    """Serve an ephemeral token everywhere except the explicit /transcribe path."""
    if request.url.path.startswith("/transcribe"):
        return JSONResponse(
            {"error": "use POST /transcribe for audio"},
            status_code=405,
            headers={"Access-Control-Allow-Origin": "*"},
        )
    if request.method == "OPTIONS":
        # Manual CORS preflight for paths not covered by the middleware route map.
        return Response(
            status_code=204,
            headers={
                "Access-Control-Allow-Origin": "*",
                "Access-Control-Allow-Methods": "GET, POST, OPTIONS, HEAD",
                "Access-Control-Allow-Headers": "*",
            },
        )
    # default: mint realtime token (handy for clients that strip paths)
    return mint_ephemeral(DEFAULT_REALTIME_MODEL, DEFAULT_VOICE)


# ---------- NEW: speech-to-text via OpenAI ----------
@app.post("/transcribe")
async def transcribe(
    file: UploadFile = File(..., description="Audio file (wav/mp3/m4a/webm/ogg)"),
    model: str = Form(DEFAULT_STT_MODEL),
    language: Optional[str] = Form(None),
    response_format: str = Form("json"),  # "json" | "text" | "srt" | "verbose_json" | "vtt" (model dependent)
):
    """
    Proxy to OpenAI audio/transcriptions.
    - Default model: gpt-4o-mini-transcribe (fast). Use gpt-4o-transcribe for max accuracy.
    - Send multipart/form-data with 'file' plus optional fields.
    """
    if not OPENAI_API_KEY:
        return _json_err("OPENAI_API_KEY not set in environment", 500)
    try:
        # read bytes once
        data_bytes = await file.read()
        files = {
            "file": (
                file.filename or "audio",
                data_bytes,
                file.content_type or "application/octet-stream",
            )
        }
        form = {"model": model}
        if language:
            form["language"] = language
        if response_format:
            form["response_format"] = response_format

        r = requests.post(
            AUDIO_TRANSCRIBE_URL,
            headers=_auth_headers(),
            files=files,
            data=form,  # multipart form fields
            timeout=60,
        )

        # If text/plain was requested, forward as text response.
        # FIX: previously the upstream body was forwarded with HTTP 200 even
        # when OpenAI returned an error status; surface failures as JSON errors.
        if response_format == "text":
            if not r.ok:
                return _json_err(
                    r.text or "Transcription failed",
                    r.status_code if r.status_code else 500,
                )
            return Response(
                content=r.text,
                media_type="text/plain",
                headers={"Access-Control-Allow-Origin": "*"},
            )

        # Otherwise assume JSON or SRT/VTT handled as text but wrapped in JSON for consistency
        try:
            r.raise_for_status()
            ct = r.headers.get("content-type", "")
            if "application/json" in ct:
                return JSONResponse(
                    r.json(), headers={"Access-Control-Allow-Origin": "*"}
                )
            # Non-JSON payloads (srt, vtt) — wrap as {text: "..."}
            return JSONResponse(
                {"text": r.text}, headers={"Access-Control-Allow-Origin": "*"}
            )
        except Exception:
            # bubble up OpenAI's error text
            return _json_err(
                r.text or "Transcription failed",
                r.status_code if r.status_code else 500,
            )
    except Exception as e:
        return _json_err(str(e), 500)