Spaces:
Sleeping
Sleeping
| import os | |
| import requests | |
| from typing import Optional | |
| from fastapi import FastAPI, Request, UploadFile, File, Form, Response | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.responses import JSONResponse | |
app = FastAPI()

# CORS: keep wide for dev; you can restrict origins later.
# Credentials stay disabled while origins are wildcarded: browsers refuse
# `Access-Control-Allow-Origin: *` on credentialed requests, and FastAPI's
# CORS documentation requires explicit origins/methods/headers whenever
# allow_credentials=True. Nothing visible in this service reads cookies or
# other browser credentials. If you later need them, list the exact origins
# here and switch allow_credentials back on.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["GET", "POST", "OPTIONS", "HEAD"],
    allow_headers=["*"],
    expose_headers=["*"],
)
# Upstream OpenAI endpoints this service proxies to.
REALTIME_SESSION_URL = "https://api.openai.com/v1/realtime/sessions"
AUDIO_TRANSCRIBE_URL = "https://api.openai.com/v1/audio/transcriptions"

# Secret used for every upstream call; read once at import time and left as
# None when the variable is absent (handlers check for that and report it).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Defaults (tune as needed).
DEFAULT_VOICE = "verse"
DEFAULT_REALTIME_MODEL = "gpt-realtime"

# Speech-to-text default: fast + cheap; switch to gpt-4o-transcribe for
# peak accuracy.
DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
| # ---------- helpers ---------- | |
def _json_err(msg: str, code: int = 500):
    """Build a JSON error response shaped as ``{"error": msg}``.

    Args:
        msg: Human-readable error text placed under the ``"error"`` key.
        code: HTTP status code for the response (defaults to 500).

    Returns:
        A ``JSONResponse`` carrying the error payload plus a wildcard
        ``Access-Control-Allow-Origin`` header and an explicit JSON
        content type.
    """
    response_headers = {
        "Access-Control-Allow-Origin": "*",
        "Content-Type": "application/json",
    }
    payload = {"error": msg}
    return JSONResponse(content=payload, status_code=code, headers=response_headers)
def _auth_headers(beta_realtime: bool = False):
    """Return the HTTP headers used to authenticate against OpenAI.

    Args:
        beta_realtime: When truthy, also include the ``OpenAI-Beta`` header
            that the original author marks as required for Realtime session
            creation.

    Returns:
        A fresh dict with the bearer ``Authorization`` header built from the
        module-level ``OPENAI_API_KEY`` and, optionally, the beta header.
    """
    bearer = {"Authorization": f"Bearer {OPENAI_API_KEY}"}
    if not beta_realtime:
        return bearer
    # Opt in to the Realtime beta on top of the plain bearer token.
    return {**bearer, "OpenAI-Beta": "realtime=v1"}
| # ---------- health ---------- | |
def health():
    """Liveness probe: always answers with ``{"status": "ok"}``.

    NOTE(review): no route decorator is visible in this view of the file;
    confirm how this handler is registered on ``app``.
    """
    cors_header = {"Access-Control-Allow-Origin": "*"}
    return JSONResponse(content={"status": "ok"}, headers=cors_header)
| # ---------- realtime ephemeral ---------- | |
def mint_ephemeral(model: str = DEFAULT_REALTIME_MODEL, voice: str = DEFAULT_VOICE):
    """Create a Realtime session upstream and relay the resulting JSON.

    Posts ``{"model": model, "voice": voice}`` to ``REALTIME_SESSION_URL``
    using the server-side API key and returns whatever JSON the upstream
    sends back.

    Args:
        model: Realtime model identifier forwarded upstream.
        voice: Voice identifier forwarded upstream.

    Returns:
        A ``JSONResponse``:
        * 200 with the upstream JSON body on success;
        * the upstream status code with ``{"error": <upstream body>}`` when
          the upstream answers with an HTTP error (same convention as the
          transcription proxy, so callers can see why the request failed);
        * 500 with ``{"error": ...}`` when the API key is missing, the
          request cannot be sent, or the upstream body is not valid JSON.

    This function performs blocking network I/O (up to 15 s); async callers
    should run it in a worker thread.
    """
    if not OPENAI_API_KEY:
        return _json_err("OPENAI_API_KEY not set in environment", 500)

    try:
        r = requests.post(
            REALTIME_SESSION_URL,
            headers={**_auth_headers(beta_realtime=True), "Content-Type": "application/json"},
            json={"model": model, "voice": voice},
            timeout=15,
        )
    except requests.RequestException as e:
        # Transport-level failure (DNS, connect, timeout, ...): no upstream
        # status exists, so report a plain 500 as before.
        return _json_err(str(e), 500)

    if not r.ok:
        # Forward the upstream status and error body instead of flattening
        # every failure into a 500 with only the exception string.
        return _json_err(r.text or "Realtime session request failed", r.status_code or 500)

    try:
        payload = r.json()
    except ValueError as e:
        # A 2xx answer whose body is not JSON cannot be relayed as a token.
        return _json_err(str(e), 500)

    return JSONResponse(
        status_code=200,
        content=payload,
        headers={"Access-Control-Allow-Origin": "*", "Content-Type": "application/json"},
    )
def ephemeral_get(model: str = DEFAULT_REALTIME_MODEL, voice: str = DEFAULT_VOICE):
    """Mint an ephemeral Realtime session from plain function arguments.

    Thin wrapper that hands ``model`` and ``voice`` straight to
    ``mint_ephemeral`` and returns its response unchanged.

    NOTE(review): no route decorator is visible in this view of the file;
    confirm how this handler is registered on ``app``.
    """
    session_response = mint_ephemeral(model=model, voice=voice)
    return session_response
async def ephemeral_post(request: Request):
    """Mint an ephemeral Realtime session from a JSON request body.

    Reads optional ``"model"`` and ``"voice"`` keys from the JSON body. A
    missing or unparsable body, a non-object body, or a missing/null/empty
    value all fall back to the module defaults.

    Args:
        request: Incoming request whose body may hold the JSON options.

    Returns:
        The response produced by ``mint_ephemeral``.

    NOTE(review): no route decorator is visible in this view of the file;
    confirm how this handler is registered on ``app``.
    """
    # Function-scope import keeps this block self-contained; fastapi is
    # already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    model, voice = DEFAULT_REALTIME_MODEL, DEFAULT_VOICE
    try:
        data = await request.json()
    except Exception:
        # Deliberate best-effort: any body problem means "use defaults".
        data = None

    if isinstance(data, dict):
        # `or` also covers explicit null / empty-string values, which would
        # otherwise be forwarded upstream and rejected.
        model = data.get("model") or DEFAULT_REALTIME_MODEL
        voice = data.get("voice") or DEFAULT_VOICE

    # mint_ephemeral does blocking HTTP via `requests`; run it in a worker
    # thread so the event loop keeps serving other requests meanwhile.
    return await run_in_threadpool(mint_ephemeral, model, voice)
# Catch-all (helps when callers accidentally hit "/" with signed proxy params)
async def catch_all(request: Request, _path: str = ""):
    """Fallback handler: mint a token everywhere except under /transcribe.

    Behavior, in order:
    1. Paths starting with ``/transcribe`` get a 405 JSON hint to use
       ``POST /transcribe``.
    2. ``OPTIONS`` requests get an empty 204 with permissive CORS headers.
    3. Anything else mints a Realtime session with the default model and
       voice (handy for clients that strip paths).

    Args:
        request: Incoming request; only its path and method are inspected.
        _path: Unused in the body; presumably bound to a path parameter by
            the route declaration, which is not visible here (TODO confirm).

    Returns:
        A ``JSONResponse``/``Response`` as described above.

    NOTE(review): any non-OPTIONS method that reaches step 3 (HEAD included,
    if routed here) creates a real upstream session; confirm that is intended.
    """
    # Function-scope import keeps this block self-contained; fastapi is
    # already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    # Serve ephemeral token everywhere except the explicit /transcribe path
    if request.url.path.startswith("/transcribe"):
        return JSONResponse(
            {"error": "use POST /transcribe for audio"},
            status_code=405,
            headers={"Access-Control-Allow-Origin": "*"},
        )

    if request.method == "OPTIONS":
        return Response(
            status_code=204,
            headers={
                "Access-Control-Allow-Origin": "*",
                "Access-Control-Allow-Methods": "GET, POST, OPTIONS, HEAD",
                "Access-Control-Allow-Headers": "*",
            },
        )

    # default: mint realtime token (handy for clients that strip paths).
    # mint_ephemeral blocks on network I/O, so keep it off the event loop.
    return await run_in_threadpool(mint_ephemeral, DEFAULT_REALTIME_MODEL, DEFAULT_VOICE)
| # ---------- NEW: speech-to-text via OpenAI ---------- | |
async def transcribe(
    file: UploadFile = File(..., description="Audio file (wav/mp3/m4a/webm/ogg)"),
    model: str = Form(DEFAULT_STT_MODEL),
    language: Optional[str] = Form(None),
    response_format: str = Form("json"),  # "json" | "text" | "srt" | "verbose_json" | "vtt" (model dependent)
):
    """
    Proxy to OpenAI audio/transcriptions.

    - Default model: gpt-4o-mini-transcribe (fast). Use gpt-4o-transcribe for max accuracy.
    - Send multipart/form-data with 'file' plus optional fields.

    Args:
        file: Uploaded audio; its bytes, filename and content type are
            forwarded upstream (falling back to "audio" and
            "application/octet-stream" when absent).
        model: Transcription model name forwarded upstream.
        language: Optional language hint; omitted upstream when empty.
        response_format: Requested output format; omitted upstream when empty.

    Returns:
        * ``text/plain`` with the upstream body when ``response_format`` is
          "text" and the upstream call succeeded;
        * the upstream JSON when it answers with a JSON content type;
        * ``{"text": <body>}`` for other successful payloads (srt, vtt);
        * ``{"error": ...}`` with the upstream status code when the upstream
          reports an HTTP error (for every response_format), 502 when a
          JSON-typed success body cannot be parsed, or 500 when the API key
          is missing or the request could not be performed.

    NOTE(review): no route decorator is visible in this view of the file;
    confirm how this handler is registered on ``app``.
    """
    # Function-scope import keeps this block self-contained; fastapi is
    # already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    if not OPENAI_API_KEY:
        return _json_err("OPENAI_API_KEY not set in environment", 500)

    cors = {"Access-Control-Allow-Origin": "*"}

    try:
        # read bytes once
        data_bytes = await file.read()
        files = {
            "file": (file.filename or "audio", data_bytes, file.content_type or "application/octet-stream")
        }
        form = {"model": model}
        if language:
            form["language"] = language
        if response_format:
            form["response_format"] = response_format

        # `requests` is blocking (up to 60 s here); run it in a worker thread
        # so the event loop is not stalled for the whole upload.
        r = await run_in_threadpool(
            requests.post,
            AUDIO_TRANSCRIBE_URL,
            headers=_auth_headers(),
            files=files,
            data=form,  # multipart form fields
            timeout=60,
        )
    except Exception as e:
        # Boundary handler: reading the upload or reaching the upstream failed.
        return _json_err(str(e), 500)

    # Check the upstream status BEFORE branching on the format; otherwise an
    # upstream error body would be relayed as a 200 text/plain "transcript".
    if not r.ok:
        # bubble up OpenAI's error text
        return _json_err(r.text or "Transcription failed", r.status_code or 500)

    # If text/plain was requested, forward as text response
    if response_format == "text":
        return Response(content=r.text, media_type="text/plain", headers=cors)

    if "application/json" in r.headers.get("content-type", ""):
        try:
            return JSONResponse(r.json(), headers=cors)
        except ValueError:
            # Upstream claimed JSON but sent something unparsable; report it
            # as a gateway error rather than an "error" payload with 200.
            return _json_err(r.text or "Transcription failed", 502)

    # Non-JSON payloads (srt, vtt) — wrap as {text: "..."} for consistency
    return JSONResponse({"text": r.text}, headers=cors)