"""FastAPI service exposing Whisper-based transcription for audio and video uploads."""

from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
import uvicorn
from fastapi.middleware.cors import CORSMiddleware
import whisper
import shutil
import os
import moviepy.editor as mp
import uuid

app = FastAPI(swagger_ui_parameters={"syntaxHighlight": {"theme": "obsidian"}})

# CORS: wide open ("*"). Fine for a demo; lock down for production.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the base Whisper model once at startup so requests don't pay the
# model-load cost (the video path previously reloaded it per request).
model = whisper.load_model("base")


def transcribe_with_whisper(fpath: str) -> str:
    """Transcribe the audio file at *fpath* with the preloaded base model.

    Returns the transcript text, or — preserving the original contract —
    the stringified exception message on failure (callers surface it as-is).
    """
    try:
        transcription = model.transcribe(fpath)
        result = transcription["text"]
        print("whisper result:")
        print(result)
        return result
    except Exception as e:
        # NOTE(review): errors are returned as the "transcript" string rather
        # than raised; the /transcribe-audio endpoint relies on this.
        return str(e)


@app.post("/transcribe-audio")
async def transcribe(file: UploadFile = File(...)):
    """Accept an audio upload and return {"text": <transcript or error>}."""
    if not file:
        return {"text": "No file sent"}
    # BUG FIX: the original wrote every upload to a fixed "newfile.wav",
    # so concurrent requests clobbered each other's data. Use a unique
    # name per request (consistent with the video endpoint below).
    file_location = f"{uuid.uuid4()}.wav"
    try:
        with open(file_location, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        result = transcribe_with_whisper(file_location)
        return {"text": result}
    except Exception as e:
        return {"text": str(e)}
    finally:
        # BUG FIX: cleanup was previously skipped when an exception fired
        # before os.remove(), leaking temp files on every failure.
        if os.path.exists(file_location):
            os.remove(file_location)


# region transcribe video
@app.post("/transcribe-video")
async def transcribe_video(file: UploadFile = File(...)):
    """Accept a video upload, extract its audio track, and return the transcript.

    Responds with {"video": <name>, "transcript": <text>} on success, or a
    500 JSON error payload on failure. Temp files are always cleaned up.
    """
    # Unique temp paths so concurrent uploads never collide.
    temp_video_path = f"{uuid.uuid4()}_{file.filename}"
    temp_audio_path = temp_video_path.rsplit(".", 1)[0] + ".wav"

    # Persist the upload to disk; moviepy needs a real file path.
    with open(temp_video_path, "wb") as f:
        content = await file.read()
        f.write(content)

    try:
        extract_audio_from_video(temp_video_path, temp_audio_path)
        transcript = transcribe_audio_to_text(temp_audio_path)
        return JSONResponse(content={
            "video": file.filename,
            "transcript": transcript
        })
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
    finally:
        # Remove both temp artifacts regardless of success/failure.
        if os.path.exists(temp_video_path):
            os.remove(temp_video_path)
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)


def extract_audio_from_video(video_path: str, audio_path: str) -> None:
    """Write the audio track of *video_path* to *audio_path* as a WAV file."""
    clip = mp.VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_path)


def transcribe_audio_to_text(audio_path: str, model_size: str = "base") -> str:
    """Transcribe *audio_path* and return the segments joined by newlines.

    PERF FIX: the original called whisper.load_model() on every request;
    reuse the module-level "base" model when the default size is requested,
    loading a fresh model only for non-default sizes (interface unchanged).
    """
    whisper_model = model if model_size == "base" else whisper.load_model(model_size)
    result = whisper_model.transcribe(audio_path)
    return "\n".join(seg["text"].strip() for seg in result["segments"])
# endregion transcribe video


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)