Spaces:

tranquilTrill
/

speech2textWithWhisper

Sleeping

File size: 2,769 Bytes


from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
import uvicorn
from fastapi.middleware.cors import CORSMiddleware
import whisper
import shutil
import os
import moviepy.editor as mp
import uuid

# Application object; the Swagger UI is configured with the "obsidian"
# syntax-highlight theme.
app = FastAPI(
    swagger_ui_parameters={"syntaxHighlight": {"theme": "obsidian"}},
)

# CORS: every origin, method and header is accepted, with credentials enabled.
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive -- confirm this is intended before exposing the service publicly.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Whisper "base" model, loaded once at import time.
model = whisper.load_model("base")

def transcribe_with_whisper(fpath):
    """Transcribe the audio file at ``fpath`` with the module-level model.

    The recognised text (the ``"text"`` entry of the transcription result) is
    echoed to stdout and returned.

    Errors are not propagated: if anything raises while transcribing, the
    exception's message is returned in place of the transcript.
    """
    try:
        text = model.transcribe(fpath)["text"]
        print("whisper result:", text, sep="\n")
    except Exception as err:
        # Callers receive the error text as if it were the transcript.
        return str(err)
    return text

@app.post("/transcribe-audio")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return ``{"text": ...}``.

    The upload is streamed to a temporary file in the working directory,
    passed to ``transcribe_with_whisper`` and removed afterwards.

    Args:
        file: The uploaded audio file (multipart form field ``file``).

    Returns:
        A dict with a single ``"text"`` key holding either the transcript or,
        if saving/transcribing failed, the error message.
    """
    if not file:
        return {"text": "No file sent"}

    # A unique name per request: a shared fixed path would let concurrent
    # uploads overwrite or delete each other's file mid-transcription.
    file_location = f"{uuid.uuid4()}.wav"

    try:
        with open(file_location, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        return {"text": transcribe_with_whisper(file_location)}

    except Exception as e:
        return {"text": str(e)}

    finally:
        # Always clean up, including when saving the upload failed part-way.
        if os.path.exists(file_location):
            os.remove(file_location)


#region transcribe video

@app.post("/transcribe-video")
async def transcribe_video(file: UploadFile = File(...)):
    """Extract the audio track of an uploaded video and transcribe it.

    Args:
        file: The uploaded video file (multipart form field ``file``).

    Returns:
        A ``JSONResponse`` with ``{"video": <original filename>,
        "transcript": <text>}`` on success, or status 500 with
        ``{"error": <message>}`` if saving, extraction or transcription fails.
    """
    # The client-supplied filename is untrusted: only its extension is kept
    # (after stripping any directory components) so it can never steer where
    # the temporary files are written.
    extension = os.path.splitext(os.path.basename(file.filename or ""))[1]
    token = uuid.uuid4().hex
    temp_video_path = f"{token}_video{extension}"
    # Distinct from the video path even when the upload itself is a ".wav",
    # so the extracted audio never overwrites the file being read.
    temp_audio_path = f"{token}_audio.wav"

    try:
        # Stream the upload to disk instead of loading it fully into memory.
        with open(temp_video_path, "wb") as f:
            shutil.copyfileobj(file.file, f)

        # Extract and transcribe
        extract_audio_from_video(temp_video_path, temp_audio_path)
        transcript = transcribe_audio_to_text(temp_audio_path)

        return JSONResponse(content={
            "video": file.filename,
            "transcript": transcript
        })

    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})

    finally:
        # Cleanup (also covers a partially written upload).
        for path in (temp_video_path, temp_audio_path):
            if os.path.exists(path):
                os.remove(path)

def extract_audio_from_video(video_path: str, audio_path: str) -> None:
    """Write the audio track of the video at ``video_path`` to ``audio_path``.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination path for the extracted audio.

    Raises:
        ValueError: If the video has no audio track.
    """
    clip = mp.VideoFileClip(video_path)
    try:
        if clip.audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("The video has no audio track")
        clip.audio.write_audiofile(audio_path)
    finally:
        # Release the clip's underlying reader resources on every path.
        clip.close()

# Whisper models already loaded by transcribe_audio_to_text, keyed by size name.
_WHISPER_MODELS = {}


def _get_whisper_model(model_size: str):
    """Return the Whisper model for ``model_size``, loading it on first use."""
    loaded = _WHISPER_MODELS.get(model_size)
    if loaded is None:
        loaded = whisper.load_model(model_size)
        _WHISPER_MODELS[model_size] = loaded
    return loaded


def transcribe_audio_to_text(audio_path: str, model_size: str = "base") -> str:
    """Transcribe the audio file at ``audio_path`` with a Whisper model.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_size: Name of the Whisper model to use.

    Returns:
        The stripped text of every segment in the result, one per line.
    """
    # Reuse a cached model: loading one on every request is expensive.
    result = _get_whisper_model(model_size).transcribe(audio_path)
    return "\n".join(seg["text"].strip() for seg in result["segments"])
#endregion transcribe video


if __name__ == "__main__":
    # Serve the app directly when this module is executed as a script,
    # listening on every interface at port 7860.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )