Spaces:

leekwoon
/

Whisper-FastAPI

Sleeping

App Files Files Community

Whisper-FastAPI / main.py

dahyedahye

Add application file

6bac6fb over 1 year ago

raw

history blame

4.18 kB

	import os
	import shutil
	from fastapi import FastAPI, File, UploadFile, Form
	from fastapi.responses import FileResponse, JSONResponse
	from typing import Optional
	from modules.whisper.whisper_factory import WhisperFactory
	from modules.whisper.whisper_parameter import WhisperParameters

	app = FastAPI()

	# Common parent directory for the per-backend Whisper model folders.
	_WHISPER_MODEL_ROOT = os.path.join("models", "Whisper")

	# Module-level inference engine, created once at import time and used by
	# the request handler below. ``whisper_type`` may be "whisper",
	# "faster-whisper" or "insanely-fast-whisper".
	whisper_inf = WhisperFactory.create_whisper_inference(
	    whisper_type="faster-whisper",
	    whisper_model_dir=_WHISPER_MODEL_ROOT,
	    faster_whisper_model_dir=os.path.join(_WHISPER_MODEL_ROOT, "faster-whisper"),
	    insanely_fast_whisper_model_dir=os.path.join(
	        _WHISPER_MODEL_ROOT, "insanely-fast-whisper"
	    ),
	    output_dir="outputs",
	)

	@app.post("/transcribe/")
	async def transcribe_video(
	    file: UploadFile = File(...),
	    model_size: str = Form("large-v2"),
	    language: str = Form("en"),
	    translate: bool = Form(False),
	    file_format: str = Form("SRT"),  # Options: "SRT", "WebVTT", "txt"
	    add_timestamp: bool = Form(True),
	):
	    """
	    Upload a video/audio file and get the generated subtitle file as a response.

	    The upload is stored in a private per-request directory under ``temp/``,
	    handed to ``whisper_inf.transcribe_file`` and removed again once the
	    handler finishes, whether it succeeded or not.

	    Args:
	        file: The uploaded media file (multipart form field).
	        model_size: Forwarded to ``WhisperParameters`` as ``model_size``.
	        language: Forwarded to ``WhisperParameters`` as ``lang``.
	        translate: Forwarded to ``WhisperParameters`` as ``is_translate``.
	        file_format: Subtitle format passed to ``transcribe_file``
	            ("SRT", "WebVTT" or "txt").
	        add_timestamp: Passed through to ``transcribe_file`` unchanged.

	    Returns:
	        A ``FileResponse`` streaming the first file reported by
	        ``transcribe_file``; or a ``JSONResponse`` with status 500 and a
	        ``message`` field when no file was produced or an exception occurred.
	    """
	    # Function-scope import so this handler does not depend on the module's
	    # top-level import list being extended.
	    import tempfile

	    # Bound before the ``try`` so the ``finally`` clause can always inspect
	    # them, even when a failure happens before the upload is written.
	    work_dir = None
	    input_file_path = None

	    try:
	        # Create temporary directories. Every request gets its own
	        # sub-directory so that concurrent uploads sharing a filename cannot
	        # overwrite or delete each other's data.
	        temp_dir = "temp"
	        os.makedirs(temp_dir, exist_ok=True)
	        work_dir = tempfile.mkdtemp(dir=temp_dir)

	        # The filename is client-controlled: strip every directory component
	        # (both "/" and "\\" styles) so the upload can never be written
	        # outside ``work_dir``, and fall back to a fixed name when nothing
	        # usable remains.
	        client_name = (file.filename or "").replace("\\", "/")
	        safe_name = os.path.basename(client_name)
	        if safe_name in ("", ".", ".."):
	            safe_name = "upload"

	        # Save the uploaded file temporarily
	        input_file_path = os.path.join(work_dir, safe_name)
	        with open(input_file_path, "wb") as buffer:
	            shutil.copyfileobj(file.file, buffer)

	        # Prepare whisper parameters
	        whisper_params = WhisperParameters(
	            model_size=model_size,
	            lang=language,
	            is_translate=translate,
	            beam_size=5,
	            log_prob_threshold=-1.0,
	            no_speech_threshold=0.6,
	            compute_type="float16",  # or "int8_float16", etc.
	            best_of=5,
	            patience=1.0,
	            condition_on_previous_text=True,
	            initial_prompt=None,
	            temperature=0.0,
	            compression_ratio_threshold=2.4,
	            vad_filter=False,
	            threshold=0.5,
	            min_speech_duration_ms=250,
	            max_speech_duration_s=9999,
	            min_silence_duration_ms=2000,
	            speech_pad_ms=400,
	            chunk_length_s=None,
	            batch_size=None,
	            is_diarize=False,
	            hf_token=None,
	            diarization_device=None,
	            length_penalty=1.0,
	            repetition_penalty=1.0,
	            no_repeat_ngram_size=0,
	            prefix=None,
	            suppress_blank=True,
	            suppress_tokens="[-1]",
	            max_initial_timestamp=1.0,
	            word_timestamps=False,
	            prepend_punctuations="\"'“¿([{-",
	            append_punctuations="\"'.。,，!！?？:：”)]}、",
	            max_new_tokens=None,
	            chunk_length=None,
	            hallucination_silence_threshold=None,
	            hotwords=None,
	            language_detection_threshold=None,
	            language_detection_segments=1,
	            prompt_reset_on_temperature=0.5,
	        )

	        # Leading positional arguments for transcribe_file; the empty string
	        # fills the second positional slot exactly as before.
	        params = [input_file_path, "", file_format, add_timestamp]

	        # Transcribe the file.
	        # NOTE(review): this call is not awaited, so if it blocks it will
	        # occupy the event loop for the duration of the transcription --
	        # consider off-loading it to a worker thread.
	        result_str, result_files = whisper_inf.transcribe_file(
	            *params,  # Expand the params list
	            *whisper_params.as_list(),  # Expand whisper_params as individual arguments
	        )

	        # Check if transcription was successful
	        if not result_files:
	            return JSONResponse(status_code=500, content={"message": "Transcription failed."})

	        # Return the first result file
	        output_file_path = result_files[0]
	        return FileResponse(
	            path=output_file_path,
	            filename=os.path.basename(output_file_path),
	            media_type='application/octet-stream',
	        )
	    except Exception as e:
	        # Top-level request boundary: report the failure to the client
	        # instead of letting the exception propagate.
	        return JSONResponse(status_code=500, content={"message": str(e)})
	    finally:
	        # Clean up temporary files. Best-effort: a cleanup problem must not
	        # replace the response chosen above.
	        if input_file_path is not None:
	            try:
	                os.remove(input_file_path)
	            except OSError:
	                pass
	        if work_dir is not None:
	            try:
	                # rmdir only succeeds on an empty directory, so anything
	                # else that may have been written there is left untouched.
	                os.rmdir(work_dir)
	            except OSError:
	                pass