Spaces:

leekwoon
/

Whisper-FastAPI

Sleeping

App Files Files Community

Whisper-FastAPI / main.py

dahyedahye

Add application file

d6817dc over 1 year ago

raw

history blame contribute delete

4.29 kB

	import os
	import shutil
	from fastapi import FastAPI, File, UploadFile, Form
	from fastapi.responses import FileResponse, JSONResponse
	from typing import Optional
	from modules.whisper.whisper_factory import WhisperFactory
	from modules.whisper.whisper_parameter import WhisperParameters

	app = FastAPI()

	# Build the Whisper backend once at import time; the /transcribe/ handler
	# reuses this single instance.
	# Accepted whisper_type values: "whisper", "faster-whisper", "insanely-fast-whisper".
	_whisper_model_root = os.path.join("models", "Whisper")
	whisper_inf = WhisperFactory.create_whisper_inference(
	    whisper_type="faster-whisper",
	    whisper_model_dir=_whisper_model_root,
	    faster_whisper_model_dir=os.path.join(_whisper_model_root, "faster-whisper"),
	    insanely_fast_whisper_model_dir=os.path.join(
	        _whisper_model_root, "insanely-fast-whisper"
	    ),
	    output_dir="outputs",
	)

	def _safe_upload_name(filename: Optional[str]) -> Optional[str]:
	    """Return the bare file name of an upload, or None if it is unusable.

	    The name is supplied by the client, so directory components (with
	    either "/" or "\\" separators) are stripped to keep the saved file
	    inside the temp directory.
	    """
	    if not filename:
	        return None
	    name = os.path.basename(filename.replace("\\", "/"))
	    if name in ("", ".", ".."):
	        return None
	    return name


	@app.post("/transcribe/")
	async def transcribe_video(
	    file: UploadFile = File(...),
	    model_size: str = Form("large-v2"),
	    language: str = Form("en"),
	    translate: bool = Form(False),
	    file_format: str = Form("SRT"),  # Options: "SRT", "WebVTT", "txt"
	    add_timestamp: bool = Form(True),
	):
	    """
	    Upload a video/audio file and get the generated subtitle file as a response.

	    Args:
	        file: The uploaded media file; it is saved under ``temp/`` for the
	            duration of the request and removed afterwards.
	        model_size: Forwarded to ``WhisperParameters`` as ``model_size``.
	        language: Forwarded to ``WhisperParameters`` as ``lang``.
	        translate: Forwarded to ``WhisperParameters`` as ``is_translate``.
	        file_format: Subtitle format passed to ``transcribe_file``.
	        add_timestamp: Passed through to ``transcribe_file``.

	    Returns:
	        A ``FileResponse`` for the first file reported by the transcriber,
	        a 400 ``JSONResponse`` when the upload has no usable file name, or
	        a 500 ``JSONResponse`` when transcription yields no files or any
	        exception is raised.
	    """
	    safe_name = _safe_upload_name(file.filename)
	    if safe_name is None:
	        return JSONResponse(
	            status_code=400,
	            content={"message": "Invalid or missing file name."},
	        )

	    # Bound before the try block so the cleanup in `finally` can always
	    # refer to it, even when directory creation or the copy fails.
	    temp_dir = "temp"
	    input_file_path = os.path.join(temp_dir, safe_name)

	    try:
	        os.makedirs(temp_dir, exist_ok=True)

	        # NOTE(review): the copy and the transcription below are blocking
	        # calls inside an async handler; consider a plain `def` endpoint or
	        # a thread pool if concurrent requests matter.
	        # NOTE(review): two simultaneous uploads with the same file name
	        # share this path — confirm whether that can happen in practice.
	        with open(input_file_path, "wb") as buffer:
	            shutil.copyfileobj(file.file, buffer)

	        # Prepare whisper parameters
	        whisper_params = WhisperParameters(
	            model_size=model_size,
	            lang=language,
	            is_translate=translate,
	            beam_size=5,
	            log_prob_threshold=-1.0,
	            no_speech_threshold=0.6,
	            compute_type="float16",  # or "int8_float16", etc.
	            best_of=5,
	            patience=1.0,
	            condition_on_previous_text=True,
	            initial_prompt=None,
	            temperature=0.0,
	            compression_ratio_threshold=2.4,
	            vad_filter=False,
	            threshold=0.5,
	            min_speech_duration_ms=250,
	            max_speech_duration_s=9999,
	            min_silence_duration_ms=2000,
	            speech_pad_ms=400,
	            chunk_length_s=None,
	            batch_size=None,
	            is_diarize=False,
	            hf_token=None,
	            diarization_device=None,
	            length_penalty=1.0,
	            repetition_penalty=1.0,
	            no_repeat_ngram_size=0,
	            prefix=None,
	            suppress_blank=True,
	            suppress_tokens="[-1]",
	            max_initial_timestamp=1.0,
	            word_timestamps=False,
	            prepend_punctuations="\"'“¿([{-",
	            append_punctuations="\"'.。,，!！?？:：”)]}、",
	            max_new_tokens=None,
	            chunk_length=None,
	            hallucination_silence_threshold=None,
	            hotwords=None,
	            language_detection_threshold=None,
	            language_detection_segments=1,
	            prompt_reset_on_temperature=0.5,
	        )

	        # Transcribe the file.
	        # NOTE(review): Python binds the values unpacked from as_list()
	        # positionally *before* the keyword arguments written above them.
	        # If transcribe_file declares files/input_folder_path/... as its
	        # leading positional parameters, this call raises
	        # "TypeError: got multiple values for argument" — confirm against
	        # the transcribe_file signature.
	        _result_text, result_files = whisper_inf.transcribe_file(
	            files=[input_file_path],
	            input_folder_path="",
	            file_format=file_format,
	            add_timestamp=add_timestamp,
	            *whisper_params.as_list()  # Expand whisper_params as individual arguments
	        )

	        # Check if transcription was successful
	        if not result_files:
	            return JSONResponse(status_code=500, content={"message": "Transcription failed."})

	        # Return the first result file
	        output_file_path = result_files[0]
	        return FileResponse(
	            path=output_file_path,
	            filename=os.path.basename(output_file_path),
	            media_type='application/octet-stream',
	        )
	    except Exception as e:
	        return JSONResponse(status_code=500, content={"message": str(e)})
	    finally:
	        # Best-effort removal of the uploaded copy: a missing file or a
	        # failed delete must not replace the response being returned.
	        try:
	            os.remove(input_file_path)
	        except OSError:
	            pass