| | from dataclasses import dataclass, fields |
| | import gradio as gr |
| | from typing import Optional, Dict |
| | import yaml |
| |
|
| |
|
@dataclass
class WhisperParameters:
    """
    A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
    This data class is used to mitigate the key-value problem between Gradio components and function parameters.
    Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
    See more about Gradio pre-processing: https://www.gradio.app/docs/components

    Attributes
    ----------
    model_size: gr.Dropdown
        Whisper model size.

    lang: gr.Dropdown
        Source language of the file to transcribe.

    is_translate: gr.Checkbox
        Boolean value that determines whether to translate to English.
        It's Whisper's feature to translate speech from another language directly into English end-to-end.

    beam_size: gr.Number
        Int value that is used for decoding option.

    log_prob_threshold: gr.Number
        If the average log probability over sampled tokens is below this value, treat as failed.

    no_speech_threshold: gr.Number
        If the no_speech probability is higher than this value AND
        the average log probability over sampled tokens is below `log_prob_threshold`,
        consider the segment as silent.

    compute_type: gr.Dropdown
        compute type for transcription.
        see more info : https://opennmt.net/CTranslate2/quantization.html

    best_of: gr.Number
        Number of candidates when sampling with non-zero temperature.

    patience: gr.Number
        Beam search patience factor.

    condition_on_previous_text: gr.Checkbox
        if True, the previous output of the model is provided as a prompt for the next window;
        disabling may make the text inconsistent across windows, but the model becomes less prone to
        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.

    prompt_reset_on_temperature: gr.Slider
        This parameter is related to faster-whisper. Resets prompt if temperature is above this value.
        Arg has effect only if `condition_on_previous_text` is True.

    initial_prompt: gr.Textbox
        Optional text to provide as a prompt for the first window. This can be used to provide, or
        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
        to make it more likely to predict those word correctly.

    temperature: gr.Slider
        Temperature for sampling. It can be a tuple of temperatures,
        which will be successively used upon failures according to either
        `compression_ratio_threshold` or `log_prob_threshold`.

    compression_ratio_threshold: gr.Number
        If the gzip compression ratio is above this value, treat as failed

    vad_filter: gr.Checkbox
        Enable the voice activity detection (VAD) to filter out parts of the audio
        without speech. This step is using the Silero VAD model
        https://github.com/snakers4/silero-vad.

    threshold: gr.Slider
        This parameter is related with Silero VAD. Speech threshold.
        Silero VAD outputs speech probabilities for each audio chunk,
        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.

    min_speech_duration_ms: gr.Number
        This parameter is related with Silero VAD. Final speech chunks shorter min_speech_duration_ms are thrown out.

    max_speech_duration_s: gr.Number
        This parameter is related with Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
        than max_speech_duration_s will be split at the timestamp of the last silence that
        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
        split aggressively just before max_speech_duration_s.

    min_silence_duration_ms: gr.Number
        This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
        before separating it

    speech_pad_ms: gr.Number
        This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side

    batch_size: gr.Number
        This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe

    is_diarize: gr.Checkbox
        This parameter is related with whisperx. Boolean value that determines whether to diarize or not.

    hf_token: gr.Textbox
        This parameter is related with whisperx. Huggingface token is needed to download diarization models.
        Read more about : https://huggingface.co/pyannote/speaker-diarization-3.1#requirements

    diarization_device: gr.Dropdown
        This parameter is related with whisperx. Device to run diarization model

    length_penalty: gr.Number
        This parameter is related to faster-whisper. Exponential length penalty constant.

    repetition_penalty: gr.Number
        This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
        (set > 1 to penalize).

    no_repeat_ngram_size: gr.Number
        This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).

    prefix: gr.Textbox
        This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.

    suppress_blank: gr.Checkbox
        This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.

    suppress_tokens: gr.Textbox
        This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
        of symbols as defined in the model config.json file.

    max_initial_timestamp: gr.Number
        This parameter is related to faster-whisper. The initial timestamp cannot be later than this.

    word_timestamps: gr.Checkbox
        This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
        and dynamic time warping, and include the timestamps for each word in each segment.

    prepend_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the next word.

    append_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the previous word.

    max_new_tokens: gr.Number
        This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
        the maximum will be set by the default max_length.

    chunk_length: gr.Number
        This parameter is related to faster-whisper and insanely-fast-whisper. The length of audio segments in seconds.
        If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.

    hallucination_silence_threshold: gr.Number
        This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
        (in seconds) when a possible hallucination is detected.

    hotwords: gr.Textbox
        This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.

    language_detection_threshold: gr.Number
        This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.

    language_detection_segments: gr.Number
        This parameter is related to faster-whisper. Number of segments to consider for the language detection.

    is_bgm_separate: gr.Checkbox
        This parameter is related to UVR. Boolean value that determines whether to separate bgm or not.

    uvr_model_size: gr.Dropdown
        This parameter is related to UVR. UVR model size.

    uvr_device: gr.Dropdown
        This parameter is related to UVR. Device to run UVR model.

    uvr_segment_size: gr.Number
        This parameter is related to UVR. Segment size for UVR model.

    uvr_save_file: gr.Checkbox
        This parameter is related to UVR. Boolean value that determines whether to save the file or not.

    uvr_enable_offload: gr.Checkbox
        This parameter is related to UVR. Boolean value that determines whether to offload the UVR model or not
        after each transcription.
    """
    # NOTE: field order matters — it must stay in sync with WhisperValues, because
    # as_value() maps the positional Gradio outputs onto WhisperValues by position.
    model_size: gr.Dropdown
    lang: gr.Dropdown
    is_translate: gr.Checkbox
    beam_size: gr.Number
    log_prob_threshold: gr.Number
    no_speech_threshold: gr.Number
    compute_type: gr.Dropdown
    best_of: gr.Number
    patience: gr.Number
    condition_on_previous_text: gr.Checkbox
    prompt_reset_on_temperature: gr.Slider
    initial_prompt: gr.Textbox
    temperature: gr.Slider
    compression_ratio_threshold: gr.Number
    vad_filter: gr.Checkbox
    threshold: gr.Slider
    min_speech_duration_ms: gr.Number
    max_speech_duration_s: gr.Number
    min_silence_duration_ms: gr.Number
    speech_pad_ms: gr.Number
    batch_size: gr.Number
    is_diarize: gr.Checkbox
    hf_token: gr.Textbox
    diarization_device: gr.Dropdown
    length_penalty: gr.Number
    repetition_penalty: gr.Number
    no_repeat_ngram_size: gr.Number
    prefix: gr.Textbox
    suppress_blank: gr.Checkbox
    suppress_tokens: gr.Textbox
    max_initial_timestamp: gr.Number
    word_timestamps: gr.Checkbox
    prepend_punctuations: gr.Textbox
    append_punctuations: gr.Textbox
    max_new_tokens: gr.Number
    chunk_length: gr.Number
    hallucination_silence_threshold: gr.Number
    hotwords: gr.Textbox
    language_detection_threshold: gr.Number
    language_detection_segments: gr.Number
    is_bgm_separate: gr.Checkbox
    uvr_model_size: gr.Dropdown
    uvr_device: gr.Dropdown
    uvr_segment_size: gr.Number
    uvr_save_file: gr.Checkbox
    uvr_enable_offload: gr.Checkbox

    def as_list(self) -> list:
        """
        Converts the data class attributes into a list. Use in Gradio UI before Gradio pre-processing.
        See more about Gradio pre-processing: https://www.gradio.app/docs/components

        Returns
        ----------
        A list of Gradio components, in field-declaration order.
        """
        return [getattr(self, f.name) for f in fields(self)]

    @staticmethod
    def as_value(*args) -> 'WhisperValues':
        """
        To use Whisper parameters in function after Gradio post-processing.
        See more about Gradio post-processing: https://www.gradio.app/docs/components

        Parameters
        ----------
        *args
            Positional values produced by Gradio, in the same order as the
            fields declared on this class / on WhisperValues.

        Returns
        ----------
        WhisperValues
            Data class that has values of parameters
        """
        return WhisperValues(*args)
| |
|
| |
|
@dataclass
class WhisperValues:
    """
    A data class holding concrete Whisper parameter values (the "after Gradio
    post-processing" counterpart of WhisperParameters). Field order must match
    WhisperParameters, since WhisperParameters.as_value builds this positionally.
    """
    model_size: str = "large-v3"
    lang: Optional[str] = None
    is_translate: bool = False
    beam_size: int = 5
    log_prob_threshold: float = -1.0
    no_speech_threshold: float = 0.6
    compute_type: str = "float16"
    best_of: int = 5
    patience: float = 1.0
    condition_on_previous_text: bool = True
    prompt_reset_on_temperature: float = 0.5
    initial_prompt: Optional[str] = None
    temperature: float = 0.0
    compression_ratio_threshold: float = 2.4
    vad_filter: bool = False
    threshold: float = 0.5
    min_speech_duration_ms: int = 250
    max_speech_duration_s: float = float("inf")
    min_silence_duration_ms: int = 2000
    speech_pad_ms: int = 400
    batch_size: int = 24
    is_diarize: bool = False
    hf_token: str = ""
    diarization_device: str = "cuda"
    length_penalty: float = 1.0
    repetition_penalty: float = 1.0
    no_repeat_ngram_size: int = 0
    prefix: Optional[str] = None
    suppress_blank: bool = True
    suppress_tokens: Optional[str] = "[-1]"
    max_initial_timestamp: float = 0.0
    word_timestamps: bool = False
    prepend_punctuations: Optional[str] = "\"'“¿([{-"
    append_punctuations: Optional[str] = "\"'.。,,!!??::”)]}、"
    max_new_tokens: Optional[int] = None
    chunk_length: Optional[int] = 30
    hallucination_silence_threshold: Optional[float] = None
    hotwords: Optional[str] = None
    language_detection_threshold: Optional[float] = None
    language_detection_segments: int = 1
    is_bgm_separate: bool = False
    uvr_model_size: str = "UVR-MDX-NET-Inst_HQ_4"
    uvr_device: str = "cuda"
    uvr_segment_size: int = 256
    uvr_save_file: bool = False
    uvr_enable_offload: bool = True

    def to_yaml(self) -> Dict:
        """
        Build a nested plain dict of these values, grouped into the sections
        used by the app's YAML config ("whisper", "vad", "diarization",
        "bgm_separation"). Despite the name, this returns a dict — the caller
        is expected to serialize it (e.g. with yaml.dump).

        Returns
        ----------
        Dict
            Nested mapping of parameter values, suitable for YAML serialization.
        """
        # NOTE(review): compute_type and the *_device fields are deliberately not
        # serialized here — presumably they are machine-specific; confirm with callers.
        data = {
            "whisper": {
                "model_size": self.model_size,
                # None means auto-detect; stored under a human-readable label.
                "lang": "Automatic Detection" if self.lang is None else self.lang,
                "is_translate": self.is_translate,
                "beam_size": self.beam_size,
                "log_prob_threshold": self.log_prob_threshold,
                "no_speech_threshold": self.no_speech_threshold,
                "best_of": self.best_of,
                "patience": self.patience,
                "condition_on_previous_text": self.condition_on_previous_text,
                "prompt_reset_on_temperature": self.prompt_reset_on_temperature,
                # Empty strings from the UI are normalized to None.
                "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
                "temperature": self.temperature,
                "compression_ratio_threshold": self.compression_ratio_threshold,
                "batch_size": self.batch_size,
                "length_penalty": self.length_penalty,
                "repetition_penalty": self.repetition_penalty,
                "no_repeat_ngram_size": self.no_repeat_ngram_size,
                "prefix": None if not self.prefix else self.prefix,
                "suppress_blank": self.suppress_blank,
                "suppress_tokens": self.suppress_tokens,
                "max_initial_timestamp": self.max_initial_timestamp,
                "word_timestamps": self.word_timestamps,
                "prepend_punctuations": self.prepend_punctuations,
                "append_punctuations": self.append_punctuations,
                "max_new_tokens": self.max_new_tokens,
                "chunk_length": self.chunk_length,
                "hallucination_silence_threshold": self.hallucination_silence_threshold,
                "hotwords": None if not self.hotwords else self.hotwords,
                "language_detection_threshold": self.language_detection_threshold,
                "language_detection_segments": self.language_detection_segments,
            },
            "vad": {
                "vad_filter": self.vad_filter,
                "threshold": self.threshold,
                "min_speech_duration_ms": self.min_speech_duration_ms,
                "max_speech_duration_s": self.max_speech_duration_s,
                "min_silence_duration_ms": self.min_silence_duration_ms,
                "speech_pad_ms": self.speech_pad_ms,
            },
            "diarization": {
                "is_diarize": self.is_diarize,
                "hf_token": self.hf_token
            },
            "bgm_separation": {
                "is_separate_bgm": self.is_bgm_separate,
                "model_size": self.uvr_model_size,
                "segment_size": self.uvr_segment_size,
                "save_file": self.uvr_save_file,
                "enable_offload": self.uvr_enable_offload
            },
        }
        return data

    def as_list(self) -> list:
        """
        Converts the data class attributes into a list.

        Returns
        ----------
        A list of Whisper parameter values, in field-declaration order.
        """
        return [getattr(self, f.name) for f in fields(self)]
| |
|