import numpy as np
def cut_audio(audio, timestamps, sample_rate=16000):
    """Extract and concatenate audio segments given start/end timestamps.

    Parameters
    ----------
    audio : np.ndarray
        1-D mono signal, or 2-D (samples, channels) which is averaged
        down to mono first.
    timestamps : list[dict]
        Each dict has 'start' and 'end' keys, in seconds.
    sample_rate : int, optional
        Samples per second of `audio`. Defaults to 16000 (the value that
        was previously hard-coded).

    Returns
    -------
    tuple[np.ndarray, int]
        The concatenated segments and the sample rate used.
    """
    # Down-mix multi-channel input to mono by averaging the channels.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
    # Duration of a single sample in seconds.
    sample_length = 1 / sample_rate
    cut_segments = []
    for index, timestamp in enumerate(timestamps):
        start_time = timestamp["start"]
        end_time = timestamp["end"]
        start_sample = int(start_time / sample_length)
        if index == len(timestamps) - 1:
            # Pad the final segment with one extra second of audio;
            # slicing past the end of the array is safe in NumPy.
            end_sample = int(end_time / sample_length) + int(1 / sample_length)
        else:
            end_sample = int(end_time / sample_length)
        cut_segments.append(audio[start_sample:end_sample])
    # Robustness fix: np.concatenate raises ValueError on an empty list,
    # so return an empty array when no timestamps were supplied.
    if not cut_segments:
        return np.array([], dtype=audio.dtype), sample_rate
    final_audio = np.concatenate(cut_segments)
    return final_audio, sample_rate
def get_word_timestamps(segments):
    """Flatten per-segment word objects into a list of timestamp dicts.

    Each returned dict carries the word's text together with its start
    and end times, in the order the words appear across all segments.
    """
    return [
        {'text': word.word, 'start': word.start, 'end': word.end}
        for segment in segments
        for word in segment.words
    ]
def get_transcription(word_timestamps):
    """Concatenate the 'text' field of every word entry into one string.

    Uses str.join instead of repeated `+=` in a loop, which is
    quadratic in the worst case on non-CPython interpreters.
    """
    return ''.join(entry['text'] for entry in word_timestamps)
def get_modified_timestamps(word_timestamps, filtered_text):
    """Align words of the filtered text with the original word timestamps.

    For each word in the first filtered string, scan `word_timestamps`
    for the first entry whose space-stripped text matches and collect it.
    The final matched entry is dropped, mirroring the original behaviour.

    NOTE: entries in `word_timestamps` are mutated in place (spaces are
    stripped from their 'text' values), as in the original implementation.
    """
    mod_timestamp = []
    for target in filtered_text[0].split():
        for entry in word_timestamps:
            # Normalize the stored text so ' hello' matches 'hello'.
            entry['text'] = entry['text'].replace(' ', '')
            if target == entry['text']:
                mod_timestamp.append(entry)
                break
    # Drop the trailing match; guard against an empty result so we do not
    # raise IndexError when nothing matched (robustness fix — the unused
    # counter variable from the original was also removed).
    if mod_timestamp:
        mod_timestamp.pop()
    return mod_timestamp
def filterText(text, model, tokenizer):
    """Run the seq2seq model over `text` and return the decoded output.

    The input is tokenized to a fixed 512-token window, generated with
    beam search on CPU, and decoded back to a list of strings (one per
    returned sequence; num_return_sequences=1 here, so a single-element
    list).
    """
    device = 'cpu'
    model = model.to(device)
    # Tokenize with padding/truncation to a fixed-length 512-token input.
    encoding = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt',
    )
    # Beam-search generation; penalties discourage repetition and rambling.
    output_ids = model.generate(
        input_ids=encoding['input_ids'].to(device),
        attention_mask=encoding['attention_mask'].to(device),
        max_new_tokens=150,
        no_repeat_ngram_size=2,
        min_new_tokens=1,
        repetition_penalty=2.0,
        length_penalty=0.5,
        num_beams=10,
        num_return_sequences=1,
    )
    decoded = []
    for ids in output_ids:
        decoded.append(
            tokenizer.decode(ids, skip_special_tokens=True,
                             clean_up_tokenization_spaces=True)
        )
    return decoded