File size: 2,771 Bytes
5aae02b
d4362f5
5aae02b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4362f5
 
 
1de986b
5aae02b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23feff5
5aae02b
 
 
 
 
d4362f5
5aae02b
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import numpy as np



def cut_audio(audio, timestamps, sample_rate=16000):
    """Extract and concatenate audio segments given word/phrase timestamps.

    Args:
        audio: 1-D mono or 2-D (samples, channels) numpy array. Stereo input
            is averaged down to mono before cutting.
        timestamps: list of dicts with float 'start' and 'end' keys (seconds).
        sample_rate: samples per second of `audio` (default 16000, matching
            the original hard-coded rate).

    Returns:
        Tuple of (concatenated 1-D numpy array, sample_rate). Returns an
        empty array when `timestamps` is empty instead of raising.
    """
    # Collapse stereo (or multi-channel) input to mono by channel mean.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)

    cut_segments = []
    last = len(timestamps) - 1
    for index, timestamp in enumerate(timestamps):
        # Convert seconds to sample indices directly (avoids the previous
        # double-rounding through a reciprocal sample length).
        start_sample = int(timestamp["start"] * sample_rate)
        end_sample = int(timestamp["end"] * sample_rate)
        if index == last:
            # Pad one extra second after the final segment so the tail of
            # the last word is not clipped.
            end_sample += sample_rate
        cut_segments.append(audio[start_sample:end_sample])

    if not cut_segments:
        # np.concatenate raises on an empty list; return an empty signal.
        return np.array([], dtype=audio.dtype), sample_rate

    return np.concatenate(cut_segments), sample_rate


def get_word_timestamps(segments):
    """Flatten transcription segments into a list of per-word timestamp dicts.

    Each word object on a segment is expected to expose `.word`, `.start`
    and `.end` attributes; every word becomes a dict with 'text', 'start'
    and 'end' keys, in segment order.
    """
    return [
        {'text': w.word, 'start': w.start, 'end': w.end}
        for seg in segments
        for w in seg.words
    ]


def get_transcription(word_timestamps):
    """Concatenate the 'text' field of each word entry into one string.

    Uses str.join instead of repeated `+=`, which is quadratic in the
    worst case on long transcripts.
    """
    return ''.join(entry['text'] for entry in word_timestamps)



def get_modified_timestamps(word_timestamps, filtered_text):
    """Match each word of the filtered text back to an original timestamp.

    For every word in `filtered_text[0]`, scans `word_timestamps` from the
    start and keeps the first entry whose space-stripped text equals the
    word. NOTE: the space-stripping mutates the entries of
    `word_timestamps` in place (original behavior, preserved). The final
    matched entry is dropped before returning (original behavior).

    Args:
        word_timestamps: list of dicts with 'text', 'start', 'end' keys.
        filtered_text: sequence whose first element is the filtered
            transcript string to align.

    Returns:
        List of matched timestamp dicts, minus the last match. Returns an
        empty list (instead of raising IndexError) when nothing matched.
    """
    mod_timestamp = []
    for target in filtered_text[0].split():
        for entry in word_timestamps:
            # Normalize in place so comparisons see space-free tokens.
            entry['text'] = entry['text'].replace(' ', '')
            if entry['text'] == target:
                mod_timestamp.append(entry)
                break
    # Original code unconditionally popped the last match; guard against
    # the empty case so no-match input doesn't raise IndexError.
    if mod_timestamp:
        mod_timestamp.pop()
    return mod_timestamp





def filterText(text, model, tokenizer, device='cpu'):
    """Run a seq2seq model over `text` and return the decoded generations.

    Args:
        text: input string (or batch of strings) to filter/rewrite.
        model: a Hugging Face-style model exposing `.to()` and `.generate()`.
        tokenizer: matching tokenizer; must be callable and expose `.decode()`.
        device: torch device string to run on (default 'cpu', matching the
            previous hard-coded behavior).

    Returns:
        List of decoded strings, one per returned sequence (length 1 here
        since num_return_sequences=1).
    """
    model = model.to(device)

    # Tokenize with fixed-length padding/truncation so generate() sees a
    # consistent 512-token input.
    text_encoding = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt',
    )

    generated_ids = model.generate(
        input_ids=text_encoding['input_ids'].to(device),
        attention_mask=text_encoding['attention_mask'].to(device),
        max_new_tokens=150,
        no_repeat_ngram_size=2,
        min_new_tokens=1,
        repetition_penalty=2.0,
        length_penalty=0.5,
        num_beams=10,
        num_return_sequences=1,
    )

    return [
        tokenizer.decode(gen_id, skip_special_tokens=True,
                         clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]