File size: 2,929 Bytes
eab92fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*- coding: utf-8 -*-
"""

Created on Tue Nov 22 18:32:21 2022



@author: renyu

"""
#
# cutMp3bySrt.py

import pysrt
import pandas as pd
import re

import shutil
import pysrt
import ffmpeg
import pydub
import os, sys, glob, pathlib

# srcDir: folder scanned for the input *.mp3 files (and used as the default
#         source folder of ryCreateDataset).
# tgtDir: folder under which ryCreateDataset writes its output.
srcDir = 'shortDir'
tgtDir = 'shortDir_20'

# Make sure both folders exist before anything tries to read or write them.
for _workDir in (srcDir, tgtDir):
    os.makedirs(_workDir, exist_ok=True)

def ryCreateDataset(fnBase, srcDir= srcDir, timeLimit= 20):
    """Cut one recording into subtitle-aligned mp3 clips plus a metadata.csv.

    Consecutive subtitle cues are merged into one clip for as long as the
    span from the clip's first cue start to the current cue end stays within
    ``timeLimit`` seconds.  Each clip is exported to
    ``{tgtDir}/{name}/data/{name}_{k:04d}.mp3`` and one row per clip is
    appended to ``{tgtDir}/{name}/metadata.csv`` (``tgtDir`` is the
    module-level target folder).

    Parameters
    ----------
    fnBase : str
        Path or file name of the recording; the directory part and a trailing
        ``.mp4`` / ``.mp3`` are stripped to obtain the base name.
    srcDir : str
        Folder that holds ``{name}.mp3`` (or ``{name}.mp4``) and the subtitle
        file.
    timeLimit : int or float
        Target maximum clip length in seconds.  A single cue that is longer
        than this on its own still becomes one (over-long) clip.

    Raises
    ------
    subprocess.CalledProcessError
        If the mp3 is missing and the ffmpeg conversion from mp4 fails.
    """
    # Local import: only needed for the mp4 -> mp3 fallback below.
    import subprocess

    fnBase= os.path.basename(fnBase).removesuffix('.mp4').removesuffix('.mp3')

    # Subtitle lookup order: zh-TW, zh-CN, zh-Hans, then the plain .srt.
    fn_srt_plain= f"{srcDir}/{fnBase}.srt"
    fn_srt= fn_srt_plain
    for lang in ('zh-TW', 'zh-CN', 'zh-Hans'):
        candidate= f"{srcDir}/{fnBase}.{lang}.srt"
        if os.path.isfile(candidate):
            fn_srt= candidate
            break
    # Paths containing 'Combine' always use the plain .srt.
    if 'Combine' in fn_srt:
        fn_srt= fn_srt_plain

    fn_mp3= f"{srcDir}/{fnBase}.mp3"
    fn_mp4= f"{srcDir}/{fnBase}.mp4"

    if not os.path.isfile(fn_mp3):
        # Argument list instead of a shell string, so file names containing
        # quotes or shell metacharacters are passed through untouched.
        subprocess.run(['ffmpeg', '-i', fn_mp4, fn_mp3], check=True)

    mp3= pydub.AudioSegment.from_mp3(fn_mp3)
    srt= pysrt.open(fn_srt)

    outDir= f'{tgtDir}/{fnBase}'
    os.makedirs(f'{outDir}/data', exist_ok= True)  # also creates outDir

    T= 1000 * timeLimit  # timeLimit sec, in milliseconds

    with open(f'{outDir}/metadata.csv',
              'w',
              encoding='utf8') as fp:

        fp.write('file_name,transcription\n')

        def writeClip(k, tStart, tEnd, text):
            """Export mp3[tStart:tEnd] as clip number k and log its row."""
            fn= f'{fnBase}_{k:04d}.mp3'
            mp3[tStart:tEnd].export(f'{outDir}/data/{fn}')
            # NOTE(review): the row is hand-built and the text is not
            # escaped, so a '"' inside a subtitle yields a malformed row.
            # Format kept as-is for existing consumers; confirm before
            # switching to the csv module.
            fp.write(f'"data/{fn}", "{text}"\n')

        # t0 is None while no clip is open.  (0 cannot serve as the sentinel:
        # a first cue starting at 0 ms would look like "no clip open".)
        t0= None
        t2= None     # end of the last cue merged into the open clip
        sText= ''
        k= 0

        for s in srt:
            t1= s.end.ordinal

            # Adding this cue would exceed the limit: close the open clip
            # first, then let this cue start the next clip so that it is not
            # lost.
            if t0 is not None and t1 - t0 > T:
                writeClip(k, t0, t2, sText)
                k+= 1
                t0= None
                sText= ''

            if t0 is None:
                t0= s.start.ordinal

            t2= t1
            # The text is not normalized; only the line breaks of the
            # original srt are replaced by spaces.
            sText += s.text.replace('\n', ' ') + ' '

        # Flush whatever is still open after the last cue.
        if t0 is not None:
            writeClip(k, t0, t2, sText)


# Build a dataset for every mp3 found directly inside srcDir, echoing each
# path before it is processed.
mp3Pattern = f'{srcDir}/*.mp3'
cL = glob.glob(mp3Pattern)
for c in cL:
    print(c)
    ryCreateDataset(c, srcDir)