| |
|
| | """
|
| | Created on Tue Nov 22 18:32:21 2022
|
| |
|
| | @author: renyu
|
| | """
|
| |
|
| |
|
| |
|
| | import pysrt
|
| | import pandas as pd
|
| | import re
|
| |
|
| | import shutil
|
| | import pysrt
|
| | import ffmpeg
|
| | import pydub
|
| | import os, sys, glob, pathlib
|
| |
|
# Directory that is scanned for input media (*.mp3 / *.mp4) and subtitles.
srcDir = 'shortDir'

# Directory under which the generated per-file datasets are written.
tgtDir = 'shortDir_20'

# Make sure both directories exist before anything reads or writes them.
os.makedirs(name=srcDir, exist_ok=True)
os.makedirs(name=tgtDir, exist_ok=True)
|
| |
|
def _iter_segments(cues, limit_ms):
    """Group consecutive subtitle cues into segments no longer than *limit_ms*.

    Each cue must expose ``start.ordinal``, ``end.ordinal`` and ``text``.
    Yields ``(start_ms, end_ms, text)`` tuples, where ``text`` is the cue
    texts of the segment joined by single spaces (newlines inside a cue are
    replaced by spaces).

    A segment is closed as soon as adding the next cue would make
    ``cue.end - segment.start`` exceed *limit_ms*; that cue then opens the
    next segment, so no cue is ever dropped. A single cue that is longer than
    *limit_ms* on its own is emitted as a one-cue segment.
    """
    seg_start = None  # None means "no segment currently open"
    seg_end = None
    texts = []

    for cue in cues:
        cue_start = cue.start.ordinal
        cue_end = cue.end.ordinal

        # Close the open segment first if this cue would overflow it.
        if seg_start is not None and cue_end - seg_start > limit_ms:
            yield seg_start, seg_end, ' '.join(texts)
            seg_start = None

        if seg_start is None:
            seg_start = cue_start
            texts = []

        seg_end = cue_end
        texts.append(cue.text.replace('\n', ' '))

    # Emit whatever is still open after the last cue.
    if seg_start is not None:
        yield seg_start, seg_end, ' '.join(texts)


def ryCreateDataset(fnBase, srcDir=srcDir, timeLimit=20):
    """Split one recording into short clips plus a ``metadata.csv`` transcript.

    The subtitle file for *fnBase* is looked up in *srcDir*, its cues are
    grouped into segments (see ``_iter_segments``), and for every segment the
    matching slice of the MP3 is exported to
    ``{tgtDir}/{fnBase}/data/{fnBase}_{NNNN}.mp3``. A row
    ``data/<clip name>,<joined cue text>`` is written to
    ``{tgtDir}/{fnBase}/metadata.csv`` under the header
    ``file_name,transcription``.

    Args:
        fnBase: Path or file name of the recording; the directory part and a
            trailing ``.mp4`` / ``.mp3`` are stripped.
        srcDir: Directory containing the ``.mp3`` (or ``.mp4``) and ``.srt``
            files.
        timeLimit: Upper bound for a segment. It is multiplied by 1000 before
            being compared with differences of cue ``ordinal`` values, so it
            is presumably in seconds (ordinals in milliseconds).

    Raises:
        subprocess.CalledProcessError: if the MP3 is missing and the
            ``ffmpeg`` conversion from the MP4 fails.
    """
    # Imported locally so the block does not depend on the file's import list.
    import csv
    import subprocess

    fnBase = os.path.basename(fnBase).removesuffix('.mp4').removesuffix('.mp3')

    # Prefer a language-tagged subtitle file; fall back to the plain ".srt".
    plain_srt = f"{srcDir}/{fnBase}.srt"
    fn_srt = plain_srt
    for lang_tag in ('zh-TW', 'zh-CN', 'zh-Hans'):
        candidate = f"{srcDir}/{fnBase}.{lang_tag}.srt"
        if os.path.isfile(candidate):
            fn_srt = candidate
            break
    # NOTE(review): paths containing "Combine" always use the plain ".srt".
    # The original nesting of this check was not recoverable; confirm intent.
    if 'Combine' in fn_srt:
        fn_srt = plain_srt

    fn_mp3 = f"{srcDir}/{fnBase}.mp3"
    fn_mp4 = f"{srcDir}/{fnBase}.mp4"

    if not os.path.isfile(fn_mp3):
        # Argument list instead of a shell string: file names containing
        # quotes, spaces or "$" are passed through unchanged.
        subprocess.run(['ffmpeg', '-i', fn_mp4, fn_mp3], check=True)

    mp3 = pydub.AudioSegment.from_mp3(fn_mp3)
    srt = pysrt.open(fn_srt)

    out_dir = f'{tgtDir}/{fnBase}'
    # makedirs also creates the intermediate "{tgtDir}/{fnBase}" directory.
    os.makedirs(f'{out_dir}/data', exist_ok=True)

    limit_ms = 1000 * timeLimit

    # newline='' is required by the csv module; the writer handles quoting and
    # escaping of commas / quotes inside the subtitle text.
    with open(f'{out_dir}/metadata.csv', 'w', encoding='utf8', newline='') as fp:
        writer = csv.writer(fp, lineterminator='\n')
        writer.writerow(['file_name', 'transcription'])

        for k, (start_ms, end_ms, text) in enumerate(_iter_segments(srt, limit_ms)):
            fn = f'{fnBase}_{k:04d}.mp3'
            mp3[start_ms:end_ms].export(f'{out_dir}/data/{fn}')
            writer.writerow([f'data/{fn}', text])
|
| |
|
| |
|
# Build a dataset for every MP3 that sits directly inside srcDir.
cL = glob.glob(f'{srcDir}/*.mp3')
for c in cL:
    # Echo the file currently being processed.
    print(c)
    ryCreateDataset(c, srcDir=srcDir)
|
| |
|