# ForeignWhispersNYU / text2text.py
# author: hammamiomar
# commit: "add all" (da9fc8c)
import os
from functools import lru_cache

from transformers import T5Tokenizer, T5ForConditionalGeneration
def unpackTsv(tsvFile):
    """Read a TSV file and return its rows as lists of column strings.

    Each line is stripped of surrounding whitespace (including the trailing
    newline) and split on tabs. The header row, if present, comes back as
    element 0; callers index columns positionally.
    """
    # Force UTF-8 so parsing does not depend on the platform's locale encoding.
    with open(tsvFile, 'r', encoding='utf-8') as f:
        return [line.strip().split('\t') for line in f]
@lru_cache(maxsize=None)
def _loadT5():
    """Load and cache the t5-small tokenizer/model pair.

    from_pretrained is expensive (disk + possibly network); caching means the
    cost is paid once per process instead of once per translateSentences call.
    """
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    return tokenizer, model

def translateSentences(sentences, languageSecond="German", languageFirst="English"):
    """Translate a batch of sentences with t5-small.

    Args:
        sentences: iterable of source-language strings.
        languageSecond: target language name used in the T5 task prefix.
        languageFirst: source language name used in the T5 task prefix.

    Returns:
        List of translated strings, one per input sentence (empty list for
        empty input — the tokenizer cannot batch zero sentences).
    """
    sentences = list(sentences)
    if not sentences:
        return []
    tokenizer, model = _loadT5()
    task_prefix = f"translate {languageFirst} to {languageSecond}: "
    # Batch-tokenize with padding so variable-length sentences share one tensor.
    inputs = tokenizer(
        [task_prefix + sentence for sentence in sentences],
        return_tensors="pt",
        padding=True,
    )
    output_sequences = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        do_sample=False,  # greedy decoding: deterministic output regardless of batching
    )
    return tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
def translateTsv(tsvFile, languageSecond="German", languageFirst="English"):
    """Translate column 2 of every data row in a TSV file.

    Row 0 is treated as a header and left untouched; each subsequent row has
    its third field replaced by its translation. Returns the full row list,
    header included.
    """
    rows = unpackTsv(tsvFile)
    translated = translateSentences([row[2] for row in rows[1:]], languageSecond, languageFirst)
    # Overwrite each data row's text column with its translation, in order.
    for row, newText in zip(rows[1:], translated):
        row[2] = newText
    return rows
def writeTsv(lines, outputFile):
    """Write rows (lists of column strings) to *outputFile* as tab-separated lines."""
    with open(outputFile, 'w') as f:
        f.writelines('\t'.join(row) + '\n' for row in lines)
def translateTsvFile(tsvFile, outputFile, languageSecond="German", languageFirst="English"):
    """Translate one TSV file and write the result into directory *outputFile*.

    The output is named '<languageSecond>_<original basename>' so translated
    copies never collide with their sources.
    """
    translatedRows = translateTsv(tsvFile, languageSecond, languageFirst)
    destName = f"{languageSecond}_{os.path.basename(tsvFile)}"
    writeTsv(translatedRows, os.path.join(outputFile, destName))
def translateTsvFolder(tsvFolder, outputFolder, languageSecond="German", languageFirst="English"):
    """Translate every TSV file in *tsvFolder*, writing results into *outputFolder*.

    Creates *outputFolder* if needed. Each output file is named
    '<languageSecond>_<original basename>' (see translateTsvFile).
    """
    os.makedirs(outputFolder, exist_ok=True)
    for entry in os.listdir(tsvFolder):
        filepath = os.path.join(tsvFolder, entry)
        # os.listdir also yields subdirectories and other non-regular entries;
        # opening those would raise, so process regular files only.
        if not os.path.isfile(filepath):
            continue
        translateTsvFile(filepath, outputFolder, languageSecond, languageFirst)