import os

# Lazy-initialized (tokenizer, model) pair: the t5-small checkpoint is large,
# so it is loaded at most once per process instead of on every
# translateSentences() call (the original reloaded it each time).
_T5_CACHE = {}


def _getT5():
    """Return a memoized (tokenizer, model) pair for t5-small.

    The transformers import is deferred to here so this module can still be
    imported (e.g. to use the pure TSV helpers) when transformers is not
    installed or not needed.
    """
    if "t5" not in _T5_CACHE:
        from transformers import T5Tokenizer, T5ForConditionalGeneration
        tokenizer = T5Tokenizer.from_pretrained("t5-small")
        model = T5ForConditionalGeneration.from_pretrained("t5-small")
        _T5_CACHE["t5"] = (tokenizer, model)
    return _T5_CACHE["t5"]


def unpackTsv(tsvFile):
    """Read a TSV file and return a list of rows, each a list of cell strings.

    Uses splitlines() (handles both \\n and \\r\\n) and splits on tabs only,
    so empty leading/trailing columns are preserved — the original strip()
    silently dropped an empty last column.
    """
    with open(tsvFile, "r", encoding="utf-8") as f:
        return [row.split("\t") for row in f.read().splitlines()]


def translateSentences(sentences, languageSecond="German", languageFirst="English"):
    """Translate a batch of sentences with t5-small.

    Args:
        sentences: list of source-language strings (may be empty).
        languageSecond: target language name used in the T5 task prefix.
        languageFirst: source language name used in the T5 task prefix.

    Returns:
        List of translated strings, parallel to ``sentences``.
    """
    if not sentences:
        return []  # tokenizer raises on an empty batch; nothing to do anyway
    tokenizer, model = _getT5()
    task_prefix = f"translate {languageFirst} to {languageSecond}: "
    # use different length sentences to test batching
    inputs = tokenizer(
        [task_prefix + sentence for sentence in sentences],
        return_tensors="pt",
        padding=True,
    )
    output_sequences = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        do_sample=False,  # disable sampling to test if batching affects output
    )
    return tokenizer.batch_decode(output_sequences, skip_special_tokens=True)


def translateTsv(tsvFile, languageSecond="German", languageFirst="English"):
    """Translate column index 2 of every data row of a TSV file.

    The first row is treated as a header and left untouched.  Returns the
    full table (header + rows) with column 2 replaced by its translation.
    """
    lines = unpackTsv(tsvFile)
    sentences = [row[2] for row in lines[1:]]
    translated = translateSentences(sentences, languageSecond, languageFirst)
    # zip pairs each data row with its translation; safer than index math.
    for row, text in zip(lines[1:], translated):
        row[2] = text
    return lines


def writeTsv(lines, outputFile):
    """Write rows (lists of cell strings) to ``outputFile`` as TSV."""
    with open(outputFile, "w", encoding="utf-8") as f:
        f.writelines("\t".join(row) + "\n" for row in lines)


def translateTsvFile(tsvFile, outputFile, languageSecond="German", languageFirst="English"):
    """Translate one TSV file and write it into the ``outputFile`` directory.

    The output name is prefixed with the target language, e.g.
    ``German_data.tsv``.
    """
    lines = translateTsv(tsvFile, languageSecond, languageFirst)
    outputFilePath = os.path.join(
        outputFile, f"{languageSecond}_{os.path.basename(tsvFile)}"
    )
    writeTsv(lines, outputFilePath)


def translateTsvFolder(tsvFolder, outputFolder, languageSecond="German", languageFirst="English"):
    """Translate every TSV file in ``tsvFolder`` into ``outputFolder``.

    Creates the output folder if needed.  Subdirectories (which would have
    crashed the original on open()) are skipped.
    """
    os.makedirs(outputFolder, exist_ok=True)
    for file in os.listdir(tsvFolder):
        filepath = os.path.join(tsvFolder, file)
        if not os.path.isfile(filepath):
            continue  # skip directories and other non-file entries
        translateTsvFile(filepath, outputFolder, languageSecond, languageFirst)