Spaces:
Runtime error
Runtime error
| import os | |
| from transformers import T5Tokenizer, T5ForConditionalGeneration | |
def unpackTsv(tsvFile):
    """Read a tab-separated file into a list of rows.

    Args:
        tsvFile: Path of the TSV file to read (decoded as UTF-8).

    Returns:
        A list with one entry per line of the file; each entry is the list
        of that line's tab-separated cells, as strings.
    """
    with open(tsvFile, 'r', encoding='utf-8') as f:
        # Remove only the line terminator. A bare strip() would also eat
        # leading/trailing tabs and thereby drop empty first/last cells,
        # leaving that row with fewer columns than the others.
        return [line.rstrip('\r\n').split('\t') for line in f]
# Tokenizer/model pairs already loaded in this process, keyed by checkpoint name.
_T5_CACHE = {}


def _loadT5(checkpoint="t5-small"):
    """Return the (tokenizer, model) pair for *checkpoint*, loading it at most once."""
    if checkpoint not in _T5_CACHE:
        _T5_CACHE[checkpoint] = (
            T5Tokenizer.from_pretrained(checkpoint),
            T5ForConditionalGeneration.from_pretrained(checkpoint),
        )
    return _T5_CACHE[checkpoint]


def translateSentences(sentences, languageSecond="German", languageFirst="English"):
    """Translate a batch of sentences with the "t5-small" checkpoint.

    Args:
        sentences: Iterable of strings to translate.
        languageSecond: Target language name inserted into the task prefix.
        languageFirst: Source language name inserted into the task prefix.

    Returns:
        A list of decoded output strings, one per input sentence, in the
        same order. An empty input yields an empty list.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to translate: skip loading the model and running generation.
        return []

    # Reuse the already-loaded pair; callers invoke this once per file, and
    # reloading the checkpoint each time is pure overhead.
    tokenizer, model = _loadT5()

    task_prefix = f"translate {languageFirst} to {languageSecond}: "
    # padding=True pads every prompt to a common length so that sentences of
    # different lengths can be processed as a single batch.
    inputs = tokenizer(
        [task_prefix + sentence for sentence in sentences],
        return_tensors="pt",
        padding=True,
    )
    # NOTE(review): no explicit length limit is passed, so the library's
    # default generation length applies -- long sentences may come back
    # truncated; confirm and pass a limit if needed.
    output_sequences = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        do_sample=False,  # no sampling: always decode deterministically
    )
    return tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
def translateTsv(tsvFile, languageSecond="German", languageFirst="English", textColumn=2):
    """Translate one column of a TSV file and return the updated rows.

    The first row is treated as a header and is returned unchanged. Rows
    that have no cell at ``textColumn`` (for example blank lines) are also
    returned unchanged instead of aborting the whole file.

    Args:
        tsvFile: Path of the TSV file to read.
        languageSecond: Target language name.
        languageFirst: Source language name.
        textColumn: Zero-based index of the column to translate; defaults
            to the third column.

    Returns:
        The rows of the file (lists of cell strings) with the selected
        column replaced by its translation.
    """
    lines = unpackTsv(tsvFile)
    # These are the same list objects that live in `lines`, so assigning
    # into them below updates the returned table in place.
    translatable = [row for row in lines[1:] if len(row) > textColumn]
    if not translatable:
        # Header-only or empty file: nothing to send to the translator.
        return lines

    translatedSentences = translateSentences(
        [row[textColumn] for row in translatable], languageSecond, languageFirst
    )
    for row, sentence in zip(translatable, translatedSentences):
        row[textColumn] = sentence
    return lines
def writeTsv(lines, outputFile):
    """Write rows to a tab-separated file, one row per line.

    Args:
        lines: Iterable of rows; each row is an iterable of strings.
        outputFile: Path of the file to create or overwrite.
    """
    # Explicit UTF-8: translated text routinely contains non-ASCII
    # characters, which a locale-dependent default encoding may not be able
    # to represent.
    with open(outputFile, 'w', encoding='utf-8') as f:
        f.writelines('\t'.join(row) + '\n' for row in lines)
def translateTsvFile(tsvFile, outputFile, languageSecond="German", languageFirst="English"):
    """Translate a single TSV file and write the result into a directory.

    The output is named ``<languageSecond>_<basename of tsvFile>``.

    Args:
        tsvFile: Path of the TSV file to translate.
        outputFile: Despite its name, the *directory* that receives the
            translated file; it is created if it does not exist.
        languageSecond: Target language name; also used as the output
            file-name prefix.
        languageFirst: Source language name.
    """
    lines = translateTsv(tsvFile, languageSecond, languageFirst)
    if outputFile:
        # An empty string means "current directory", which needs no creation
        # (and os.makedirs("") would raise).
        os.makedirs(outputFile, exist_ok=True)
    outputFilePath = os.path.join(outputFile, f"{languageSecond}_{os.path.basename(tsvFile)}")
    writeTsv(lines, outputFilePath)
def translateTsvFolder(tsvFolder, outputFolder, languageSecond="German", languageFirst="English"):
    """Translate every file directly inside a folder.

    Args:
        tsvFolder: Directory whose files are translated (not recursive).
        outputFolder: Directory that receives the translated files; it is
            created if it does not exist.
        languageSecond: Target language name.
        languageFirst: Source language name.
    """
    os.makedirs(outputFolder, exist_ok=True)
    # Sorted so the processing order does not depend on the file system.
    for name in sorted(os.listdir(tsvFolder)):
        filepath = os.path.join(tsvFolder, name)
        if not os.path.isfile(filepath):
            # Sub-directories cannot be opened as TSV files. This also covers
            # outputFolder itself when it is located inside tsvFolder.
            continue
        translateTsvFile(filepath, outputFolder, languageSecond, languageFirst)