| import os |
| import gradio as gr |
| import pandas as pd |
| from datasets import load_dataset |
| from transformers import MarianMTModel, MarianTokenizer |
|
|
# Hugging Face checkpoint used for translation (French -> English, going by
# the checkpoint name and the text_fr/text_en columns produced below).
MODEL_NAME = "Helsinki-NLP/opus-mt-fr-en"

# Both objects are created once, at import time, and shared by every request.
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)
|
|
def translate_text(text, max_chars=800):
    """Translate one piece of text with the module-level MarianMT model.

    Args:
        text: Value to translate. Non-string values are converted with
            ``str()``; ``None`` is treated as missing text.
        max_chars: Number of leading characters of ``text`` that are sent to
            the model; the remainder is dropped. Defaults to 800.

    Returns:
        The decoded translation, or ``""`` when there is nothing to translate
        (``None``, empty, or whitespace-only input).
    """
    # str(None) would otherwise send the literal word "None" to the model.
    if text is None:
        return ""

    # Character cap applied before tokenization; the tokenizer call below
    # additionally truncates to 512 tokens.
    source = str(text)[:max_chars]
    if not source.strip():
        return ""

    inputs = tokenizer(source, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(**inputs, max_length=512)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
|
# Column order of the exported CSV; also used to write a header when no rows
# were translated.
_CSV_COLUMNS = [
    "id",
    "title",
    "subtitle",
    "date",
    "location",
    "tags",
    "author",
    "text_fr",
    "text_en",
]


def _translated_record(row):
    """Build one CSV record from a dataset row, translating its ``combined`` text."""
    text_fr = row["combined"]

    # Deliberately best-effort: a failure on one row is recorded in its
    # text_en cell so the remaining rows are still exported.
    try:
        text_en = translate_text(text_fr)
    except Exception as e:
        text_en = f"TRANSLATION_ERROR: {e}"

    return {
        "id": row["id"],
        "title": row["Title"],
        "subtitle": row["Subtitle"],
        "date": row["Date"],
        "location": row["Location"],
        "tags": row["Tags"],
        "author": row["Author"],
        "text_fr": text_fr,
        "text_en": text_en,
    }


def translate_rows(n):
    """Translate the first ``n`` rows of the journal dataset and export a CSV.

    Args:
        n: Number of rows to take from the start of the ``train`` split.
            Converted with ``int()``.

    Returns:
        Path of the written CSV file (``/tmp/translated_sample.csv``). The
        file always contains the header row, even when ``n`` is 0.

    Raises:
        gr.Error: If ``n`` is missing or negative. A negative value would
            otherwise be interpreted as the slice ``train[:-k]`` and select
            almost the whole dataset.
        KeyError: If the ``may`` environment variable is not set (only when
            ``n`` is greater than 0).
    """
    if n is None or int(n) < 0:
        raise gr.Error("Rows to translate must be a non-negative whole number.")
    row_count = int(n)

    rows = []
    # Nothing to download or translate when zero rows are requested.
    if row_count > 0:
        dataset = load_dataset(
            "tcrouzet/journal-large",
            split=f"train[:{row_count}]",
            # NOTE(review): the access token is read from an environment
            # variable literally named "may" - confirm this is the intended name.
            token=os.environ["may"],
        )
        rows = [_translated_record(row) for row in dataset]

    output_file = "/tmp/translated_sample.csv"
    # Explicit columns keep the header present for an empty result.
    pd.DataFrame(rows, columns=_CSV_COLUMNS).to_csv(output_file, index=False)
    return output_file
|
|
# Gradio UI: a row-count field, a button that runs translate_rows, and a file
# component that receives the CSV path returned by translate_rows.
with gr.Blocks() as demo:
    gr.Markdown("# French Journal Dataset Translator")

    n = gr.Number(label="Rows to translate", value=10, precision=0)
    btn = gr.Button("Translate")
    file_output = gr.File(label="Download translated CSV")

    # Event listeners are registered inside the Blocks context.
    btn.click(translate_rows, inputs=n, outputs=file_output)

# Start the app as soon as this module is executed.
demo.launch()