"""Gradio app that machine-translates rows of a French journal dataset.

The app loads the first N rows of the ``tcrouzet/journal-large`` dataset,
translates each row's ``combined`` text with the
``Helsinki-NLP/opus-mt-fr-en`` MarianMT model, and offers the result as a
downloadable CSV file.
"""

import os

import gradio as gr
import pandas as pd
from datasets import load_dataset
from transformers import MarianMTModel, MarianTokenizer

MODEL_NAME = "Helsinki-NLP/opus-mt-fr-en"
DATASET_NAME = "tcrouzet/journal-large"

# Name of the environment variable holding the Hugging Face access token.
HF_TOKEN_ENV_VAR = "may"

# Input text is cut to this many characters before tokenization; anything
# beyond it is NOT translated.
MAX_INPUT_CHARS = 800

# Token limit applied both to the tokenized input and to the generated output.
MAX_TOKENS = 512

OUTPUT_FILE = "/tmp/translated_sample.csv"

# Output CSV column -> dataset field it is copied from.
_FIELD_MAP = {
    "id": "id",
    "title": "Title",
    "subtitle": "Subtitle",
    "date": "Date",
    "location": "Location",
    "tags": "Tags",
    "author": "Author",
}

# Full, ordered list of CSV columns. Passing it explicitly to the DataFrame
# guarantees a header row even when no rows were produced.
_OUTPUT_COLUMNS = [*_FIELD_MAP, "text_fr", "text_en"]

# Loaded once at import time so every request reuses the same weights.
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)


def translate_text(text):
    """Translate a single piece of text with the module-level Marian model.

    Args:
        text: Value to translate. It is converted with ``str()`` and cut to
            ``MAX_INPUT_CHARS`` characters; the tokenizer additionally
            truncates to ``MAX_TOKENS`` tokens.

    Returns:
        The decoded translation, or ``""`` when ``text`` is ``None`` or
        contains only whitespace.
    """
    # Without this guard ``str(None)`` would make the model translate the
    # literal word "None".
    if text is None:
        return ""

    text = str(text)[:MAX_INPUT_CHARS]
    if not text.strip():
        # Nothing to translate; skip the model call entirely.
        return ""

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_TOKENS,
    )
    outputs = model.generate(**inputs, max_length=MAX_TOKENS)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def _parse_row_count(n):
    """Convert the UI value ``n`` to a positive ``int`` or raise ``gr.Error``."""
    if n is None:
        # The number box yields None when the user clears it.
        raise gr.Error("Please enter the number of rows to translate.")
    try:
        count = int(n)
    except (TypeError, ValueError, OverflowError) as exc:
        raise gr.Error("Rows to translate must be a whole number.") from exc
    if count < 1:
        # A negative value would turn the split into ``train[:-k]``, which
        # selects almost the whole dataset instead of a small sample.
        raise gr.Error("Rows to translate must be at least 1.")
    return count


def translate_rows(n):
    """Translate the first ``n`` dataset rows and write them to a CSV file.

    Args:
        n: Number of rows to take from the start of the ``train`` split.
            Must convert to an integer >= 1.

    Returns:
        Path of the CSV file that was written (``OUTPUT_FILE``).

    Raises:
        gr.Error: If ``n`` is missing, not a whole number, or below 1.
    """
    count = _parse_row_count(n)

    dataset = load_dataset(
        DATASET_NAME,
        split=f"train[:{count}]",
        # ``.get`` avoids a bare KeyError when the variable is unset; the
        # loader then receives ``token=None``.
        token=os.environ.get(HF_TOKEN_ENV_VAR),
    )

    rows = []
    for row in dataset:
        text_fr = row["combined"]
        # Deliberate best-effort: one failing row must not abort the export,
        # so the failure is recorded in the output instead of being raised.
        try:
            text_en = translate_text(text_fr)
        except Exception as e:
            text_en = f"TRANSLATION_ERROR: {e}"

        record = {column: row[field] for column, field in _FIELD_MAP.items()}
        record["text_fr"] = text_fr
        record["text_en"] = text_en
        rows.append(record)

    pd.DataFrame(rows, columns=_OUTPUT_COLUMNS).to_csv(OUTPUT_FILE, index=False)
    return OUTPUT_FILE


with gr.Blocks() as demo:
    gr.Markdown("# French Journal Dataset Translator")
    n = gr.Number(value=10, precision=0, label="Rows to translate")
    btn = gr.Button("Translate")
    file_output = gr.File(label="Download translated CSV")
    btn.click(fn=translate_rows, inputs=n, outputs=file_output)


if __name__ == "__main__":
    # Start the web server only when executed as a script, so importing this
    # module does not block on a running server.
    demo.launch()