# 2ndspace / app.py
# sifangchu's picture
# Update app.py
# e45bf79 verified
import os
import gradio as gr
import pandas as pd
from datasets import load_dataset
from transformers import MarianMTModel, MarianTokenizer
# Hub identifier of the MarianMT checkpoint loaded below (by its name, a
# French -> English OPUS-MT model).
MODEL_NAME = "Helsinki-NLP/opus-mt-fr-en"
# Tokenizer and model are created once at import time; translate_text reads
# these module-level names on every call.
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)
def translate_text(text, *, max_chars=800):
    """Translate text with the module-level Marian ``tokenizer`` and ``model``.

    Args:
        text: Value to translate. Non-string values are converted with
            ``str``; ``None`` is treated as "nothing to translate".
        max_chars: Number of leading characters of the input kept before
            tokenization. Defaults to 800.

    Returns:
        The decoded translation, or ``""`` when the input is ``None``,
        empty, or whitespace-only.
    """
    if text is None:
        # str(None) would otherwise send the literal word "None" to the model.
        return ""
    snippet = str(text)[:max_chars]
    if not snippet.strip():
        # Nothing to translate: skip the model call instead of decoding
        # whatever it would generate for an empty prompt.
        return ""
    # On top of the character cap, the tokenizer truncates to 512 tokens.
    inputs = tokenizer(snippet, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(**inputs, max_length=512)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
def translate_rows(n):
    """Translate the first ``n`` dataset rows and write them to a CSV file.

    Loads ``train[:n]`` of ``tcrouzet/journal-large``, translates each row's
    ``combined`` field with ``translate_text``, and writes the metadata
    columns plus the source and translated text to
    ``/tmp/translated_sample.csv``.

    Args:
        n: Number of rows to translate. Any value ``int()`` turns into a
            positive integer is accepted (floats are truncated).

    Returns:
        Path of the written CSV file.

    Raises:
        ValueError: If ``n`` cannot be converted to a positive integer.
    """
    try:
        row_count = int(n)
    except (TypeError, ValueError, OverflowError) as exc:
        raise ValueError(
            f"Rows to translate must be a whole number, got {n!r}"
        ) from exc
    if row_count < 1:
        # A negative bound would turn the split into "train[:-k]", i.e.
        # nearly the whole dataset; zero would produce an empty export.
        raise ValueError(f"Rows to translate must be at least 1, got {row_count}")

    dataset = load_dataset(
        "tcrouzet/journal-large",
        split=f"train[:{row_count}]",
        # "may" is the environment variable read for the hub token. When it
        # is unset, pass None (the library default) instead of failing with
        # a KeyError before the download is even attempted.
        token=os.environ.get("may"),
    )

    # (CSV column, dataset field) pairs, in output order.
    metadata_fields = (
        ("id", "id"),
        ("title", "Title"),
        ("subtitle", "Subtitle"),
        ("date", "Date"),
        ("location", "Location"),
        ("tags", "Tags"),
        ("author", "Author"),
    )

    rows = []
    for row in dataset:
        text_fr = row["combined"]
        try:
            text_en = translate_text(text_fr)
        except Exception as e:
            # Best effort: keep the row and record the failure in the output
            # rather than aborting the whole export.
            text_en = f"TRANSLATION_ERROR: {e}"
        record = {column: row[field] for column, field in metadata_fields}
        record["text_fr"] = text_fr
        record["text_en"] = text_en
        rows.append(record)

    output_file = "/tmp/translated_sample.csv"
    # Explicit columns keep the header row even if no rows were collected.
    columns = [column for column, _ in metadata_fields] + ["text_fr", "text_en"]
    pd.DataFrame(rows, columns=columns).to_csv(output_file, index=False)
    return output_file
# Gradio UI: a row-count input, a trigger button, and a download slot for the
# CSV produced by translate_rows.
demo = gr.Blocks()
with demo:
    gr.Markdown("# French Journal Dataset Translator")
    n = gr.Number(
        value=10,
        precision=0,
        label="Rows to translate",
    )
    btn = gr.Button("Translate")
    file_output = gr.File(label="Download translated CSV")
    # Clicking the button passes the number to translate_rows and hands the
    # returned file path to the File component.
    btn.click(
        fn=translate_rows,
        inputs=n,
        outputs=file_output,
    )

demo.launch()