sifangchu committed on
Commit
e45bf79
·
verified ·
1 Parent(s): 627ef7e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -9
app.py CHANGED
@@ -1,21 +1,58 @@
1
  import os
2
  import gradio as gr
 
3
  from datasets import load_dataset
 
4
 
5
def load_and_show():
    """Return a printable summary of the first row of the journal dataset.

    Loads only ``train[:1]`` of ``tcrouzet/journal-large`` using the Hub
    token stored in the ``may`` environment variable, then returns the
    ``str()`` of a dict holding the column names and the first row.

    Raises:
        KeyError: If the ``may`` environment variable is not set.
    """
    sample = load_dataset(
        "tcrouzet/journal-large",
        split="train[:1]",
        token=os.environ["may"],
    )
    summary = {
        "columns": sample.column_names,
        "first_row": sample[0],
    }
    return str(summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  with gr.Blocks() as demo:
17
- gr.Markdown("# Dataset Debug")
18
- output = gr.Textbox()
19
- demo.load(fn=load_and_show, inputs=None, outputs=output)
 
 
 
20
 
21
  demo.launch()
 
1
  import os
2
  import gradio as gr
3
+ import pandas as pd
4
  from datasets import load_dataset
5
+ from transformers import MarianMTModel, MarianTokenizer
6
 
7
# Hub identifier of the MarianMT checkpoint used for translation
# (the name indicates French -> English).
MODEL_NAME = "Helsinki-NLP/opus-mt-fr-en"

# Tokenizer and model are loaded once at import time and shared by every
# call to translate_text.
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)
11
+
12
def translate_text(text, max_chars=800, max_length=512):
    """Translate one piece of text with the module-level MarianMT model.

    Args:
        text: Value to translate. It is converted with ``str()``; ``None``
            and blank strings mean "nothing to translate".
        max_chars: Character cap applied to the text before tokenization.
        max_length: Token cap used both when tokenizing the input (with
            truncation) and when generating the output.

    Returns:
        The decoded first generated sequence with special tokens removed,
        or ``""`` when there is no text to translate.
    """
    # Without this guard a missing value would be stringified to the literal
    # "None" and sent through the model.
    if text is None:
        return ""

    snippet = str(text)[:max_chars]
    # Blank input carries nothing to translate; skip the model call.
    if not snippet.strip():
        return ""

    inputs = tokenizer(
        snippet,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
17
+
18
def translate_rows(n):
    """Translate the first *n* dataset rows and export them as a CSV file.

    Loads ``train[:n]`` of ``tcrouzet/journal-large`` with the Hub token
    from the ``may`` environment variable, translates each row's
    ``combined`` text with ``translate_text``, and writes the metadata plus
    the original and translated text to ``/tmp/translated_sample.csv``.

    Args:
        n: Number of rows to take from the start of the ``train`` split.
            Anything ``int()`` accepts; must not be negative.

    Returns:
        Path of the written CSV file.

    Raises:
        ValueError: If *n* is missing, not convertible to an integer, or
            negative.
        KeyError: If the ``may`` environment variable is not set, or a row
            lacks one of the expected columns.
    """
    # Validate before any download: a cleared number box arrives as None.
    try:
        row_count = int(n)
    except (TypeError, ValueError, OverflowError) as err:
        raise ValueError(
            f"Rows to translate must be a whole number, got {n!r}"
        ) from err
    # A negative bound would produce a split such as "train[:-5]", which
    # presumably selects everything except the last rows rather than nothing.
    if row_count < 0:
        raise ValueError(
            f"Rows to translate must not be negative, got {row_count}"
        )

    dataset = load_dataset(
        "tcrouzet/journal-large",
        split=f"train[:{row_count}]",
        token=os.environ["may"],
    )

    # CSV column name -> dataset column name, in output order.
    metadata_columns = {
        "id": "id",
        "title": "Title",
        "subtitle": "Subtitle",
        "date": "Date",
        "location": "Location",
        "tags": "Tags",
        "author": "Author",
    }

    rows = []
    for row in dataset:
        text_fr = row["combined"]

        # Best effort: one failing row must not abort the export, so the
        # error text is stored in place of the translation.
        try:
            text_en = translate_text(text_fr)
        except Exception as e:
            text_en = f"TRANSLATION_ERROR: {e}"

        record = {
            csv_name: row[source_name]
            for csv_name, source_name in metadata_columns.items()
        }
        record["text_fr"] = text_fr
        record["text_en"] = text_en
        rows.append(record)

    output_file = "/tmp/translated_sample.csv"
    # Explicit columns keep the header row even when no rows were selected.
    csv_columns = [*metadata_columns, "text_fr", "text_en"]
    pd.DataFrame(rows, columns=csv_columns).to_csv(output_file, index=False)
    return output_file
49
 
50
  with gr.Blocks() as demo:
51
+ gr.Markdown("# French Journal Dataset Translator")
52
+ n = gr.Number(value=10, precision=0, label="Rows to translate")
53
+ btn = gr.Button("Translate")
54
+ file_output = gr.File(label="Download translated CSV")
55
+
56
+ btn.click(fn=translate_rows, inputs=n, outputs=file_output)
57
 
58
  demo.launch()