Spaces:

ArmelRandy
/

MT

Runtime error

ArmelRandy committed on Dec 12, 2023

Commit

b3d1867

1 Parent(s): 320fb86

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,14 +3,19 @@ import json
 import shutil
 import gradio as gr
 from datasets import load_dataset
-from huggingface_hub import Repository
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 def load_data():
     """Load the "ArmelR/oasst1_guanaco_english" dataset.

     Calls ``load_dataset`` with the module-level ``HF_TOKEN`` passed as
     ``use_auth_token`` (``HF_TOKEN`` is read from the environment and is
     ``None`` when the variable is unset).

     Returns:
         The object returned by ``load_dataset``; the module later calls
         ``.keys()`` on it to enumerate the available splits.
     """
     dataset = load_dataset("ArmelR/oasst1_guanaco_english", use_auth_token=HF_TOKEN)
     return dataset
 samples = load_data()
 splits = list(samples.keys())
 languages = ["Wolof"]
@@ -36,19 +41,24 @@ def identity(index, split):
     return ds["prompt"], ds["completion"]
 def save(index, language, split, prompt, completion):
     if len(prompt) != 0 and len(completion) != 0 :
         print("Saving ...")
-        with open("/home/user/app/output.jsonl", "a") as fout :
-            fout.write(
-                json.dumps(
-                    {
-                        "prompt" : prompt,
-                        "completion" : completion,
-                        "language" : language,
-                        "index" : index
-                    }
-                )+"\n"
-            )
         next_index = min(1+index, len(samples[split])-1)
         return next_index, samples[split][next_index]["prompt"], samples[split][next_index]["completion"], "", ""
     else :

 import shutil
 import gradio as gr
 from datasets import load_dataset
+from huggingface_hub import upload_file
+from io import StringIO
+import pandas as pd
+import datetime
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
+DIALOGUES_DATASET = "ArmelRandy/MT_dialogues"
 def load_data():
     """Load the "ArmelR/oasst1_guanaco_english" dataset.

     Calls ``load_dataset`` with the module-level ``HF_TOKEN`` passed as
     ``use_auth_token`` (``HF_TOKEN`` is read from the environment and is
     ``None`` when the variable is unset).

     Returns:
         The object returned by ``load_dataset``; the module later calls
         ``.keys()`` on it to enumerate the available splits.
     """
     dataset = load_dataset("ArmelR/oasst1_guanaco_english", use_auth_token=HF_TOKEN)
     return dataset
 samples = load_data()
 splits = list(samples.keys())
 languages = ["Wolof"]
     return ds["prompt"], ds["completion"]
 def save(index, language, split, prompt, completion):
+    buffer = StringIO()
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
+    file_name = f"prompts_{timestamp}.jsonl"
     if len(prompt) != 0 and len(completion) != 0 :
         print("Saving ...")
+        data = {"prompt": prompt, "completion": completion, "language": language, "index": index}
+        pd.DataFrame([data]).to_json(buffer, orient="records", lines=True)
+        # Push to Hub
+        upload_file(
+            path_in_repo=f"{now.date()}/{now.hour}/{file_name}",
+            path_or_fileobj=buffer.getvalue().encode(),
+            repo_id=DIALOGUES_DATASET,
+            token=HF_TOKEN,
+            repo_type="dataset",
+        )
+        # Clean and rerun
+        buffer.close()
         next_index = min(1+index, len(samples[split])-1)
         return next_index, samples[split][next_index]["prompt"], samples[split][next_index]["completion"], "", ""
     else :