gagannarula committed on
Commit
be2d363
·
verified ·
1 Parent(s): a0a0836

module for storing data

Browse files
Files changed (1) hide show
  1. data_store.py +53 -0
data_store.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import uuid
4
+ import json
5
+ from huggingface_hub import HfApi, HfFileSystem
6
+
7
+ DATASET_REPO = "EarthSpeciesProject/naturelm-audio-space-logs"  # dataset repo that upload_data writes audio and metadata into
8
+ SPLIT = "test"  # top-level folder in the repo; audio goes under <SPLIT>/audio/ and records into <SPLIT>/metadata.jsonl
9
+ TESTING = os.getenv("TESTING", "0") == "1"  # True only when the TESTING env var is exactly "1"; upload_data then prefixes ids with "test-"
10
+ api = HfApi()  # client used by upload_data for the audio file upload
11
+ # Filesystem-style handle on the Hub. upload_data uses it to check whether
12
+ # the split's metadata.jsonl exists, and to read and rewrite that file.
13
+ hf_fs = HfFileSystem()
14
+
15
+
16
def upload_data(audio: str | Path, user_text: str, model_response: str) -> None:
    """Upload one interaction (audio file plus text) to the logging dataset repo.

    The audio file is uploaded to ``<SPLIT>/audio/<id><suffix>`` in
    ``DATASET_REPO``, and a JSON record describing the interaction is added
    as a new line of ``<SPLIT>/metadata.jsonl`` in the same repo.

    Args:
        audio: Local path of the audio file to upload.
        user_text: Text stored under the ``"user_message"`` key.
        model_response: Text stored under the ``"model_response"`` key.

    Note:
        The metadata file is read in full and rewritten on every call, so
        two overlapping calls can overwrite each other's record.
    """
    data_id = str(uuid.uuid4())
    if TESTING:
        # Mark records created during test runs so they can be told apart.
        data_id = "test-" + data_id

    # Audio path in repo; keep the original extension.
    suffix = Path(audio).suffix
    audio_p = f"{SPLIT}/audio/" + data_id + suffix

    api.upload_file(
        path_or_fileobj=str(audio),
        path_in_repo=audio_p,
        repo_id=DATASET_REPO,
        repo_type="dataset",
    )

    text = {
        "user_message": user_text,
        "model_response": model_response,
        "file_name": audio_p,
        "original_file_name": os.path.basename(audio),
        "id": data_id,
    }

    metadata_path = f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl"

    # Appending in place does not work on this filesystem (per the original
    # author's note), so the existing lines are read back and the whole file
    # is rewritten with the new record at the end.
    lines: list[str] = []
    if hf_fs.exists(metadata_path):
        with hf_fs.open(metadata_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        # If the stored file does not end with a newline, terminate its last
        # record so the new one is not glued onto it (invalid JSONL).
        if lines and not lines[-1].endswith("\n"):
            lines[-1] += "\n"

    lines.append(json.dumps(text) + "\n")
    with hf_fs.open(metadata_path, "w", encoding="utf-8") as f:
        f.writelines(lines)
51
+ # Write a separate file instead
52
+ # with hf_fs.open(f"datasets/{DATASET_REPO}/{data_id}.json", "w") as f:
53
+ # json.dump(text, f)