Juna190825 committed on
Commit
dc3800a
·
verified ·
1 Parent(s): f7dbb6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -53
app.py CHANGED
@@ -1,36 +1,9 @@
1
  # # app.py
2
- # import os
3
- # import gradio as gr
4
- # from huggingface_hub import HfApi
5
-
6
- # api = HfApi()
7
-
8
- # def list_dataset_files(repo_id: str):
9
- # # repo_id example: "username/dataset_name"
10
- # try:
11
- # files = api.list_repo_files(
12
- # repo_id=repo_id,
13
- # repo_type="dataset",
14
- # token=os.getenv("HF_TOKEN") # None for public datasets
15
- # )
16
- # if not files:
17
- # return "No files found."
18
- # return "\n".join(files)
19
- # except Exception as e:
20
- # return f"Error: {e}"
21
-
22
- # with gr.Blocks() as demo:
23
- # gr.Markdown("# List files in a Hugging Face dataset repo")
24
- # repo_input = gr.Textbox(label="Dataset repo_id", value="Juna190825/ZomiAudioDataset")
25
- # output = gr.Textbox(label="Files", lines=20)
26
- # btn = gr.Button("List files")
27
-
28
- # btn.click(list_dataset_files, inputs=repo_input, outputs=output)
29
-
30
- # demo.launch()
31
-
32
-
33
  import os
 
 
 
 
34
  import gradio as gr
35
  from huggingface_hub import HfApi, hf_hub_download
36
 
@@ -48,27 +21,68 @@ def list_dataset_files(repo_id: str):
48
  if not files:
49
  return [], "No files found."
50
 
51
- # Filter audio files
52
  audio_files = [f for f in files if os.path.splitext(f)[1].lower() in AUDIO_EXTS]
53
-
54
  return audio_files, "\n".join(files)
55
 
56
  except Exception as e:
57
  return [], f"Error: {e}"
58
 
59
- # def load_audio(repo_id, file_path):
60
- # try:
61
- # local_path = hf_hub_download(
62
- # repo_id=repo_id,
63
- # filename=file_path,
64
- # repo_type="dataset",
65
- # token=os.getenv("HF_TOKEN")
66
- # )
67
- # return local_path
68
- # except Exception as e:
69
- # return None
70
- import shutil
71
- import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  def load_audio(repo_id, file_path):
74
  try:
@@ -79,19 +93,51 @@ def load_audio(repo_id, file_path):
79
  token=os.getenv("HF_TOKEN")
80
  )
81
 
82
- # Copy to a temp file Gradio is allowed to serve
83
  tmp_dir = tempfile.gettempdir()
84
  safe_path = os.path.join(tmp_dir, os.path.basename(local_path))
85
  shutil.copy(local_path, safe_path)
86
 
87
  return safe_path
88
 
89
- except Exception as e:
90
  return None
91
 
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  with gr.Blocks() as demo:
94
- gr.Markdown("# 🎧 List & Play Audio Files from a HF Dataset Repo")
95
 
96
  repo_input = gr.Textbox(label="Dataset repo_id", value="username/dataset_name")
97
 
@@ -99,21 +145,21 @@ with gr.Blocks() as demo:
99
  file_list = gr.Dropdown(label="Audio files", choices=[])
100
  play_audio = gr.Audio(label="Audio Player")
101
 
 
102
  output = gr.Textbox(label="All Files", lines=20)
 
103
  btn = gr.Button("List files")
104
 
105
- # When clicking "List files"
106
  def update_files(repo_id):
107
  audio_files, all_files_text = list_dataset_files(repo_id)
108
  return gr.Dropdown(choices=audio_files, value=None), all_files_text
109
 
110
  btn.click(update_files, inputs=repo_input, outputs=[file_list, output])
111
 
112
- # When selecting an audio file
113
  file_list.change(
114
- load_audio,
115
  inputs=[repo_input, file_list],
116
- outputs=play_audio
117
  )
118
 
119
  demo.launch()
 
1
  # # app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
+ import json
4
+ import csv
5
+ import tempfile
6
+ import shutil
7
  import gradio as gr
8
  from huggingface_hub import HfApi, hf_hub_download
9
 
 
21
  if not files:
22
  return [], "No files found."
23
 
 
24
  audio_files = [f for f in files if os.path.splitext(f)[1].lower() in AUDIO_EXTS]
 
25
  return audio_files, "\n".join(files)
26
 
27
  except Exception as e:
28
  return [], f"Error: {e}"
29
 
30
+
31
+ # -----------------------------
32
+ # METADATA LOADING
33
+ # -----------------------------
34
+
35
def load_metadata(repo_id):
    """Download and parse the dataset's metadata files, if they exist.

    Args:
        repo_id: Hugging Face dataset repo id, e.g. "username/dataset_name".

    Returns:
        A dict mapping each record's audio reference (the value of its
        "audio", "file" or "file_name" field) to the full record. The dict
        is empty when no metadata file could be downloaded or parsed.
    """
    metadata = {}

    # Possible metadata files
    candidates = ["metadata.jsonl"]  # , "metadata.json", "metadata.csv"]

    for fname in candidates:
        try:
            path = hf_hub_download(
                repo_id=repo_id,
                filename=fname,
                repo_type="dataset",
                token=os.getenv("HF_TOKEN"),
            )
        except Exception:
            # Best effort: the file is optional, so a missing file, a missing
            # repo or a network failure just means "no metadata from here".
            continue

        ext = os.path.splitext(fname)[1]
        records = []

        try:
            if ext == ".jsonl":
                with open(path, "r", encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            # Blank lines are common in hand-edited JSONL.
                            continue
                        try:
                            records.append(json.loads(line))
                        except ValueError:
                            # One malformed line must not discard the
                            # records that follow it.
                            continue

            elif ext == ".json":
                with open(path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                # Accept either a list of records or a single record.
                records = data if isinstance(data, list) else [data]

            elif ext == ".csv":
                # newline="" is what the csv module expects so that quoted
                # fields containing line breaks are parsed correctly.
                with open(path, "r", encoding="utf-8", newline="") as f:
                    records = list(csv.DictReader(f))

        except (OSError, ValueError, csv.Error):
            # Unreadable or undecodable file: skip it, keep other candidates.
            continue

        for item in records:
            if not isinstance(item, dict):
                continue
            # "file_name" is the column name used by Hugging Face
            # AudioFolder-style metadata files.
            audio = item.get("audio") or item.get("file") or item.get("file_name")
            # Callers look records up by repo file path, so only string
            # keys are useful (and non-hashable values would raise).
            if audio and isinstance(audio, str):
                metadata[audio] = item

    return metadata
81
+
82
+
83
+ # -----------------------------
84
+ # AUDIO LOADING
85
+ # -----------------------------
86
 
87
  def load_audio(repo_id, file_path):
88
  try:
 
93
  token=os.getenv("HF_TOKEN")
94
  )
95
 
 
96
  tmp_dir = tempfile.gettempdir()
97
  safe_path = os.path.join(tmp_dir, os.path.basename(local_path))
98
  shutil.copy(local_path, safe_path)
99
 
100
  return safe_path
101
 
102
+ except Exception:
103
  return None
104
 
105
 
106
+ # -----------------------------
107
+ # COMBINED HANDLER
108
+ # -----------------------------
109
+
110
def load_audio_and_metadata(repo_id, file_path):
    """Load one audio file together with whatever text describes it.

    Metadata is looked up by ``file_path`` in the result of
    ``load_metadata``. If nothing is found there, a sibling ``.txt`` file
    with the same base name as the audio file is tried instead.

    Args:
        repo_id: Hugging Face dataset repo id, e.g. "username/dataset_name".
        file_path: Path of the selected audio file inside the repo. May be
            ``None`` or empty when nothing is selected.

    Returns:
        A ``(audio, text)`` tuple: ``audio`` is whatever ``load_audio``
        returns, and ``text`` is the metadata record as pretty-printed
        JSON, the transcript file's contents, or "No metadata found.".
        With no selection the result is ``(None, "")``.
    """
    # The file list handler resets the dropdown with value=None, which may
    # fire this change handler with no selection; there is nothing to load.
    if not file_path:
        return None, ""

    audio = load_audio(repo_id, file_path)
    metadata = load_metadata(repo_id)

    # Try to match metadata
    info = metadata.get(file_path, "")

    # If no metadata entry, try a matching .txt file
    if not info:
        # Swap only the real extension, whatever it is (.wav, .MP3, .flac...),
        # rather than replacing ".wav"/".mp3" anywhere in the path.
        txt_candidate = os.path.splitext(file_path)[0] + ".txt"
        try:
            txt_path = hf_hub_download(
                repo_id=repo_id,
                filename=txt_candidate,
                repo_type="dataset",
                token=os.getenv("HF_TOKEN"),
            )
            with open(txt_path, "r", encoding="utf-8") as f:
                info = f.read()
        except Exception:
            # Best effort: a missing or unreadable transcript is not an error.
            info = "No metadata found."

    if isinstance(info, dict):
        # ensure_ascii=False keeps non-ASCII transcript text readable in
        # the textbox instead of showing \uXXXX escapes.
        info = json.dumps(info, indent=2, ensure_ascii=False)

    return audio, info
133
+
134
+
135
+ # -----------------------------
136
+ # UI
137
+ # -----------------------------
138
+
139
  with gr.Blocks() as demo:
140
+ gr.Markdown("# 🎧 List, Play & View Metadata for HF Dataset Audio")
141
 
142
  repo_input = gr.Textbox(label="Dataset repo_id", value="username/dataset_name")
143
 
 
145
  file_list = gr.Dropdown(label="Audio files", choices=[])
146
  play_audio = gr.Audio(label="Audio Player")
147
 
148
+ metadata_box = gr.Textbox(label="Metadata / Text", lines=10)
149
  output = gr.Textbox(label="All Files", lines=20)
150
+
151
  btn = gr.Button("List files")
152
 
 
153
  def update_files(repo_id):
154
  audio_files, all_files_text = list_dataset_files(repo_id)
155
  return gr.Dropdown(choices=audio_files, value=None), all_files_text
156
 
157
  btn.click(update_files, inputs=repo_input, outputs=[file_list, output])
158
 
 
159
  file_list.change(
160
+ load_audio_and_metadata,
161
  inputs=[repo_input, file_list],
162
+ outputs=[play_audio, metadata_box]
163
  )
164
 
165
  demo.launch()