Reza2kn committed on
Commit
c93b3b8
·
verified ·
1 Parent(s): 29d3797

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +90 -12
app.py CHANGED
@@ -6,6 +6,7 @@ import shutil
6
  import subprocess
7
  import sys
8
  import tempfile
 
9
  from datetime import datetime
10
  from pathlib import Path
11
  from typing import List, Optional, Tuple
@@ -441,7 +442,71 @@ def load_audio_bytes(audio_bytes: bytes, log: bool = False) -> Tuple[torch.Tenso
441
  return waveform, sample_rate
442
 
443
 
444
- def prepare_waveform_from_entry(entry, log: bool = False) -> Tuple[torch.Tensor, int]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  if entry is None:
446
  raise ValueError("Empty audio entry.")
447
 
@@ -467,14 +532,23 @@ def prepare_waveform_from_entry(entry, log: bool = False) -> Tuple[torch.Tensor,
467
  )
468
  return waveform, sample_rate
469
 
470
- if entry.get("path"):
471
- return load_audio_file(entry["path"], log=log)
472
-
473
  if entry.get("bytes"):
474
- return load_audio_bytes(entry["bytes"], log=log)
 
 
 
 
 
 
 
 
 
475
 
476
  if isinstance(entry, str):
477
- return load_audio_file(entry, log=log)
 
 
 
478
 
479
  raise ValueError("Unsupported audio entry format.")
480
 
@@ -610,7 +684,7 @@ def process_dataset_and_push(
610
  error_message = ""
611
  try:
612
  waveform, sample_rate = prepare_waveform_from_entry(
613
- entry, log=False
614
  )
615
  vad_waveform, denoised_waveform, _, has_speech = process_waveform(
616
  waveform,
@@ -632,15 +706,19 @@ def process_dataset_and_push(
632
  .numpy()
633
  .astype(np.float32)
634
  )
 
 
 
 
635
  except Exception as exc:
636
  ok = False
637
  error_message = str(exc)
638
- output_np = np.zeros(1, dtype=np.float32)
 
 
 
639
 
640
- example[audio_column] = {
641
- "array": output_np,
642
- "sampling_rate": DEFAULT_SAMPLE_RATE,
643
- }
644
  example["chizzler_ok"] = ok
645
  example["chizzler_error"] = error_message
646
 
 
6
  import subprocess
7
  import sys
8
  import tempfile
9
+ import urllib.request
10
  from datetime import datetime
11
  from pathlib import Path
12
  from typing import List, Optional, Tuple
 
442
  return waveform, sample_rate
443
 
444
 
445
+ def _is_http_url(value: str) -> bool:
446
+ return value.startswith("http://") or value.startswith("https://")
447
+
448
+
449
+ def _parse_hf_dataset_uri(uri: str) -> Optional[Tuple[str, str, Optional[str]]]:
450
+ prefix = "hf://datasets/"
451
+ if not uri.startswith(prefix):
452
+ return None
453
+ rest = uri[len(prefix) :]
454
+ if "/" not in rest:
455
+ return None
456
+ repo_id, file_path = rest.split("/", 1)
457
+ revision = None
458
+ if "@" in repo_id:
459
+ repo_id, revision = repo_id.split("@", 1)
460
+ return repo_id, file_path, revision
461
+
462
+
463
def load_audio_url(url: str, token: Optional[str], log: bool = False) -> Tuple[torch.Tensor, int]:
    """Download *url* and decode the response body with ``load_audio_bytes``.

    Args:
        url: Address of the audio resource to fetch.
        token: Optional bearer token. It is sent only when the URL's host is
            ``huggingface.co`` or one of its subdomains.
        log: Forwarded unchanged to ``load_audio_bytes``.

    Returns:
        Whatever ``load_audio_bytes`` returns for the downloaded bytes
        (annotated as a ``(waveform, sample_rate)`` pair).

    Raises:
        Any exception raised by ``urllib.request.urlopen`` (including a
        timeout) or by ``load_audio_bytes``.
    """
    from urllib.parse import urlsplit

    # Compare the parsed hostname rather than searching the whole URL:
    # a substring test would also match "https://evil.example/huggingface.co"
    # or "https://huggingface.co.evil.example/" and leak the token.
    host = (urlsplit(url).hostname or "").lower()
    is_hf_host = host == "huggingface.co" or host.endswith(".huggingface.co")

    headers = {}
    if token and is_hf_host:
        headers["Authorization"] = f"Bearer {token}"

    request = urllib.request.Request(url, headers=headers)
    # Socket-level timeout so a stalled server cannot block the caller forever.
    with urllib.request.urlopen(request, timeout=60) as response:
        data = response.read()
    return load_audio_bytes(data, log=log)
471
+
472
+
473
def resolve_audio_path(
    path: str, dataset_id: Optional[str], token: Optional[str]
) -> str:
    """Best-effort resolution of an audio reference to something loadable.

    Resolution order:

    1. If *path* exists on the local filesystem it is returned as is.
    2. If *path* is an ``hf://datasets/...`` URI, the referenced file is
       fetched with ``hf_hub_download`` from the repo named in the URI.
    3. Otherwise, if *dataset_id* is given and *path* is a relative,
       non-HTTP reference, it is treated as a file inside that dataset
       repo and fetched with ``hf_hub_download``.

    Args:
        path: Local path, ``hf://datasets/...`` URI, HTTP(S) URL, or
            repo-relative file name.
        dataset_id: Dataset repo used for step 3; may be ``None``.
        token: Token forwarded to ``hf_hub_download``; may be ``None``.

    Returns:
        The value returned by ``hf_hub_download`` on success, otherwise the
        original *path* unchanged (including when the download fails), so
        the caller's loader can report the final error.
    """
    if os.path.exists(path):
        return path

    parsed = _parse_hf_dataset_uri(path)
    if parsed:
        repo_id, filename, revision = parsed
    elif (
        dataset_id
        and not os.path.isabs(path)
        # An http(s) URL is never a file inside the dataset repo; asking the
        # Hub for it would only cost a failing request per entry.
        and not path[:8].lower().startswith(("http://", "https://"))
    ):
        repo_id, filename, revision = dataset_id, path, None
    else:
        return path

    try:
        return hf_hub_download(
            repo_id=repo_id,
            repo_type="dataset",
            filename=filename,
            revision=revision,
            token=token,
        )
    except Exception:
        # Deliberate best-effort: fall back to the unresolved path.
        return path
502
+
503
+
504
+ def prepare_waveform_from_entry(
505
+ entry,
506
+ log: bool = False,
507
+ dataset_id: Optional[str] = None,
508
+ token: Optional[str] = None,
509
+ ) -> Tuple[torch.Tensor, int]:
510
  if entry is None:
511
  raise ValueError("Empty audio entry.")
512
 
 
532
  )
533
  return waveform, sample_rate
534
 
 
 
 
535
  if entry.get("bytes"):
536
+ audio_bytes = entry["bytes"]
537
+ if not isinstance(audio_bytes, (bytes, bytearray)):
538
+ audio_bytes = bytes(audio_bytes)
539
+ return load_audio_bytes(audio_bytes, log=log)
540
+
541
+ if entry.get("path"):
542
+ path = resolve_audio_path(entry["path"], dataset_id, token)
543
+ if _is_http_url(path):
544
+ return load_audio_url(path, token, log=log)
545
+ return load_audio_file(path, log=log)
546
 
547
  if isinstance(entry, str):
548
+ path = resolve_audio_path(entry, dataset_id, token)
549
+ if _is_http_url(path):
550
+ return load_audio_url(path, token, log=log)
551
+ return load_audio_file(path, log=log)
552
 
553
  raise ValueError("Unsupported audio entry format.")
554
 
 
684
  error_message = ""
685
  try:
686
  waveform, sample_rate = prepare_waveform_from_entry(
687
+ entry, log=False, dataset_id=dataset_id, token=token
688
  )
689
  vad_waveform, denoised_waveform, _, has_speech = process_waveform(
690
  waveform,
 
706
  .numpy()
707
  .astype(np.float32)
708
  )
709
+ output_entry = {
710
+ "array": output_np,
711
+ "sampling_rate": DEFAULT_SAMPLE_RATE,
712
+ }
713
  except Exception as exc:
714
  ok = False
715
  error_message = str(exc)
716
+ output_entry = entry if entry is not None else {
717
+ "array": np.zeros(1, dtype=np.float32),
718
+ "sampling_rate": DEFAULT_SAMPLE_RATE,
719
+ }
720
 
721
+ example[audio_column] = output_entry
 
 
 
722
  example["chizzler_ok"] = ok
723
  example["chizzler_error"] = error_message
724