Spaces:
Running
Running
Update transcription_server.py
Browse files- transcription_server.py +51 -15
transcription_server.py
CHANGED
|
@@ -78,38 +78,69 @@ def extract_dataset_info(dataset_link: str) -> tuple:
|
|
| 78 |
|
| 79 |
link = dataset_link.strip()
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
if "huggingface.co" in link:
|
| 82 |
# Parse HF URL
|
| 83 |
parts = link.split("/")
|
| 84 |
if "datasets" in parts:
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
else:
|
| 97 |
# Assume it's in format: owner/repo/filename
|
| 98 |
parts = link.split("/")
|
| 99 |
if len(parts) >= 3:
|
| 100 |
repo_id = f"{parts[0]}/{parts[1]}"
|
| 101 |
filename = "/".join(parts[2:])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
return repo_id, filename
|
| 103 |
|
| 104 |
-
raise ValueError(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
async def process_transcription(job_id: str, dataset_link: str, model_size: str):
|
| 107 |
"""Background task to process transcription and upload."""
|
| 108 |
try:
|
| 109 |
jobs[job_id]["status"] = "extracting_info"
|
| 110 |
|
| 111 |
-
# Parse dataset link
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
jobs[job_id]["repo_id"] = repo_id
|
| 114 |
jobs[job_id]["filename"] = filename
|
| 115 |
|
|
@@ -350,9 +381,14 @@ async def serve_ui():
|
|
| 350 |
<input
|
| 351 |
type="text"
|
| 352 |
id="datasetLink"
|
| 353 |
-
placeholder="
|
|
|
|
| 354 |
required
|
| 355 |
>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
</div>
|
| 357 |
|
| 358 |
<div class="form-group">
|
|
|
|
| 78 |
|
| 79 |
link = dataset_link.strip()
|
| 80 |
|
| 81 |
+
# Validate input
|
| 82 |
+
if not link:
|
| 83 |
+
raise ValueError("Dataset link cannot be empty")
|
| 84 |
+
|
| 85 |
+
if any(char in link for char in ["=", "\n", "\r", "DASHSCOPE", "API", "TOKEN"]):
|
| 86 |
+
raise ValueError(
|
| 87 |
+
"Invalid dataset link format. Please provide a valid Hugging Face dataset URL or path.\n"
|
| 88 |
+
"Examples:\n"
|
| 89 |
+
" https://huggingface.co/datasets/factorstudios/movs/blob/main/movie.mkv\n"
|
| 90 |
+
" factorstudios/movs/movie.mkv"
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
if "huggingface.co" in link:
|
| 94 |
# Parse HF URL
|
| 95 |
parts = link.split("/")
|
| 96 |
if "datasets" in parts:
|
| 97 |
+
try:
|
| 98 |
+
idx = parts.index("datasets")
|
| 99 |
+
owner = parts[idx + 1]
|
| 100 |
+
repo = parts[idx + 2]
|
| 101 |
+
# Find filename (after /blob/main/ or /blob/[branch]/)
|
| 102 |
+
if "blob" in parts:
|
| 103 |
+
blob_idx = parts.index("blob")
|
| 104 |
+
filename = "/".join(parts[blob_idx + 2:])
|
| 105 |
+
else:
|
| 106 |
+
filename = parts[-1]
|
| 107 |
+
repo_id = f"{owner}/{repo}"
|
| 108 |
+
|
| 109 |
+
if not filename:
|
| 110 |
+
raise ValueError("No filename found in URL")
|
| 111 |
+
|
| 112 |
+
return repo_id, filename
|
| 113 |
+
except (IndexError, ValueError) as e:
|
| 114 |
+
raise ValueError(f"Invalid Hugging Face dataset URL format: {e}")
|
| 115 |
else:
|
| 116 |
# Assume it's in format: owner/repo/filename
|
| 117 |
parts = link.split("/")
|
| 118 |
if len(parts) >= 3:
|
| 119 |
repo_id = f"{parts[0]}/{parts[1]}"
|
| 120 |
filename = "/".join(parts[2:])
|
| 121 |
+
|
| 122 |
+
if not filename:
|
| 123 |
+
raise ValueError("No filename found in path")
|
| 124 |
+
|
| 125 |
return repo_id, filename
|
| 126 |
|
| 127 |
+
raise ValueError(
|
| 128 |
+
f"Cannot parse dataset link. Please use:\n"
|
| 129 |
+
f" https://huggingface.co/datasets/owner/repo/blob/main/file.mkv\n"
|
| 130 |
+
f" or: owner/repo/file.mkv"
|
| 131 |
+
)
|
| 132 |
|
| 133 |
async def process_transcription(job_id: str, dataset_link: str, model_size: str):
|
| 134 |
"""Background task to process transcription and upload."""
|
| 135 |
try:
|
| 136 |
jobs[job_id]["status"] = "extracting_info"
|
| 137 |
|
| 138 |
+
# Parse and validate dataset link
|
| 139 |
+
try:
|
| 140 |
+
repo_id, filename = extract_dataset_info(dataset_link)
|
| 141 |
+
except ValueError as e:
|
| 142 |
+
raise ValueError(f"Invalid dataset link: {str(e)}")
|
| 143 |
+
|
| 144 |
jobs[job_id]["repo_id"] = repo_id
|
| 145 |
jobs[job_id]["filename"] = filename
|
| 146 |
|
|
|
|
| 381 |
<input
|
| 382 |
type="text"
|
| 383 |
id="datasetLink"
|
| 384 |
+
placeholder="https://huggingface.co/datasets/factorstudios/movs/blob/main/movie.mkv"
|
| 385 |
+
title="Enter a Hugging Face dataset URL or path (owner/repo/filename.mkv)"
|
| 386 |
required
|
| 387 |
>
|
| 388 |
+
<small style="display: block; margin-top: 6px; color: #999; font-size: 12px;">
|
| 389 |
+
Format: https://huggingface.co/datasets/owner/repo/blob/main/filename.mkv<br>
|
| 390 |
+
or: owner/repo/filename.mkv
|
| 391 |
+
</small>
|
| 392 |
</div>
|
| 393 |
|
| 394 |
<div class="form-group">
|