factorstudios committed on
Commit
e5e57a4
·
verified ·
1 Parent(s): 938c00b

Update transcription_server.py

Browse files
Files changed (1) hide show
  1. transcription_server.py +51 -15
transcription_server.py CHANGED
@@ -78,38 +78,69 @@ def extract_dataset_info(dataset_link: str) -> tuple:
78
 
79
  link = dataset_link.strip()
80
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  if "huggingface.co" in link:
82
  # Parse HF URL
83
  parts = link.split("/")
84
  if "datasets" in parts:
85
- idx = parts.index("datasets")
86
- owner = parts[idx + 1]
87
- repo = parts[idx + 2]
88
- # Find filename (after /blob/main/ or /blob/[branch]/)
89
- if "blob" in parts:
90
- blob_idx = parts.index("blob")
91
- filename = "/".join(parts[blob_idx + 2:])
92
- else:
93
- filename = parts[-1]
94
- repo_id = f"{owner}/{repo}"
95
- return repo_id, filename
 
 
 
 
 
 
 
96
  else:
97
  # Assume it's in format: owner/repo/filename
98
  parts = link.split("/")
99
  if len(parts) >= 3:
100
  repo_id = f"{parts[0]}/{parts[1]}"
101
  filename = "/".join(parts[2:])
 
 
 
 
102
  return repo_id, filename
103
 
104
- raise ValueError(f"Cannot parse dataset link: {link}")
 
 
 
 
105
 
106
  async def process_transcription(job_id: str, dataset_link: str, model_size: str):
107
  """Background task to process transcription and upload."""
108
  try:
109
  jobs[job_id]["status"] = "extracting_info"
110
 
111
- # Parse dataset link
112
- repo_id, filename = extract_dataset_info(dataset_link)
 
 
 
 
113
  jobs[job_id]["repo_id"] = repo_id
114
  jobs[job_id]["filename"] = filename
115
 
@@ -350,9 +381,14 @@ async def serve_ui():
350
  <input
351
  type="text"
352
  id="datasetLink"
353
- placeholder="e.g., https://huggingface.co/datasets/factorstudios/movs/blob/main/movie.mkv"
 
354
  required
355
  >
 
 
 
 
356
  </div>
357
 
358
  <div class="form-group">
 
78
 
79
  link = dataset_link.strip()
80
 
81
+ # Validate input
82
+ if not link:
83
+ raise ValueError("Dataset link cannot be empty")
84
+
85
+ if any(char in link for char in ["=", "\n", "\r", "DASHSCOPE", "API", "TOKEN"]):
86
+ raise ValueError(
87
+ "Invalid dataset link format. Please provide a valid Hugging Face dataset URL or path.\n"
88
+ "Examples:\n"
89
+ " https://huggingface.co/datasets/factorstudios/movs/blob/main/movie.mkv\n"
90
+ " factorstudios/movs/movie.mkv"
91
+ )
92
+
93
  if "huggingface.co" in link:
94
  # Parse HF URL
95
  parts = link.split("/")
96
  if "datasets" in parts:
97
+ try:
98
+ idx = parts.index("datasets")
99
+ owner = parts[idx + 1]
100
+ repo = parts[idx + 2]
101
+ # Find filename (after /blob/main/ or /blob/[branch]/)
102
+ if "blob" in parts:
103
+ blob_idx = parts.index("blob")
104
+ filename = "/".join(parts[blob_idx + 2:])
105
+ else:
106
+ filename = parts[-1]
107
+ repo_id = f"{owner}/{repo}"
108
+
109
+ if not filename:
110
+ raise ValueError("No filename found in URL")
111
+
112
+ return repo_id, filename
113
+ except (IndexError, ValueError) as e:
114
+ raise ValueError(f"Invalid Hugging Face dataset URL format: {e}")
115
  else:
116
  # Assume it's in format: owner/repo/filename
117
  parts = link.split("/")
118
  if len(parts) >= 3:
119
  repo_id = f"{parts[0]}/{parts[1]}"
120
  filename = "/".join(parts[2:])
121
+
122
+ if not filename:
123
+ raise ValueError("No filename found in path")
124
+
125
  return repo_id, filename
126
 
127
+ raise ValueError(
128
+ f"Cannot parse dataset link. Please use:\n"
129
+ f" https://huggingface.co/datasets/owner/repo/blob/main/file.mkv\n"
130
+ f" or: owner/repo/file.mkv"
131
+ )
132
 
133
  async def process_transcription(job_id: str, dataset_link: str, model_size: str):
134
  """Background task to process transcription and upload."""
135
  try:
136
  jobs[job_id]["status"] = "extracting_info"
137
 
138
+ # Parse and validate dataset link
139
+ try:
140
+ repo_id, filename = extract_dataset_info(dataset_link)
141
+ except ValueError as e:
142
+ raise ValueError(f"Invalid dataset link: {str(e)}")
143
+
144
  jobs[job_id]["repo_id"] = repo_id
145
  jobs[job_id]["filename"] = filename
146
 
 
381
  <input
382
  type="text"
383
  id="datasetLink"
384
+ placeholder="https://huggingface.co/datasets/factorstudios/movs/blob/main/movie.mkv"
385
+ title="Enter a Hugging Face dataset URL or path (owner/repo/filename.mkv)"
386
  required
387
  >
388
+ <small style="display: block; margin-top: 6px; color: #999; font-size: 12px;">
389
+ Format: https://huggingface.co/datasets/owner/repo/blob/main/filename.mkv<br>
390
+ or: owner/repo/filename.mkv
391
+ </small>
392
  </div>
393
 
394
  <div class="form-group">