Spaces:
Sleeping
Sleeping
duck3-create Claude Opus 4.6 committed on
Commit ·
a91ee7b
1
Parent(s): 136083a
Add concurrency limit and retry for YouTube rate limiting
Browse files
- Semaphore limits concurrent YouTube fetches to 3 at a time
- Auto-retry once after 1s delay for rate-limited requests
- Skip retry for genuine "no subtitles" errors
- Extract _format_error helper for consistent error messages
Fixes intermittent failures when submitting 8+ URLs at once.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
main.py
CHANGED
|
@@ -27,6 +27,7 @@ app.add_middleware(
|
|
| 27 |
)
|
| 28 |
|
| 29 |
_executor = ThreadPoolExecutor(max_workers=5)
|
|
|
|
| 30 |
|
| 31 |
# --- Proxy support (optional PROXY_URL env var) ---
|
| 32 |
_proxy_url = os.environ.get("PROXY_URL", "")
|
|
@@ -162,6 +163,16 @@ def denoise_text(text: str) -> str:
|
|
| 162 |
return "\n".join(result)
|
| 163 |
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, keep_newlines: bool = False) -> dict:
|
| 166 |
languages = [language]
|
| 167 |
if language == "ko":
|
|
@@ -173,56 +184,56 @@ def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, kee
|
|
| 173 |
if _yt_api_cookies:
|
| 174 |
apis_to_try.append(("cookies", _yt_api_cookies))
|
| 175 |
|
| 176 |
-
|
| 177 |
-
for
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
text =
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
|
| 217 |
# All attempts failed
|
| 218 |
-
|
| 219 |
-
if "No transcripts" in error_msg or "Could not retrieve" in error_msg:
|
| 220 |
-
error_msg = f"자막을 찾을 수 없습니다. ({error_msg[:120]})"
|
| 221 |
-
elif "disabled" in error_msg.lower():
|
| 222 |
-
error_msg = "이 영상의 자막이 비활성화되어 있습니다."
|
| 223 |
-
elif "unavailable" in error_msg.lower():
|
| 224 |
-
error_msg = "영상을 찾을 수 없습니다."
|
| 225 |
-
return {"transcript": None, "error": error_msg}
|
| 226 |
|
| 227 |
|
| 228 |
@app.post("/api/transcripts")
|
|
@@ -253,18 +264,19 @@ async def get_transcripts(request: TranscriptRequest):
|
|
| 253 |
"error": "μ ν¨νμ§ μμ YouTube URLμ
λλ€.",
|
| 254 |
}
|
| 255 |
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
|
|
|
| 268 |
|
| 269 |
return {
|
| 270 |
"url": url,
|
|
|
|
| 27 |
)
|
| 28 |
|
| 29 |
_executor = ThreadPoolExecutor(max_workers=5)
|
| 30 |
+
_fetch_semaphore = asyncio.Semaphore(3) # max 3 concurrent YouTube fetches
|
| 31 |
|
| 32 |
# --- Proxy support (optional PROXY_URL env var) ---
|
| 33 |
_proxy_url = os.environ.get("PROXY_URL", "")
|
|
|
|
| 163 |
return "\n".join(result)
|
| 164 |
|
| 165 |
|
| 166 |
+
def _format_error(error_msg: str) -> str:
|
| 167 |
+
if "No transcripts" in error_msg or "Could not retrieve" in error_msg:
|
| 168 |
+
return f"μλ§μ μ°Ύμ μ μμ΅λλ€. ({error_msg[:120]})"
|
| 169 |
+
elif "disabled" in error_msg.lower():
|
| 170 |
+
return "μ΄ μμμ μλ§μ΄ λΉνμ±νλμ΄ μμ΅λλ€."
|
| 171 |
+
elif "unavailable" in error_msg.lower():
|
| 172 |
+
return "μμμ μ°Ύμ μ μμ΅λλ€."
|
| 173 |
+
return error_msg
|
| 174 |
+
|
| 175 |
+
|
| 176 |
def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, keep_newlines: bool = False) -> dict:
|
| 177 |
languages = [language]
|
| 178 |
if language == "ko":
|
|
|
|
| 184 |
if _yt_api_cookies:
|
| 185 |
apis_to_try.append(("cookies", _yt_api_cookies))
|
| 186 |
|
| 187 |
+
max_retries = 2
|
| 188 |
+
for attempt in range(max_retries):
|
| 189 |
+
last_error = None
|
| 190 |
+
for api_name, api in apis_to_try:
|
| 191 |
+
try:
|
| 192 |
+
data = api.fetch(video_id, languages=languages)
|
| 193 |
+
|
| 194 |
+
if fmt == "json":
|
| 195 |
+
entries = [
|
| 196 |
+
{"text": e.text, "start": e.start, "duration": e.duration}
|
| 197 |
+
for e in data
|
| 198 |
+
]
|
| 199 |
+
if denoise:
|
| 200 |
+
deduped = []
|
| 201 |
+
prev_text = None
|
| 202 |
+
for entry in entries:
|
| 203 |
+
t = entry["text"].strip()
|
| 204 |
+
if t in KOREAN_FILLERS or NOISE_PATTERN.match(t):
|
| 205 |
+
continue
|
| 206 |
+
if t == prev_text:
|
| 207 |
+
continue
|
| 208 |
+
if t:
|
| 209 |
+
entry["text"] = t
|
| 210 |
+
deduped.append(entry)
|
| 211 |
+
prev_text = t
|
| 212 |
+
entries = deduped
|
| 213 |
+
return {"transcript": entries, "error": None}
|
| 214 |
+
else:
|
| 215 |
+
separator = "\n" if keep_newlines else " "
|
| 216 |
+
text = separator.join(e.text for e in data)
|
| 217 |
+
if denoise:
|
| 218 |
+
text = denoise_text(text)
|
| 219 |
+
if not keep_newlines:
|
| 220 |
+
text = " ".join(text.split())
|
| 221 |
+
return {"transcript": text, "error": None}
|
| 222 |
+
except Exception as e:
|
| 223 |
+
last_error = str(e)
|
| 224 |
+
logger.warning(f"[{api_name}] attempt {attempt+1} Failed for {video_id}: {last_error[:100]}")
|
| 225 |
+
|
| 226 |
+
# Don't retry if video genuinely has no subtitles
|
| 227 |
+
if "No transcripts" in last_error or "disabled" in last_error.lower():
|
| 228 |
+
return {"transcript": None, "error": _format_error(last_error)}
|
| 229 |
+
|
| 230 |
+
# Rate limit / transient error: retry after delay
|
| 231 |
+
if attempt < max_retries - 1:
|
| 232 |
+
logger.info(f"Retrying {video_id} after 1s delay (attempt {attempt+1})")
|
| 233 |
+
time.sleep(1)
|
| 234 |
|
| 235 |
# All attempts failed
|
| 236 |
+
return {"transcript": None, "error": _format_error(last_error or "Unknown error")}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
|
| 239 |
@app.post("/api/transcripts")
|
|
|
|
| 264 |
"error": "μ ν¨νμ§ μμ YouTube URLμ
λλ€.",
|
| 265 |
}
|
| 266 |
|
| 267 |
+
async with _fetch_semaphore:
|
| 268 |
+
result, title = await asyncio.gather(
|
| 269 |
+
loop.run_in_executor(
|
| 270 |
+
_executor,
|
| 271 |
+
_fetch_transcript,
|
| 272 |
+
video_id,
|
| 273 |
+
request.language,
|
| 274 |
+
request.denoise,
|
| 275 |
+
request.format,
|
| 276 |
+
request.keep_newlines,
|
| 277 |
+
),
|
| 278 |
+
loop.run_in_executor(_executor, _fetch_title, video_id),
|
| 279 |
+
)
|
| 280 |
|
| 281 |
return {
|
| 282 |
"url": url,
|