duck3-create Claude Opus 4.6 committed on
Commit
a91ee7b
·
1 Parent(s): 136083a

Add concurrency limit and retry for YouTube rate limiting

Browse files

- Semaphore limits concurrent YouTube fetches to 3 at a time
- Auto-retry once after 1s delay for rate-limited requests
- Skip retry for genuine "no subtitles" errors
- Extract _format_error helper for consistent error messages

Fixes intermittent failures when submitting 8+ URLs at once.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. main.py +72 -60
main.py CHANGED
@@ -27,6 +27,7 @@ app.add_middleware(
27
  )
28
 
29
  _executor = ThreadPoolExecutor(max_workers=5)
 
30
 
31
  # --- Proxy support (optional PROXY_URL env var) ---
32
  _proxy_url = os.environ.get("PROXY_URL", "")
@@ -162,6 +163,16 @@ def denoise_text(text: str) -> str:
162
  return "\n".join(result)
163
 
164
 
 
 
 
 
 
 
 
 
 
 
165
  def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, keep_newlines: bool = False) -> dict:
166
  languages = [language]
167
  if language == "ko":
@@ -173,56 +184,56 @@ def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, kee
173
  if _yt_api_cookies:
174
  apis_to_try.append(("cookies", _yt_api_cookies))
175
 
176
- last_error = None
177
- for api_name, api in apis_to_try:
178
- try:
179
- data = api.fetch(video_id, languages=languages)
180
-
181
- if fmt == "json":
182
- entries = [
183
- {"text": e.text, "start": e.start, "duration": e.duration}
184
- for e in data
185
- ]
186
- if denoise:
187
- deduped = []
188
- prev_text = None
189
- for entry in entries:
190
- t = entry["text"].strip()
191
- if t in KOREAN_FILLERS or NOISE_PATTERN.match(t):
192
- continue
193
- if t == prev_text:
194
- continue
195
- if t:
196
- entry["text"] = t
197
- deduped.append(entry)
198
- prev_text = t
199
- entries = deduped
200
- return {"transcript": entries, "error": None}
201
- else:
202
- separator = "\n" if keep_newlines else " "
203
- text = separator.join(e.text for e in data)
204
- if denoise:
205
- text = denoise_text(text)
206
- if not keep_newlines:
207
- text = " ".join(text.split())
208
- return {"transcript": text, "error": None}
209
- except Exception as e:
210
- last_error = str(e)
211
- logger.warning(f"[{api_name}] Failed for {video_id}: {last_error[:100]}")
212
-
213
- # Don't try cookies fallback if video genuinely has no subtitles
214
- if "No transcripts" in last_error or "disabled" in last_error.lower():
215
- break
 
 
 
 
 
 
 
216
 
217
  # All attempts failed
218
- error_msg = last_error or "Unknown error"
219
- if "No transcripts" in error_msg or "Could not retrieve" in error_msg:
220
- error_msg = f"μžλ§‰μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. ({error_msg[:120]})"
221
- elif "disabled" in error_msg.lower():
222
- error_msg = "이 μ˜μƒμ€ μžλ§‰μ΄ λΉ„ν™œμ„±ν™”λ˜μ–΄ μžˆμŠ΅λ‹ˆλ‹€."
223
- elif "unavailable" in error_msg.lower():
224
- error_msg = "μ˜μƒμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
225
- return {"transcript": None, "error": error_msg}
226
 
227
 
228
  @app.post("/api/transcripts")
@@ -253,18 +264,19 @@ async def get_transcripts(request: TranscriptRequest):
253
  "error": "μœ νš¨ν•˜μ§€ μ•Šμ€ YouTube URLμž…λ‹ˆλ‹€.",
254
  }
255
 
256
- result, title = await asyncio.gather(
257
- loop.run_in_executor(
258
- _executor,
259
- _fetch_transcript,
260
- video_id,
261
- request.language,
262
- request.denoise,
263
- request.format,
264
- request.keep_newlines,
265
- ),
266
- loop.run_in_executor(_executor, _fetch_title, video_id),
267
- )
 
268
 
269
  return {
270
  "url": url,
 
27
  )
28
 
29
  _executor = ThreadPoolExecutor(max_workers=5)
30
+ _fetch_semaphore = asyncio.Semaphore(3) # max 3 concurrent YouTube fetches
31
 
32
  # --- Proxy support (optional PROXY_URL env var) ---
33
  _proxy_url = os.environ.get("PROXY_URL", "")
 
163
  return "\n".join(result)
164
 
165
 
166
+ def _format_error(error_msg: str) -> str:
167
+ if "No transcripts" in error_msg or "Could not retrieve" in error_msg:
168
+ return f"μžλ§‰μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. ({error_msg[:120]})"
169
+ elif "disabled" in error_msg.lower():
170
+ return "이 μ˜μƒμ€ μžλ§‰μ΄ λΉ„ν™œμ„±ν™”λ˜μ–΄ μžˆμŠ΅λ‹ˆλ‹€."
171
+ elif "unavailable" in error_msg.lower():
172
+ return "μ˜μƒμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
173
+ return error_msg
174
+
175
+
176
  def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, keep_newlines: bool = False) -> dict:
177
  languages = [language]
178
  if language == "ko":
 
184
  if _yt_api_cookies:
185
  apis_to_try.append(("cookies", _yt_api_cookies))
186
 
187
+ max_retries = 2
188
+ for attempt in range(max_retries):
189
+ last_error = None
190
+ for api_name, api in apis_to_try:
191
+ try:
192
+ data = api.fetch(video_id, languages=languages)
193
+
194
+ if fmt == "json":
195
+ entries = [
196
+ {"text": e.text, "start": e.start, "duration": e.duration}
197
+ for e in data
198
+ ]
199
+ if denoise:
200
+ deduped = []
201
+ prev_text = None
202
+ for entry in entries:
203
+ t = entry["text"].strip()
204
+ if t in KOREAN_FILLERS or NOISE_PATTERN.match(t):
205
+ continue
206
+ if t == prev_text:
207
+ continue
208
+ if t:
209
+ entry["text"] = t
210
+ deduped.append(entry)
211
+ prev_text = t
212
+ entries = deduped
213
+ return {"transcript": entries, "error": None}
214
+ else:
215
+ separator = "\n" if keep_newlines else " "
216
+ text = separator.join(e.text for e in data)
217
+ if denoise:
218
+ text = denoise_text(text)
219
+ if not keep_newlines:
220
+ text = " ".join(text.split())
221
+ return {"transcript": text, "error": None}
222
+ except Exception as e:
223
+ last_error = str(e)
224
+ logger.warning(f"[{api_name}] attempt {attempt+1} Failed for {video_id}: {last_error[:100]}")
225
+
226
+ # Don't retry if video genuinely has no subtitles
227
+ if "No transcripts" in last_error or "disabled" in last_error.lower():
228
+ return {"transcript": None, "error": _format_error(last_error)}
229
+
230
+ # Rate limit / transient error: retry after delay
231
+ if attempt < max_retries - 1:
232
+ logger.info(f"Retrying {video_id} after 1s delay (attempt {attempt+1})")
233
+ time.sleep(1)
234
 
235
  # All attempts failed
236
+ return {"transcript": None, "error": _format_error(last_error or "Unknown error")}
 
 
 
 
 
 
 
237
 
238
 
239
  @app.post("/api/transcripts")
 
264
  "error": "μœ νš¨ν•˜μ§€ μ•Šμ€ YouTube URLμž…λ‹ˆλ‹€.",
265
  }
266
 
267
+ async with _fetch_semaphore:
268
+ result, title = await asyncio.gather(
269
+ loop.run_in_executor(
270
+ _executor,
271
+ _fetch_transcript,
272
+ video_id,
273
+ request.language,
274
+ request.denoise,
275
+ request.format,
276
+ request.keep_newlines,
277
+ ),
278
+ loop.run_in_executor(_executor, _fetch_title, video_id),
279
+ )
280
 
281
  return {
282
  "url": url,