duck3-create Claude Opus 4.6 committed on
Commit
69d0e99
·
1 Parent(s): f4b3580

Optimize Instagram extraction to ~4s (was ~8s)

Browse files

- Revert network intercept to wait_for_selector (1s vs 6s)
- Persistent browser in dedicated thread with pre-warming
- ffmpeg direct URL->audio extraction on Docker (5MB->300KB)
- Streaming download + per-step timing logs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. main.py +57 -62
main.py CHANGED
@@ -33,7 +33,7 @@ logging.basicConfig(level=logging.INFO)
33
  logger = logging.getLogger(__name__)
34
 
35
  app = FastAPI(title="YouTube Transcript Extractor")
36
- # Version: 3.2.0 - Persistent browser + audio-only extraction + dedicated Playwright thread
37
 
38
  app.add_middleware(
39
  CORSMiddleware,
@@ -60,9 +60,9 @@ _ig_semaphore = asyncio.Semaphore(2) # max 2 concurrent Instagram transcription
60
  # Check if ffmpeg is available for audio extraction
61
  _has_ffmpeg = shutil.which('ffmpeg') is not None
62
  if _has_ffmpeg:
63
- logger.info("ffmpeg found - will extract audio before transcription")
64
  else:
65
- logger.info("ffmpeg not found - will send full video to Groq")
66
 
67
  # --- Proxy support (optional PROXY_URL env var) ---
68
  _proxy_url = os.environ.get("PROXY_URL", "")
@@ -154,7 +154,6 @@ def extract_video_id(url: str) -> str | None:
154
  url = url.strip()
155
  if not url:
156
  return None
157
- # Remove tracking parameters
158
  url = re.sub(r'[&?](si|feature|utm_\w+|fbclid|gclid)=[^&]*', '', url)
159
  patterns = [
160
  r"(?:(?:m\.)?youtube\.com/watch\?.*v=)([a-zA-Z0-9_-]{11})",
@@ -340,17 +339,14 @@ def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, kee
340
  last_error = str(e)
341
  logger.warning(f"[{api_name}] attempt {attempt+1} Failed for {video_id}: {last_error[:200]}")
342
 
343
- # Don't retry if video genuinely has no subtitles
344
  if "No transcripts" in last_error or "disabled" in last_error.lower():
345
  return {"transcript": None, "error": _format_error(last_error)}
346
 
347
- # Rate limit / transient error: retry after exponential backoff
348
  if attempt < max_retries - 1:
349
- delay = 2 ** (attempt + 1) # 2s, 4s, 8s
350
  logger.info(f"Retrying {video_id} after {delay}s delay (attempt {attempt+1})")
351
  time.sleep(delay)
352
 
353
- # All language-specific attempts failed - try without language filter
354
  for api_name, api in apis_to_try:
355
  try:
356
  logger.info(f"[{api_name}] Trying without language filter for {video_id}")
@@ -359,7 +355,6 @@ def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, kee
359
  except Exception as e:
360
  logger.warning(f"[{api_name}] No-lang fallback failed for {video_id}: {str(e)[:200]}")
361
 
362
- # Final fallback: list available transcripts and fetch the best match
363
  for api_name, api in apis_to_try:
364
  try:
365
  logger.info(f"[{api_name}] Listing transcripts for {video_id}")
@@ -380,18 +375,16 @@ def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, kee
380
 
381
  # ---------------------------------------------------------------------------
382
  # Instagram video URL extraction: 2-tier cascade
383
- # 1. Playwright embed page (cookie-free) - renders /p/{shortcode}/embed/
384
  # 2. Playwright full page with cookies (fallback for private/restricted)
385
  #
386
  # Optimizations:
387
  # - Dedicated single-thread executor for Playwright (thread-safety)
388
- # - Persistent browser instance (avoids ~1.5s cold-start per request)
389
- # - Pre-warmed at import time via the dedicated thread
390
- # - domcontentloaded + targeted wait_for_selector (vs networkidle)
391
- # - ffmpeg audio extraction before Groq upload (5MB->300KB)
392
  # ---------------------------------------------------------------------------
393
 
394
- # Dedicated single thread for all Playwright operations (Playwright sync API is thread-bound)
395
  _pw_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix='playwright')
396
  _ig_browser = None
397
  _ig_pw = None
@@ -417,12 +410,12 @@ def _pw_init_browser():
417
  return _ig_browser
418
 
419
 
420
- # Pre-warm browser at import time (runs in dedicated Playwright thread)
421
  _pw_executor.submit(_pw_init_browser)
422
 
423
 
424
  def _pw_extract_embed(shortcode):
425
- """Run inside _pw_executor thread. Extract video URL from embed page."""
426
  browser = _pw_init_browser()
427
  ctx = browser.new_context(
428
  user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
@@ -435,6 +428,7 @@ def _pw_extract_embed(shortcode):
435
  timeout=15000,
436
  )
437
 
 
438
  video_url = None
439
  try:
440
  video_el = page.wait_for_selector('video[src]', timeout=5000)
@@ -608,23 +602,43 @@ def _extract_ig_video_url_playwright(url):
608
  return None, None, f"Browser extraction failed: {str(e)[:200]}"
609
 
610
 
611
- def _extract_audio(video_path, tmpdir):
612
- """Extract audio from video using ffmpeg. Returns audio path or original video path."""
613
- if not _has_ffmpeg:
614
- return video_path
615
- audio_path = os.path.join(tmpdir, 'audio.m4a')
616
- try:
617
- subprocess.run(
618
- ['ffmpeg', '-i', video_path, '-vn', '-acodec', 'aac', '-b:a', '64k',
619
- '-y', '-loglevel', 'error', audio_path],
620
- timeout=15, check=True, capture_output=True,
621
- )
622
- if os.path.exists(audio_path) and os.path.getsize(audio_path) > 100:
623
- logger.info(f"[instagram] Audio extracted: {os.path.getsize(video_path)/1024:.0f}KB -> {os.path.getsize(audio_path)/1024:.0f}KB")
624
- return audio_path
625
- except Exception as e:
626
- logger.warning(f"[instagram] ffmpeg audio extraction failed: {e}, using original video")
627
- return video_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
628
 
629
 
630
  def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=False, timestamps=False):
@@ -633,41 +647,24 @@ def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=
633
 
634
  t0 = time.time()
635
 
636
- # Step 1: Extract video URL (embed page -> Playwright with cookies)
637
  video_url, title, err = _extract_ig_video_url(url)
638
  t1 = time.time()
639
- logger.info(f"[instagram] Video URL extraction took {t1-t0:.1f}s")
640
  if err:
641
  return {"transcript": None, "error": err, "title": title}
642
 
643
  with tempfile.TemporaryDirectory() as tmpdir:
644
- # Step 2: Download video (streaming)
645
- video_path = os.path.join(tmpdir, 'video.mp4')
646
  try:
647
- r = _requests_mod.get(video_url, headers={
648
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
649
- 'Referer': 'https://www.instagram.com/',
650
- }, timeout=30, stream=True)
651
- with open(video_path, 'wb') as f:
652
- for chunk in r.iter_content(chunk_size=65536):
653
- f.write(chunk)
654
- if os.path.getsize(video_path) < 1024:
655
- return {"transcript": None, "error": "Downloaded video is too small.", "title": title}
656
  except Exception as e:
657
- return {"transcript": None, "error": f"Video download failed: {str(e)[:200]}", "title": title}
658
 
659
  t2 = time.time()
660
- logger.info(f"[instagram] Video download took {t2-t1:.1f}s ({os.path.getsize(video_path)/1024:.0f}KB)")
661
-
662
- # Step 2.5: Extract audio only (much smaller file for Groq upload)
663
- upload_path = _extract_audio(video_path, tmpdir)
664
- t2b = time.time()
665
- if upload_path != video_path:
666
- logger.info(f"[instagram] Audio extraction took {t2b-t2:.1f}s")
667
 
668
  # Step 3: Transcribe with Groq Whisper API
669
- ext = os.path.splitext(upload_path)[1]
670
- filename = f"audio{ext}"
671
  try:
672
  with open(upload_path, "rb") as audio_file:
673
  result = _groq_client.audio.transcriptions.create(
@@ -681,8 +678,8 @@ def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=
681
  return {"transcript": None, "error": f"Transcription failed: {str(e)[:200]}", "title": title}
682
 
683
  t3 = time.time()
684
- logger.info(f"[instagram] Groq transcription took {t3-t2b:.1f}s")
685
- logger.info(f"[instagram] Total pipeline: {t3-t0:.1f}s")
686
 
687
  # Step 4: Build entries from segments
688
  entries = []
@@ -699,7 +696,6 @@ def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=
699
  if not entries:
700
  return {"transcript": "", "error": None, "title": title}
701
 
702
- # Step 5: Denoise
703
  if denoise_flag:
704
  deduped = []
705
  prev_text = None
@@ -715,7 +711,6 @@ def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=
715
  prev_text = txt
716
  entries = deduped
717
 
718
- # Step 6: Format output
719
  if fmt == "json":
720
  return {"transcript": entries, "error": None, "title": title}
721
  elif fmt == "srt":
@@ -780,7 +775,7 @@ async def get_transcripts(request: TranscriptRequest):
780
  "error": result["error"],
781
  }
782
 
783
- # YouTube (existing logic)
784
  async with _fetch_semaphore:
785
  result, title = await asyncio.gather(
786
  loop.run_in_executor(
 
33
  logger = logging.getLogger(__name__)
34
 
35
  app = FastAPI(title="YouTube Transcript Extractor")
36
+ # Version: 3.3.0 - wait_for_selector DOM extraction + ffmpeg direct URL audio extraction
37
 
38
  app.add_middleware(
39
  CORSMiddleware,
 
60
  # Check if ffmpeg is available for audio extraction
61
  _has_ffmpeg = shutil.which('ffmpeg') is not None
62
  if _has_ffmpeg:
63
+ logger.info("ffmpeg found - will extract audio directly from URL")
64
  else:
65
+ logger.info("ffmpeg not found - will download full video for Groq")
66
 
67
  # --- Proxy support (optional PROXY_URL env var) ---
68
  _proxy_url = os.environ.get("PROXY_URL", "")
 
154
  url = url.strip()
155
  if not url:
156
  return None
 
157
  url = re.sub(r'[&?](si|feature|utm_\w+|fbclid|gclid)=[^&]*', '', url)
158
  patterns = [
159
  r"(?:(?:m\.)?youtube\.com/watch\?.*v=)([a-zA-Z0-9_-]{11})",
 
339
  last_error = str(e)
340
  logger.warning(f"[{api_name}] attempt {attempt+1} Failed for {video_id}: {last_error[:200]}")
341
 
 
342
  if "No transcripts" in last_error or "disabled" in last_error.lower():
343
  return {"transcript": None, "error": _format_error(last_error)}
344
 
 
345
  if attempt < max_retries - 1:
346
+ delay = 2 ** (attempt + 1)
347
  logger.info(f"Retrying {video_id} after {delay}s delay (attempt {attempt+1})")
348
  time.sleep(delay)
349
 
 
350
  for api_name, api in apis_to_try:
351
  try:
352
  logger.info(f"[{api_name}] Trying without language filter for {video_id}")
 
355
  except Exception as e:
356
  logger.warning(f"[{api_name}] No-lang fallback failed for {video_id}: {str(e)[:200]}")
357
 
 
358
  for api_name, api in apis_to_try:
359
  try:
360
  logger.info(f"[{api_name}] Listing transcripts for {video_id}")
 
375
 
376
  # ---------------------------------------------------------------------------
377
  # Instagram video URL extraction: 2-tier cascade
378
+ # 1. Playwright embed page (cookie-free) - reads <video src> via wait_for_selector
379
  # 2. Playwright full page with cookies (fallback for private/restricted)
380
  #
381
  # Optimizations:
382
  # - Dedicated single-thread executor for Playwright (thread-safety)
383
+ # - Persistent browser instance pre-warmed at startup
384
+ # - Targeted wait_for_selector('video[src]') on the embed page (~1s with a warm browser)
385
+ # - ffmpeg extracts audio directly from URL (skip full video download)
 
386
  # ---------------------------------------------------------------------------
387
 
 
388
  _pw_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix='playwright')
389
  _ig_browser = None
390
  _ig_pw = None
 
410
  return _ig_browser
411
 
412
 
413
+ # Pre-warm browser at import time
414
  _pw_executor.submit(_pw_init_browser)
415
 
416
 
417
  def _pw_extract_embed(shortcode):
418
+ """Run inside _pw_executor thread. Extract video URL from embed page via DOM."""
419
  browser = _pw_init_browser()
420
  ctx = browser.new_context(
421
  user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
 
428
  timeout=15000,
429
  )
430
 
431
+ # Wait for <video src=...> element (typically appears in ~1s with warm browser)
432
  video_url = None
433
  try:
434
  video_el = page.wait_for_selector('video[src]', timeout=5000)
 
602
  return None, None, f"Browser extraction failed: {str(e)[:200]}"
603
 
604
 
605
def _download_audio(video_url, tmpdir):
    """Fetch the media behind ``video_url`` and return a file ready for upload.

    When ffmpeg is available, the audio track is pulled straight from the
    URL and written as ``audio.m4a`` (AAC, 64 kbit/s), so the full video is
    never stored locally. If ffmpeg is missing, fails, or produces an
    implausibly small file, the full video is downloaded instead.

    Args:
        video_url: Direct URL of the video to fetch.
        tmpdir: Existing directory in which the output file is created.

    Returns:
        A ``(path, filename)`` tuple: the path of the file on disk and the
        filename to present to the transcription API (``'audio.m4a'`` or
        ``'video.mp4'``).

    Raises:
        ValueError: If the fallback download is smaller than 1 KB.
        Exception: Any error raised by the HTTP client in the fallback
            path, including a non-2xx response status.
    """
    if _has_ffmpeg:
        audio_path = os.path.join(tmpdir, 'audio.m4a')
        try:
            subprocess.run(
                ['ffmpeg',
                 # ffmpeg applies an option to the NEXT file on the command
                 # line, so -headers must precede -i to reach the HTTP input.
                 '-headers', 'User-Agent: Mozilla/5.0\r\nReferer: https://www.instagram.com/\r\n',
                 '-i', video_url,
                 '-vn', '-acodec', 'aac', '-b:a', '64k',
                 '-y', '-loglevel', 'error', audio_path],
                timeout=20, check=True, capture_output=True,
            )
            size = os.path.getsize(audio_path) if os.path.exists(audio_path) else 0
            if size > 100:
                logger.info(f"[instagram] Audio extracted from URL: {size/1024:.0f}KB")
                return audio_path, 'audio.m4a'
            # Tiny/empty output: fall through to the full download below.
            logger.warning(f"[instagram] ffmpeg produced unusable audio ({size} bytes), downloading full video")
        except Exception as e:
            # Best effort: any ffmpeg failure falls back to a plain download.
            logger.warning(f"[instagram] ffmpeg URL extraction failed: {e}")

    # Fallback: download full video
    video_path = os.path.join(tmpdir, 'video.mp4')
    r = _requests_mod.get(video_url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Referer': 'https://www.instagram.com/',
    }, timeout=30, stream=True)
    try:
        # Reject 4xx/5xx up front so an error page is never saved as video.
        r.raise_for_status()
        with open(video_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=65536):
                f.write(chunk)
    finally:
        # A streamed response holds its connection until explicitly closed.
        r.close()
    size = os.path.getsize(video_path)
    if size < 1024:
        raise ValueError("Downloaded video is too small")
    logger.info(f"[instagram] Downloaded full video: {size/1024:.0f}KB")
    return video_path, 'video.mp4'
642
 
643
 
644
  def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=False, timestamps=False):
 
647
 
648
  t0 = time.time()
649
 
650
+ # Step 1: Extract video URL
651
  video_url, title, err = _extract_ig_video_url(url)
652
  t1 = time.time()
653
+ logger.info(f"[instagram] Step1 URL extraction: {t1-t0:.1f}s")
654
  if err:
655
  return {"transcript": None, "error": err, "title": title}
656
 
657
  with tempfile.TemporaryDirectory() as tmpdir:
658
+ # Step 2: Download/extract audio
 
659
  try:
660
+ upload_path, filename = _download_audio(video_url, tmpdir)
 
 
 
 
 
 
 
 
661
  except Exception as e:
662
+ return {"transcript": None, "error": f"Audio download failed: {str(e)[:200]}", "title": title}
663
 
664
  t2 = time.time()
665
+ logger.info(f"[instagram] Step2 download/audio: {t2-t1:.1f}s")
 
 
 
 
 
 
666
 
667
  # Step 3: Transcribe with Groq Whisper API
 
 
668
  try:
669
  with open(upload_path, "rb") as audio_file:
670
  result = _groq_client.audio.transcriptions.create(
 
678
  return {"transcript": None, "error": f"Transcription failed: {str(e)[:200]}", "title": title}
679
 
680
  t3 = time.time()
681
+ logger.info(f"[instagram] Step3 Groq STT: {t3-t2:.1f}s")
682
+ logger.info(f"[instagram] TOTAL: {t3-t0:.1f}s")
683
 
684
  # Step 4: Build entries from segments
685
  entries = []
 
696
  if not entries:
697
  return {"transcript": "", "error": None, "title": title}
698
 
 
699
  if denoise_flag:
700
  deduped = []
701
  prev_text = None
 
711
  prev_text = txt
712
  entries = deduped
713
 
 
714
  if fmt == "json":
715
  return {"transcript": entries, "error": None, "title": title}
716
  elif fmt == "srt":
 
775
  "error": result["error"],
776
  }
777
 
778
+ # YouTube
779
  async with _fetch_semaphore:
780
  result, title = await asyncio.gather(
781
  loop.run_in_executor(