duck3-create Claude Opus 4.6 committed on
Commit
3678e06
·
1 Parent(s): 11684f2

Add cookie-free Instagram extraction via embed page

Browse files

Primary method now renders Instagram embed page (/p/{shortcode}/embed/)
in Playwright without any cookies. Falls back to authenticated Playwright
with cookies for private/restricted content. Removes --single-process
flag that caused crashes on Windows.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. main.py +86 -6
main.py CHANGED
@@ -31,7 +31,7 @@ logging.basicConfig(level=logging.INFO)
31
  logger = logging.getLogger(__name__)
32
 
33
  app = FastAPI(title="YouTube Transcript Extractor")
34
- # Version: 2.0.0 - Instagram Support + Dockerfile
35
 
36
  app.add_middleware(
37
  CORSMiddleware,
@@ -372,8 +372,89 @@ def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, kee
372
  return {"transcript": None, "error": _format_error(last_error or "Unknown error")}
373
 
374
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  def _extract_ig_video_url(url):
376
- """Use Playwright to load Instagram page and capture video_url from GraphQL responses."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  import http.cookiejar as _hcj
378
  from playwright.sync_api import sync_playwright
379
 
@@ -436,7 +517,7 @@ def _extract_ig_video_url(url):
436
  with sync_playwright() as p:
437
  browser = p.chromium.launch(
438
  headless=True,
439
- args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu', '--single-process']
440
  )
441
  ctx = browser.new_context(
442
  user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
@@ -490,7 +571,7 @@ def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=
490
  if not _groq_client:
491
  return {"transcript": None, "error": "Instagram transcription not configured (GROQ_API_KEY missing).", "title": None}
492
 
493
- # Step 1: Extract video URL via Playwright browser
494
  video_url, title, err = _extract_ig_video_url(url)
495
  if err:
496
  return {"transcript": None, "error": err, "title": title}
@@ -499,8 +580,7 @@ def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=
499
  # Step 2: Download video
500
  video_path = os.path.join(tmpdir, 'video.mp4')
501
  try:
502
- import requests as dl_requests
503
- r = dl_requests.get(video_url, headers={
504
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
505
  'Referer': 'https://www.instagram.com/',
506
  }, timeout=60)
 
31
  logger = logging.getLogger(__name__)
32
 
33
  app = FastAPI(title="YouTube Transcript Extractor")
34
+ # Version: 3.0.0 - Cookie-free Instagram extraction (embed page + Playwright fallback)
35
 
36
  app.add_middleware(
37
  CORSMiddleware,
 
372
  return {"transcript": None, "error": _format_error(last_error or "Unknown error")}
373
 
374
 
375
+ # ---------------------------------------------------------------------------
376
+ # Instagram video URL extraction: 2-tier cascade
377
+ # 1. Playwright embed page (cookie-free) - renders /p/{shortcode}/embed/
378
+ # 2. Playwright full page with cookies (fallback for private/restricted)
379
+ # ---------------------------------------------------------------------------
380
+
381
def _extract_ig_video_url_embed(shortcode):
    """Extract a video URL by rendering the Instagram embed page (no cookies).

    Loads ``https://www.instagram.com/p/{shortcode}/embed/`` in headless
    Chromium via Playwright, waits for the network to go idle, and reads the
    ``src`` attribute of the first ``<video>`` element on the page.

    Args:
        shortcode: Instagram shortcode interpolated into the embed URL.

    Returns:
        A ``(video_url, title, error)`` tuple:
          * success: ``(url, title_or_None, None)``
          * no usable <video> src: ``(None, title_or_None, message)``
          * any exception: ``(None, None, "Embed extraction failed: ...")``
    """
    from playwright.sync_api import sync_playwright

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
            )
            # Close the browser even when navigation or a selector call
            # raises (e.g. the 30 s goto timeout), so a failed attempt does
            # not leave the Chromium instance open.
            try:
                ctx = browser.new_context(
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                    viewport={'width': 1280, 'height': 720},
                )
                page = ctx.new_page()
                page.goto(
                    f'https://www.instagram.com/p/{shortcode}/embed/',
                    wait_until='networkidle',
                    timeout=30000,
                )

                # Look for a <video> element with a direct http(s) src;
                # anything else (missing src, blob: URL, ...) is rejected.
                video_url = None
                video_el = page.query_selector('video')
                if video_el:
                    src = video_el.get_attribute('src')
                    if src and src.startswith('http'):
                        video_url = src

                # Prefer the embed caption (truncated to 100 chars) as title,
                # falling back to the og:title meta tag when it is absent/empty.
                title = None
                caption_el = page.query_selector('.Caption, .CaptionUsername')
                if caption_el:
                    title = caption_el.inner_text()[:100]
                if not title:
                    og_title = page.query_selector('meta[property="og:title"]')
                    if og_title:
                        title = og_title.get_attribute('content')
            finally:
                browser.close()

            if video_url:
                logger.info("[embed/playwright] Extracted video URL for %s", shortcode)
                return video_url, title, None
            return None, title, "No video element found in embed page"
    except Exception as e:
        # Best-effort tier: report the failure as an error string so the
        # caller can decide whether to try another extraction method.
        return None, None, f"Embed extraction failed: {str(e)[:200]}"
433
+
434
+
435
def _extract_ig_video_url(url):
    """Resolve an Instagram URL to a direct video URL.

    Two-step cascade:
      1. If a shortcode can be parsed from ``url``, try the cookie-free
         embed-page extraction.
      2. Otherwise, or if step 1 yields no video URL, delegate to the
         cookie-based Playwright extraction.

    Returns:
        The ``(video_url, title, error)`` tuple from whichever step
        produced the final result.
    """
    # Shortcode is the path segment after /reel/, /reels/, /p/ or /tv/.
    found = re.search(
        r"(?:instagram\.com|instagr\.am)/(?:reel|reels|p|tv)/([A-Za-z0-9_-]+)", url
    )

    if found is not None:
        shortcode = found.group(1)
        # Method 1: Embed page rendered in Playwright (cookie-free)
        logger.info(f"[instagram] Trying embed page (no cookies) for {shortcode}")
        embed_url, embed_title, embed_err = _extract_ig_video_url_embed(shortcode)
        if embed_url:
            return embed_url, embed_title, None
        logger.info(f"[instagram] Embed failed: {embed_err}")

    # Method 2: Playwright with cookies (final fallback)
    logger.info(f"[instagram] Falling back to Playwright with cookies for {url}")
    return _extract_ig_video_url_playwright(url)
454
+
455
+
456
+ def _extract_ig_video_url_playwright(url):
457
+ """Use Playwright to load Instagram page with cookies and capture video_url from GraphQL responses."""
458
  import http.cookiejar as _hcj
459
  from playwright.sync_api import sync_playwright
460
 
 
517
  with sync_playwright() as p:
518
  browser = p.chromium.launch(
519
  headless=True,
520
+ args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
521
  )
522
  ctx = browser.new_context(
523
  user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
 
571
  if not _groq_client:
572
  return {"transcript": None, "error": "Instagram transcription not configured (GROQ_API_KEY missing).", "title": None}
573
 
574
+ # Step 1: Extract video URL (embed page → Playwright with cookies)
575
  video_url, title, err = _extract_ig_video_url(url)
576
  if err:
577
  return {"transcript": None, "error": err, "title": title}
 
580
  # Step 2: Download video
581
  video_path = os.path.join(tmpdir, 'video.mp4')
582
  try:
583
+ r = _requests_mod.get(video_url, headers={
 
584
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
585
  'Referer': 'https://www.instagram.com/',
586
  }, timeout=60)