Spaces:
Sleeping
Sleeping
duck3-create Claude Opus 4.6 committed on
Commit ·
f4b3580
1
Parent(s): 3678e06
Optimize Instagram extraction: 8s -> 5s with persistent browser
Browse files
- Dedicated single-thread Playwright executor for thread-safety
- Persistent browser instance pre-warmed at startup (saves ~1.5s cold-start)
- domcontentloaded + wait_for_selector('video[src]') instead of networkidle
- ffmpeg audio extraction before Groq upload (5MB->300KB on Docker)
- Streaming video download for memory efficiency
- Added per-step timing logs for pipeline monitoring
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
main.py
CHANGED
|
@@ -2,6 +2,7 @@ import logging
|
|
| 2 |
import json
|
| 3 |
import urllib.request
|
| 4 |
import tempfile
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
# Load .env file if exists
|
|
@@ -26,12 +27,13 @@ from concurrent.futures import ThreadPoolExecutor
|
|
| 26 |
import os
|
| 27 |
import urllib.parse
|
| 28 |
import requests as _requests_mod
|
|
|
|
| 29 |
|
| 30 |
logging.basicConfig(level=logging.INFO)
|
| 31 |
logger = logging.getLogger(__name__)
|
| 32 |
|
| 33 |
app = FastAPI(title="YouTube Transcript Extractor")
|
| 34 |
-
# Version: 3.
|
| 35 |
|
| 36 |
app.add_middleware(
|
| 37 |
CORSMiddleware,
|
|
@@ -55,6 +57,13 @@ else:
|
|
| 55 |
|
| 56 |
_ig_semaphore = asyncio.Semaphore(2) # max 2 concurrent Instagram transcriptions
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
# --- Proxy support (optional PROXY_URL env var) ---
|
| 59 |
_proxy_url = os.environ.get("PROXY_URL", "")
|
| 60 |
_proxy_config = None
|
|
@@ -355,20 +364,17 @@ def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, kee
|
|
| 355 |
try:
|
| 356 |
logger.info(f"[{api_name}] Listing transcripts for {video_id}")
|
| 357 |
transcript_list = api.list(video_id)
|
| 358 |
-
# Try to find preferred language transcript
|
| 359 |
for lang in languages:
|
| 360 |
for t in transcript_list:
|
| 361 |
if t.language_code == lang:
|
| 362 |
data = t.fetch()
|
| 363 |
return _process_result(data)
|
| 364 |
-
# Take any available transcript
|
| 365 |
for t in transcript_list:
|
| 366 |
data = t.fetch()
|
| 367 |
return _process_result(data)
|
| 368 |
except Exception as e:
|
| 369 |
logger.warning(f"[{api_name}] List fallback failed for {video_id}: {str(e)[:200]}")
|
| 370 |
|
| 371 |
-
# All attempts failed
|
| 372 |
return {"transcript": None, "error": _format_error(last_error or "Unknown error")}
|
| 373 |
|
| 374 |
|
|
@@ -376,87 +382,197 @@ def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, kee
|
|
| 376 |
# Instagram video URL extraction: 2-tier cascade
|
| 377 |
# 1. Playwright embed page (cookie-free) - renders /p/{shortcode}/embed/
|
| 378 |
# 2. Playwright full page with cookies (fallback for private/restricted)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
# ---------------------------------------------------------------------------
|
| 380 |
|
| 381 |
-
|
| 382 |
-
|
|
|
|
|
|
|
|
|
|
| 383 |
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 388 |
from playwright.sync_api import sync_playwright
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
try:
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
)
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
if not title:
|
| 421 |
-
og_title = page.query_selector('meta[property="og:title"]')
|
| 422 |
-
if og_title:
|
| 423 |
-
title = og_title.get_attribute('content')
|
| 424 |
-
|
| 425 |
-
browser.close()
|
| 426 |
-
|
| 427 |
-
if video_url:
|
| 428 |
-
logger.info(f"[embed/playwright] Extracted video URL for {shortcode}")
|
| 429 |
-
return video_url, title, None
|
| 430 |
-
return None, title, "No video element found in embed page"
|
| 431 |
except Exception as e:
|
| 432 |
return None, None, f"Embed extraction failed: {str(e)[:200]}"
|
| 433 |
|
| 434 |
|
| 435 |
def _extract_ig_video_url(url):
|
| 436 |
"""Extract Instagram video URL. Tries cookie-free embed first, falls back to authenticated Playwright."""
|
| 437 |
-
# Extract shortcode from URL
|
| 438 |
ig_match = re.search(
|
| 439 |
r"(?:instagram\.com|instagr\.am)/(?:reel|reels|p|tv)/([A-Za-z0-9_-]+)", url
|
| 440 |
)
|
| 441 |
shortcode = ig_match.group(1) if ig_match else None
|
| 442 |
|
| 443 |
if shortcode:
|
| 444 |
-
# Method 1: Embed page rendered in Playwright (cookie-free)
|
| 445 |
logger.info(f"[instagram] Trying embed page (no cookies) for {shortcode}")
|
| 446 |
video_url, title, err = _extract_ig_video_url_embed(shortcode)
|
| 447 |
if video_url:
|
| 448 |
return video_url, title, None
|
| 449 |
logger.info(f"[instagram] Embed failed: {err}")
|
| 450 |
|
| 451 |
-
# Method 2: Playwright with cookies (final fallback)
|
| 452 |
logger.info(f"[instagram] Falling back to Playwright with cookies for {url}")
|
| 453 |
return _extract_ig_video_url_playwright(url)
|
| 454 |
|
| 455 |
|
| 456 |
def _extract_ig_video_url_playwright(url):
|
| 457 |
-
"""Use Playwright
|
| 458 |
import http.cookiejar as _hcj
|
| 459 |
-
from playwright.sync_api import sync_playwright
|
| 460 |
|
| 461 |
_ig_cookie_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "instagram_cookies.txt")
|
| 462 |
if not os.path.exists(_ig_cookie_path):
|
|
@@ -482,120 +598,80 @@ def _extract_ig_video_url_playwright(url):
|
|
| 482 |
if not pw_cookies:
|
| 483 |
return None, None, "Instagram cookies not found. Please provide instagram_cookies.txt."
|
| 484 |
|
| 485 |
-
video_urls = []
|
| 486 |
-
titles = []
|
| 487 |
-
|
| 488 |
-
def _dig_video(obj, vlist, tlist, depth=0):
|
| 489 |
-
if depth > 20:
|
| 490 |
-
return
|
| 491 |
-
if isinstance(obj, dict):
|
| 492 |
-
vu = obj.get('video_url')
|
| 493 |
-
if vu and isinstance(vu, str) and vu.startswith('http'):
|
| 494 |
-
vlist.append(vu)
|
| 495 |
-
vv = obj.get('video_versions')
|
| 496 |
-
if isinstance(vv, list):
|
| 497 |
-
for v in vv:
|
| 498 |
-
if isinstance(v, dict) and v.get('url'):
|
| 499 |
-
vlist.append(v['url'])
|
| 500 |
-
cap = obj.get('caption')
|
| 501 |
-
if isinstance(cap, dict) and cap.get('text'):
|
| 502 |
-
tlist.append(cap['text'][:100])
|
| 503 |
-
cap_edges = obj.get('edge_media_to_caption')
|
| 504 |
-
if isinstance(cap_edges, dict):
|
| 505 |
-
edges = cap_edges.get('edges', [])
|
| 506 |
-
if edges and isinstance(edges[0], dict):
|
| 507 |
-
node = edges[0].get('node', {})
|
| 508 |
-
if isinstance(node, dict) and node.get('text'):
|
| 509 |
-
tlist.append(node['text'][:100])
|
| 510 |
-
for v in obj.values():
|
| 511 |
-
_dig_video(v, vlist, tlist, depth + 1)
|
| 512 |
-
elif isinstance(obj, list):
|
| 513 |
-
for item in obj:
|
| 514 |
-
_dig_video(item, vlist, tlist, depth + 1)
|
| 515 |
-
|
| 516 |
try:
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
ctx = browser.new_context(
|
| 523 |
-
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
| 524 |
-
viewport={'width': 1280, 'height': 720},
|
| 525 |
-
)
|
| 526 |
-
ctx.add_cookies(pw_cookies)
|
| 527 |
-
page = ctx.new_page()
|
| 528 |
-
|
| 529 |
-
def _on_resp(resp):
|
| 530 |
-
if resp.status != 200:
|
| 531 |
-
return
|
| 532 |
-
u = resp.url
|
| 533 |
-
if 'graphql' not in u and '/api/v1/' not in u:
|
| 534 |
-
return
|
| 535 |
-
ct = resp.headers.get('content-type', '')
|
| 536 |
-
if 'json' not in ct and 'text' not in ct:
|
| 537 |
-
return
|
| 538 |
-
try:
|
| 539 |
-
body = resp.text()
|
| 540 |
-
if 'video_url' in body or 'video_versions' in body:
|
| 541 |
-
_dig_video(json.loads(body), video_urls, titles)
|
| 542 |
-
except Exception:
|
| 543 |
-
pass
|
| 544 |
-
|
| 545 |
-
page.on('response', _on_resp)
|
| 546 |
-
page.goto(url, wait_until='domcontentloaded', timeout=30000)
|
| 547 |
-
# Wait up to 5s, exit early if video URL found
|
| 548 |
-
for _ in range(10):
|
| 549 |
-
page.wait_for_timeout(500)
|
| 550 |
-
if video_urls:
|
| 551 |
-
break
|
| 552 |
-
|
| 553 |
-
page_title = page.evaluate("""() => {
|
| 554 |
-
const d = document.querySelector('meta[property="og:description"]');
|
| 555 |
-
if (d) return d.content;
|
| 556 |
-
const t = document.querySelector('meta[property="og:title"]');
|
| 557 |
-
if (t) return t.content;
|
| 558 |
-
return document.title || null;
|
| 559 |
-
}""")
|
| 560 |
-
browser.close()
|
| 561 |
except Exception as e:
|
| 562 |
return None, None, f"Browser extraction failed: {str(e)[:200]}"
|
| 563 |
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
|
| 569 |
|
| 570 |
def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=False, timestamps=False):
|
| 571 |
if not _groq_client:
|
| 572 |
return {"transcript": None, "error": "Instagram transcription not configured (GROQ_API_KEY missing).", "title": None}
|
| 573 |
|
| 574 |
-
|
|
|
|
|
|
|
| 575 |
video_url, title, err = _extract_ig_video_url(url)
|
|
|
|
|
|
|
| 576 |
if err:
|
| 577 |
return {"transcript": None, "error": err, "title": title}
|
| 578 |
|
| 579 |
with tempfile.TemporaryDirectory() as tmpdir:
|
| 580 |
-
# Step 2: Download video
|
| 581 |
video_path = os.path.join(tmpdir, 'video.mp4')
|
| 582 |
try:
|
| 583 |
r = _requests_mod.get(video_url, headers={
|
| 584 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
| 585 |
'Referer': 'https://www.instagram.com/',
|
| 586 |
-
}, timeout=
|
| 587 |
with open(video_path, 'wb') as f:
|
| 588 |
-
|
|
|
|
| 589 |
if os.path.getsize(video_path) < 1024:
|
| 590 |
return {"transcript": None, "error": "Downloaded video is too small.", "title": title}
|
| 591 |
except Exception as e:
|
| 592 |
return {"transcript": None, "error": f"Video download failed: {str(e)[:200]}", "title": title}
|
| 593 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
# Step 3: Transcribe with Groq Whisper API
|
|
|
|
|
|
|
| 595 |
try:
|
| 596 |
-
with open(
|
| 597 |
result = _groq_client.audio.transcriptions.create(
|
| 598 |
-
file=(
|
| 599 |
model="whisper-large-v3-turbo",
|
| 600 |
response_format="verbose_json",
|
| 601 |
language=None if language == "auto" else language,
|
|
@@ -604,6 +680,10 @@ def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=
|
|
| 604 |
except Exception as e:
|
| 605 |
return {"transcript": None, "error": f"Transcription failed: {str(e)[:200]}", "title": title}
|
| 606 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
# Step 4: Build entries from segments
|
| 608 |
entries = []
|
| 609 |
if hasattr(result, 'segments') and result.segments:
|
|
|
|
| 2 |
import json
|
| 3 |
import urllib.request
|
| 4 |
import tempfile
|
| 5 |
+
import subprocess
|
| 6 |
from pathlib import Path
|
| 7 |
|
| 8 |
# Load .env file if exists
|
|
|
|
| 27 |
import os
|
| 28 |
import urllib.parse
|
| 29 |
import requests as _requests_mod
|
| 30 |
+
import shutil
|
| 31 |
|
| 32 |
logging.basicConfig(level=logging.INFO)
|
| 33 |
logger = logging.getLogger(__name__)
|
| 34 |
|
| 35 |
app = FastAPI(title="YouTube Transcript Extractor")
|
| 36 |
+
# Version: 3.2.0 - Persistent browser + audio-only extraction + dedicated Playwright thread
|
| 37 |
|
| 38 |
app.add_middleware(
|
| 39 |
CORSMiddleware,
|
|
|
|
| 57 |
|
| 58 |
_ig_semaphore = asyncio.Semaphore(2) # max 2 concurrent Instagram transcriptions
|
| 59 |
|
| 60 |
+
# Check if ffmpeg is available for audio extraction
|
| 61 |
+
_has_ffmpeg = shutil.which('ffmpeg') is not None
|
| 62 |
+
if _has_ffmpeg:
|
| 63 |
+
logger.info("ffmpeg found - will extract audio before transcription")
|
| 64 |
+
else:
|
| 65 |
+
logger.info("ffmpeg not found - will send full video to Groq")
|
| 66 |
+
|
| 67 |
# --- Proxy support (optional PROXY_URL env var) ---
|
| 68 |
_proxy_url = os.environ.get("PROXY_URL", "")
|
| 69 |
_proxy_config = None
|
|
|
|
| 364 |
try:
|
| 365 |
logger.info(f"[{api_name}] Listing transcripts for {video_id}")
|
| 366 |
transcript_list = api.list(video_id)
|
|
|
|
| 367 |
for lang in languages:
|
| 368 |
for t in transcript_list:
|
| 369 |
if t.language_code == lang:
|
| 370 |
data = t.fetch()
|
| 371 |
return _process_result(data)
|
|
|
|
| 372 |
for t in transcript_list:
|
| 373 |
data = t.fetch()
|
| 374 |
return _process_result(data)
|
| 375 |
except Exception as e:
|
| 376 |
logger.warning(f"[{api_name}] List fallback failed for {video_id}: {str(e)[:200]}")
|
| 377 |
|
|
|
|
| 378 |
return {"transcript": None, "error": _format_error(last_error or "Unknown error")}
|
| 379 |
|
| 380 |
|
|
|
|
| 382 |
# Instagram video URL extraction: 2-tier cascade
|
| 383 |
# 1. Playwright embed page (cookie-free) - renders /p/{shortcode}/embed/
|
| 384 |
# 2. Playwright full page with cookies (fallback for private/restricted)
|
| 385 |
+
#
|
| 386 |
+
# Optimizations:
|
| 387 |
+
# - Dedicated single-thread executor for Playwright (thread-safety)
|
| 388 |
+
# - Persistent browser instance (avoids ~1.5s cold-start per request)
|
| 389 |
+
# - Pre-warmed at import time via the dedicated thread
|
| 390 |
+
# - domcontentloaded + targeted wait_for_selector (vs networkidle)
|
| 391 |
+
# - ffmpeg audio extraction before Groq upload (5MB->300KB)
|
| 392 |
# ---------------------------------------------------------------------------
|
| 393 |
|
| 394 |
+
# Dedicated single thread for all Playwright operations (Playwright sync API is thread-bound)
|
| 395 |
+
_pw_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix='playwright')
|
| 396 |
+
_ig_browser = None
|
| 397 |
+
_ig_pw = None
|
| 398 |
+
|
| 399 |
|
| 400 |
+
def _pw_init_browser():
    """Return the shared Chromium browser, launching it when necessary.

    Must run inside the ``_pw_executor`` thread, because the module routes
    every Playwright call through that single dedicated thread.

    Returns:
        The connected browser stored in the module-level ``_ig_browser``.

    Raises:
        Whatever Playwright raises if the driver cannot start or Chromium
        cannot be launched.
    """
    global _ig_browser, _ig_pw
    if _ig_browser and _ig_browser.is_connected():
        return _ig_browser

    # Forget the dead browser before relaunching so a failed launch below
    # cannot leave a disconnected handle cached in the module globals.
    _ig_browser = None
    if _ig_pw:
        try:
            _ig_pw.stop()
        except Exception as e:
            # Best effort: the old driver may already be gone. Record it
            # instead of hiding it, then carry on with a fresh start.
            logger.debug(f"[instagram] Ignoring error while stopping old Playwright driver: {e}")
        _ig_pw = None

    # Imported lazily so the module still loads when Playwright is absent.
    from playwright.sync_api import sync_playwright
    _ig_pw = sync_playwright().start()
    _ig_browser = _ig_pw.chromium.launch(
        headless=True,
        args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
    )
    logger.info("[instagram] Launched persistent Chromium browser")
    return _ig_browser
|
| 418 |
|
| 419 |
+
|
| 420 |
+
# Pre-warm browser at import time (runs in dedicated Playwright thread)
|
| 421 |
+
_pw_executor.submit(_pw_init_browser)
|
| 422 |
+
|
| 423 |
+
|
| 424 |
+
def _pw_extract_embed(shortcode):
    """Extract the video URL and a title from an Instagram embed page.

    Must run inside the ``_pw_executor`` thread. Opens
    ``/p/{shortcode}/embed/`` in a fresh browser context on the shared
    browser and always closes that context, even when navigation or a
    selector lookup fails, so failed requests do not pile up contexts in
    the long-lived browser.

    Args:
        shortcode: Instagram post/reel shortcode.

    Returns:
        ``(video_url, title)``; either element may be ``None``.

    Raises:
        Any Playwright error from context creation or navigation.
    """
    browser = _pw_init_browser()
    ctx = browser.new_context(
        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        viewport={'width': 1280, 'height': 720},
    )

    def _http_src(element):
        # Only absolute http(s) sources are usable for the later download.
        src = element.get_attribute('src') if element else None
        return src if src and src.startswith('http') else None

    try:
        page = ctx.new_page()
        page.goto(
            f'https://www.instagram.com/p/{shortcode}/embed/',
            wait_until='domcontentloaded',
            timeout=15000,
        )

        try:
            video_url = _http_src(page.wait_for_selector('video[src]', timeout=5000))
        except Exception:
            # The targeted wait failed (typically a timeout); fall back to
            # whatever <video> element is currently in the DOM.
            video_url = _http_src(page.query_selector('video'))

        title = None
        caption_el = page.query_selector('.Caption, .CaptionUsername')
        if caption_el:
            title = caption_el.inner_text()[:100]
        if not title:
            og_title = page.query_selector('meta[property="og:title"]')
            if og_title:
                title = og_title.get_attribute('content')

        return video_url, title
    finally:
        # The browser is persistent, so an unclosed context would leak for
        # the lifetime of the process.
        ctx.close()
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
def _pw_extract_with_cookies(url, pw_cookies):
    """Extract a video URL by loading the page with cookies and sniffing API responses.

    Must run inside the ``_pw_executor`` thread. Loads ``url`` in a fresh
    context carrying ``pw_cookies``, inspects GraphQL / ``/api/v1/``
    responses for ``video_url`` / ``video_versions`` entries, and always
    closes the context, even on failure, so the persistent browser does not
    accumulate leaked contexts.

    Args:
        url: Instagram page URL to open.
        pw_cookies: Cookie dicts passed to ``BrowserContext.add_cookies``.

    Returns:
        ``(video_url, title)``; ``video_url`` is ``None`` when nothing was
        captured. ``title`` prefers a captured caption over page metadata.

    Raises:
        Any Playwright error from context setup, navigation or evaluation.
    """
    browser = _pw_init_browser()
    ctx = browser.new_context(
        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        viewport={'width': 1280, 'height': 720},
    )

    video_urls = []
    titles = []

    def _dig_video(obj, vlist, tlist, depth=0):
        # Recursively walk decoded JSON collecting video URLs and captions;
        # the depth cap bounds work on very deeply nested payloads.
        if depth > 20:
            return
        if isinstance(obj, dict):
            vu = obj.get('video_url')
            if vu and isinstance(vu, str) and vu.startswith('http'):
                vlist.append(vu)
            vv = obj.get('video_versions')
            if isinstance(vv, list):
                for v in vv:
                    if isinstance(v, dict) and v.get('url'):
                        vlist.append(v['url'])
            cap = obj.get('caption')
            if isinstance(cap, dict) and cap.get('text'):
                tlist.append(cap['text'][:100])
            cap_edges = obj.get('edge_media_to_caption')
            if isinstance(cap_edges, dict):
                edges = cap_edges.get('edges', [])
                if edges and isinstance(edges[0], dict):
                    node = edges[0].get('node', {})
                    if isinstance(node, dict) and node.get('text'):
                        tlist.append(node['text'][:100])
            for v in obj.values():
                _dig_video(v, vlist, tlist, depth + 1)
        elif isinstance(obj, list):
            for item in obj:
                _dig_video(item, vlist, tlist, depth + 1)

    def _on_resp(resp):
        # Only successful JSON/text responses from the API endpoints matter.
        if resp.status != 200:
            return
        u = resp.url
        if 'graphql' not in u and '/api/v1/' not in u:
            return
        ct = resp.headers.get('content-type', '')
        if 'json' not in ct and 'text' not in ct:
            return
        try:
            body = resp.text()
            if 'video_url' in body or 'video_versions' in body:
                _dig_video(json.loads(body), video_urls, titles)
        except Exception:
            # Deliberate best effort: one unreadable or non-JSON response
            # must not abort the whole extraction.
            pass

    try:
        ctx.add_cookies(pw_cookies)
        page = ctx.new_page()
        page.on('response', _on_resp)
        page.goto(url, wait_until='domcontentloaded', timeout=15000)
        # Poll for up to ~5s (10 x 500ms), exiting as soon as a URL shows up.
        for _ in range(10):
            page.wait_for_timeout(500)
            if video_urls:
                break

        page_title = page.evaluate("""() => {
            const d = document.querySelector('meta[property="og:description"]');
            if (d) return d.content;
            const t = document.querySelector('meta[property="og:title"]');
            if (t) return t.content;
            return document.title || null;
        }""")
    finally:
        # The browser is persistent, so an unclosed context would leak for
        # the lifetime of the process.
        ctx.close()

    title = titles[0] if titles else page_title
    return (video_urls[0] if video_urls else None), title
|
| 540 |
+
|
| 541 |
+
|
| 542 |
+
def _extract_ig_video_url_embed(shortcode):
    """Extract the video URL from the embed page via the Playwright thread.

    Submits ``_pw_extract_embed`` to ``_pw_executor`` and waits up to 25
    seconds for the result. This function never raises; failures are
    reported in the third tuple element.

    Args:
        shortcode: Instagram post/reel shortcode.

    Returns:
        ``(video_url, title, error)``. On success ``error`` is ``None``;
        otherwise ``video_url`` is ``None`` and ``error`` is a message.
    """
    future = None
    try:
        future = _pw_executor.submit(_pw_extract_embed, shortcode)
        video_url, title = future.result(timeout=25)
    except Exception as e:
        if future is not None:
            # If the job is still queued behind other work, drop it so it
            # does not occupy the single Playwright thread after the caller
            # has given up. cancel() is a no-op once the job is running.
            future.cancel()
        # Timeout exceptions stringify to '', so fall back to the type name
        # rather than emitting an empty error detail.
        detail = str(e) or type(e).__name__
        return None, None, f"Embed extraction failed: {detail[:200]}"

    if video_url:
        logger.info(f"[embed/playwright] Extracted video URL for {shortcode}")
        return video_url, title, None
    return None, title, "No video element found in embed page"
|
| 553 |
|
| 554 |
|
| 555 |
def _extract_ig_video_url(url):
    """Resolve an Instagram URL to a direct video URL.

    Tries the cookie-free embed page first when a shortcode can be parsed
    from ``url``; otherwise, or if that fails, defers to the cookie-based
    Playwright extraction.

    Returns:
        ``(video_url, title, error)`` as produced by the extractor used.
    """
    match = re.search(
        r"(?:instagram\.com|instagr\.am)/(?:reel|reels|p|tv)/([A-Za-z0-9_-]+)", url
    )

    if match is not None:
        # The capture group needs at least one character, so a match always
        # yields a non-empty shortcode.
        shortcode = match.group(1)
        logger.info(f"[instagram] Trying embed page (no cookies) for {shortcode}")
        embed_url, embed_title, embed_err = _extract_ig_video_url_embed(shortcode)
        if embed_url:
            return embed_url, embed_title, None
        logger.info(f"[instagram] Embed failed: {embed_err}")

    # Final fallback: authenticated browser session.
    logger.info(f"[instagram] Falling back to Playwright with cookies for {url}")
    return _extract_ig_video_url_playwright(url)
|
| 571 |
|
| 572 |
|
| 573 |
def _extract_ig_video_url_playwright(url):
|
| 574 |
+
"""Use Playwright with cookies to extract video URL. Dispatches to dedicated Playwright thread."""
|
| 575 |
import http.cookiejar as _hcj
|
|
|
|
| 576 |
|
| 577 |
_ig_cookie_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "instagram_cookies.txt")
|
| 578 |
if not os.path.exists(_ig_cookie_path):
|
|
|
|
| 598 |
if not pw_cookies:
|
| 599 |
return None, None, "Instagram cookies not found. Please provide instagram_cookies.txt."
|
| 600 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
try:
|
| 602 |
+
future = _pw_executor.submit(_pw_extract_with_cookies, url, pw_cookies)
|
| 603 |
+
video_url, title = future.result(timeout=25)
|
| 604 |
+
if video_url:
|
| 605 |
+
return video_url, title, None
|
| 606 |
+
return None, title, "Could not extract video URL. The video may be private or unavailable."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
except Exception as e:
|
| 608 |
return None, None, f"Browser extraction failed: {str(e)[:200]}"
|
| 609 |
|
| 610 |
+
|
| 611 |
+
def _extract_audio(video_path, tmpdir):
    """Extract an AAC audio track from a video with ffmpeg.

    Args:
        video_path: Path of the downloaded video file.
        tmpdir: Directory in which ``audio.m4a`` is written.

    Returns:
        Path of the extracted audio file, or ``video_path`` unchanged when
        ffmpeg is unavailable, fails, times out, or produces an output of
        100 bytes or less. This function never raises.
    """
    if not _has_ffmpeg:
        return video_path
    audio_path = os.path.join(tmpdir, 'audio.m4a')
    try:
        subprocess.run(
            ['ffmpeg', '-i', video_path, '-vn', '-acodec', 'aac', '-b:a', '64k',
             '-y', '-loglevel', 'error', audio_path],
            timeout=15, check=True, capture_output=True,
        )
        if os.path.exists(audio_path) and os.path.getsize(audio_path) > 100:
            logger.info(f"[instagram] Audio extracted: {os.path.getsize(video_path)/1024:.0f}KB -> {os.path.getsize(audio_path)/1024:.0f}KB")
            return audio_path
        # ffmpeg exited cleanly but wrote nothing useful (e.g. no audio
        # stream); say so instead of falling back silently.
        logger.warning("[instagram] ffmpeg produced no usable audio, using original video")
    except Exception as e:
        # stderr is captured, so include it in the log; the exception's own
        # message only carries the exit status or the timeout.
        raw_err = getattr(e, 'stderr', None)
        detail = ''
        if isinstance(raw_err, bytes):
            detail = raw_err.decode('utf-8', errors='replace').strip()[:200]
        suffix = f" ({detail})" if detail else ""
        logger.warning(f"[instagram] ffmpeg audio extraction failed: {e}{suffix}, using original video")
    return video_path
|
| 628 |
|
| 629 |
|
| 630 |
def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=False, timestamps=False):
|
| 631 |
if not _groq_client:
|
| 632 |
return {"transcript": None, "error": "Instagram transcription not configured (GROQ_API_KEY missing).", "title": None}
|
| 633 |
|
| 634 |
+
t0 = time.time()
|
| 635 |
+
|
| 636 |
+
# Step 1: Extract video URL (embed page -> Playwright with cookies)
|
| 637 |
video_url, title, err = _extract_ig_video_url(url)
|
| 638 |
+
t1 = time.time()
|
| 639 |
+
logger.info(f"[instagram] Video URL extraction took {t1-t0:.1f}s")
|
| 640 |
if err:
|
| 641 |
return {"transcript": None, "error": err, "title": title}
|
| 642 |
|
| 643 |
with tempfile.TemporaryDirectory() as tmpdir:
|
| 644 |
+
# Step 2: Download video (streaming)
|
| 645 |
video_path = os.path.join(tmpdir, 'video.mp4')
|
| 646 |
try:
|
| 647 |
r = _requests_mod.get(video_url, headers={
|
| 648 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
| 649 |
'Referer': 'https://www.instagram.com/',
|
| 650 |
+
}, timeout=30, stream=True)
|
| 651 |
with open(video_path, 'wb') as f:
|
| 652 |
+
for chunk in r.iter_content(chunk_size=65536):
|
| 653 |
+
f.write(chunk)
|
| 654 |
if os.path.getsize(video_path) < 1024:
|
| 655 |
return {"transcript": None, "error": "Downloaded video is too small.", "title": title}
|
| 656 |
except Exception as e:
|
| 657 |
return {"transcript": None, "error": f"Video download failed: {str(e)[:200]}", "title": title}
|
| 658 |
|
| 659 |
+
t2 = time.time()
|
| 660 |
+
logger.info(f"[instagram] Video download took {t2-t1:.1f}s ({os.path.getsize(video_path)/1024:.0f}KB)")
|
| 661 |
+
|
| 662 |
+
# Step 2.5: Extract audio only (much smaller file for Groq upload)
|
| 663 |
+
upload_path = _extract_audio(video_path, tmpdir)
|
| 664 |
+
t2b = time.time()
|
| 665 |
+
if upload_path != video_path:
|
| 666 |
+
logger.info(f"[instagram] Audio extraction took {t2b-t2:.1f}s")
|
| 667 |
+
|
| 668 |
# Step 3: Transcribe with Groq Whisper API
|
| 669 |
+
ext = os.path.splitext(upload_path)[1]
|
| 670 |
+
filename = f"audio{ext}"
|
| 671 |
try:
|
| 672 |
+
with open(upload_path, "rb") as audio_file:
|
| 673 |
result = _groq_client.audio.transcriptions.create(
|
| 674 |
+
file=(filename, audio_file),
|
| 675 |
model="whisper-large-v3-turbo",
|
| 676 |
response_format="verbose_json",
|
| 677 |
language=None if language == "auto" else language,
|
|
|
|
| 680 |
except Exception as e:
|
| 681 |
return {"transcript": None, "error": f"Transcription failed: {str(e)[:200]}", "title": title}
|
| 682 |
|
| 683 |
+
t3 = time.time()
|
| 684 |
+
logger.info(f"[instagram] Groq transcription took {t3-t2b:.1f}s")
|
| 685 |
+
logger.info(f"[instagram] Total pipeline: {t3-t0:.1f}s")
|
| 686 |
+
|
| 687 |
# Step 4: Build entries from segments
|
| 688 |
entries = []
|
| 689 |
if hasattr(result, 'segments') and result.segments:
|