Spaces:
Sleeping
Sleeping
duck3-create Claude Opus 4.6 committed on
Commit ·
69d0e99
1
Parent(s): f4b3580
Optimize Instagram extraction to ~4s (was ~8s)
Browse files
- Revert network intercept to wait_for_selector (1s vs 6s)
- Persistent browser in dedicated thread with pre-warming
- ffmpeg direct URL->audio extraction on Docker (5MB->300KB)
- Streaming download + per-step timing logs
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
main.py
CHANGED
|
@@ -33,7 +33,7 @@ logging.basicConfig(level=logging.INFO)
|
|
| 33 |
logger = logging.getLogger(__name__)
|
| 34 |
|
| 35 |
app = FastAPI(title="YouTube Transcript Extractor")
|
| 36 |
-
# Version: 3.
|
| 37 |
|
| 38 |
app.add_middleware(
|
| 39 |
CORSMiddleware,
|
|
@@ -60,9 +60,9 @@ _ig_semaphore = asyncio.Semaphore(2) # max 2 concurrent Instagram transcription
|
|
| 60 |
# Check if ffmpeg is available for audio extraction
|
| 61 |
_has_ffmpeg = shutil.which('ffmpeg') is not None
|
| 62 |
if _has_ffmpeg:
|
| 63 |
-
logger.info("ffmpeg found - will extract audio
|
| 64 |
else:
|
| 65 |
-
logger.info("ffmpeg not found - will
|
| 66 |
|
| 67 |
# --- Proxy support (optional PROXY_URL env var) ---
|
| 68 |
_proxy_url = os.environ.get("PROXY_URL", "")
|
|
@@ -154,7 +154,6 @@ def extract_video_id(url: str) -> str | None:
|
|
| 154 |
url = url.strip()
|
| 155 |
if not url:
|
| 156 |
return None
|
| 157 |
-
# Remove tracking parameters
|
| 158 |
url = re.sub(r'[&?](si|feature|utm_\w+|fbclid|gclid)=[^&]*', '', url)
|
| 159 |
patterns = [
|
| 160 |
r"(?:(?:m\.)?youtube\.com/watch\?.*v=)([a-zA-Z0-9_-]{11})",
|
|
@@ -340,17 +339,14 @@ def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, kee
|
|
| 340 |
last_error = str(e)
|
| 341 |
logger.warning(f"[{api_name}] attempt {attempt+1} Failed for {video_id}: {last_error[:200]}")
|
| 342 |
|
| 343 |
-
# Don't retry if video genuinely has no subtitles
|
| 344 |
if "No transcripts" in last_error or "disabled" in last_error.lower():
|
| 345 |
return {"transcript": None, "error": _format_error(last_error)}
|
| 346 |
|
| 347 |
-
# Rate limit / transient error: retry after exponential backoff
|
| 348 |
if attempt < max_retries - 1:
|
| 349 |
-
delay = 2 ** (attempt + 1)
|
| 350 |
logger.info(f"Retrying {video_id} after {delay}s delay (attempt {attempt+1})")
|
| 351 |
time.sleep(delay)
|
| 352 |
|
| 353 |
-
# All language-specific attempts failed - try without language filter
|
| 354 |
for api_name, api in apis_to_try:
|
| 355 |
try:
|
| 356 |
logger.info(f"[{api_name}] Trying without language filter for {video_id}")
|
|
@@ -359,7 +355,6 @@ def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, kee
|
|
| 359 |
except Exception as e:
|
| 360 |
logger.warning(f"[{api_name}] No-lang fallback failed for {video_id}: {str(e)[:200]}")
|
| 361 |
|
| 362 |
-
# Final fallback: list available transcripts and fetch the best match
|
| 363 |
for api_name, api in apis_to_try:
|
| 364 |
try:
|
| 365 |
logger.info(f"[{api_name}] Listing transcripts for {video_id}")
|
|
@@ -380,18 +375,16 @@ def _fetch_transcript(video_id: str, language: str, denoise: bool, fmt: str, kee
|
|
| 380 |
|
| 381 |
# ---------------------------------------------------------------------------
|
| 382 |
# Instagram video URL extraction: 2-tier cascade
|
| 383 |
-
# 1. Playwright embed page (cookie-free)
|
| 384 |
# 2. Playwright full page with cookies (fallback for private/restricted)
|
| 385 |
#
|
| 386 |
# Optimizations:
|
| 387 |
# - Dedicated single-thread executor for Playwright (thread-safety)
|
| 388 |
-
# - Persistent browser instance
|
| 389 |
-
# -
|
| 390 |
-
# -
|
| 391 |
-
# - ffmpeg audio extraction before Groq upload (5MB->300KB)
|
| 392 |
# ---------------------------------------------------------------------------
|
| 393 |
|
| 394 |
-
# Dedicated single thread for all Playwright operations (Playwright sync API is thread-bound)
|
| 395 |
_pw_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix='playwright')
|
| 396 |
_ig_browser = None
|
| 397 |
_ig_pw = None
|
|
@@ -417,12 +410,12 @@ def _pw_init_browser():
|
|
| 417 |
return _ig_browser
|
| 418 |
|
| 419 |
|
| 420 |
-
# Pre-warm browser at import time
|
| 421 |
_pw_executor.submit(_pw_init_browser)
|
| 422 |
|
| 423 |
|
| 424 |
def _pw_extract_embed(shortcode):
|
| 425 |
-
"""Run inside _pw_executor thread. Extract video URL from embed page."""
|
| 426 |
browser = _pw_init_browser()
|
| 427 |
ctx = browser.new_context(
|
| 428 |
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
@@ -435,6 +428,7 @@ def _pw_extract_embed(shortcode):
|
|
| 435 |
timeout=15000,
|
| 436 |
)
|
| 437 |
|
|
|
|
| 438 |
video_url = None
|
| 439 |
try:
|
| 440 |
video_el = page.wait_for_selector('video[src]', timeout=5000)
|
|
@@ -608,23 +602,43 @@ def _extract_ig_video_url_playwright(url):
|
|
| 608 |
return None, None, f"Browser extraction failed: {str(e)[:200]}"
|
| 609 |
|
| 610 |
|
| 611 |
-
def
|
| 612 |
-
"""
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 628 |
|
| 629 |
|
| 630 |
def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=False, timestamps=False):
|
|
@@ -633,41 +647,24 @@ def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=
|
|
| 633 |
|
| 634 |
t0 = time.time()
|
| 635 |
|
| 636 |
-
# Step 1: Extract video URL
|
| 637 |
video_url, title, err = _extract_ig_video_url(url)
|
| 638 |
t1 = time.time()
|
| 639 |
-
logger.info(f"[instagram]
|
| 640 |
if err:
|
| 641 |
return {"transcript": None, "error": err, "title": title}
|
| 642 |
|
| 643 |
with tempfile.TemporaryDirectory() as tmpdir:
|
| 644 |
-
# Step 2: Download
|
| 645 |
-
video_path = os.path.join(tmpdir, 'video.mp4')
|
| 646 |
try:
|
| 647 |
-
|
| 648 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
| 649 |
-
'Referer': 'https://www.instagram.com/',
|
| 650 |
-
}, timeout=30, stream=True)
|
| 651 |
-
with open(video_path, 'wb') as f:
|
| 652 |
-
for chunk in r.iter_content(chunk_size=65536):
|
| 653 |
-
f.write(chunk)
|
| 654 |
-
if os.path.getsize(video_path) < 1024:
|
| 655 |
-
return {"transcript": None, "error": "Downloaded video is too small.", "title": title}
|
| 656 |
except Exception as e:
|
| 657 |
-
return {"transcript": None, "error": f"
|
| 658 |
|
| 659 |
t2 = time.time()
|
| 660 |
-
logger.info(f"[instagram]
|
| 661 |
-
|
| 662 |
-
# Step 2.5: Extract audio only (much smaller file for Groq upload)
|
| 663 |
-
upload_path = _extract_audio(video_path, tmpdir)
|
| 664 |
-
t2b = time.time()
|
| 665 |
-
if upload_path != video_path:
|
| 666 |
-
logger.info(f"[instagram] Audio extraction took {t2b-t2:.1f}s")
|
| 667 |
|
| 668 |
# Step 3: Transcribe with Groq Whisper API
|
| 669 |
-
ext = os.path.splitext(upload_path)[1]
|
| 670 |
-
filename = f"audio{ext}"
|
| 671 |
try:
|
| 672 |
with open(upload_path, "rb") as audio_file:
|
| 673 |
result = _groq_client.audio.transcriptions.create(
|
|
@@ -681,8 +678,8 @@ def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=
|
|
| 681 |
return {"transcript": None, "error": f"Transcription failed: {str(e)[:200]}", "title": title}
|
| 682 |
|
| 683 |
t3 = time.time()
|
| 684 |
-
logger.info(f"[instagram] Groq
|
| 685 |
-
logger.info(f"[instagram]
|
| 686 |
|
| 687 |
# Step 4: Build entries from segments
|
| 688 |
entries = []
|
|
@@ -699,7 +696,6 @@ def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=
|
|
| 699 |
if not entries:
|
| 700 |
return {"transcript": "", "error": None, "title": title}
|
| 701 |
|
| 702 |
-
# Step 5: Denoise
|
| 703 |
if denoise_flag:
|
| 704 |
deduped = []
|
| 705 |
prev_text = None
|
|
@@ -715,7 +711,6 @@ def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=
|
|
| 715 |
prev_text = txt
|
| 716 |
entries = deduped
|
| 717 |
|
| 718 |
-
# Step 6: Format output
|
| 719 |
if fmt == "json":
|
| 720 |
return {"transcript": entries, "error": None, "title": title}
|
| 721 |
elif fmt == "srt":
|
|
@@ -780,7 +775,7 @@ async def get_transcripts(request: TranscriptRequest):
|
|
| 780 |
"error": result["error"],
|
| 781 |
}
|
| 782 |
|
| 783 |
-
# YouTube
|
| 784 |
async with _fetch_semaphore:
|
| 785 |
result, title = await asyncio.gather(
|
| 786 |
loop.run_in_executor(
|
|
|
|
| 33 |
logger = logging.getLogger(__name__)
|
| 34 |
|
| 35 |
app = FastAPI(title="YouTube Transcript Extractor")
|
| 36 |
+
# Version: 3.3.0 - DOM wait_for_selector extraction + ffmpeg direct URL audio extraction
|
| 37 |
|
| 38 |
app.add_middleware(
|
| 39 |
CORSMiddleware,
|
|
|
|
| 60 |
# Check if ffmpeg is available for audio extraction
|
| 61 |
_has_ffmpeg = shutil.which('ffmpeg') is not None
|
| 62 |
if _has_ffmpeg:
|
| 63 |
+
logger.info("ffmpeg found - will extract audio directly from URL")
|
| 64 |
else:
|
| 65 |
+
logger.info("ffmpeg not found - will download full video for Groq")
|
| 66 |
|
| 67 |
# --- Proxy support (optional PROXY_URL env var) ---
|
| 68 |
_proxy_url = os.environ.get("PROXY_URL", "")
|
|
|
|
| 154 |
url = url.strip()
|
| 155 |
if not url:
|
| 156 |
return None
|
|
|
|
| 157 |
url = re.sub(r'[&?](si|feature|utm_\w+|fbclid|gclid)=[^&]*', '', url)
|
| 158 |
patterns = [
|
| 159 |
r"(?:(?:m\.)?youtube\.com/watch\?.*v=)([a-zA-Z0-9_-]{11})",
|
|
|
|
| 339 |
last_error = str(e)
|
| 340 |
logger.warning(f"[{api_name}] attempt {attempt+1} Failed for {video_id}: {last_error[:200]}")
|
| 341 |
|
|
|
|
| 342 |
if "No transcripts" in last_error or "disabled" in last_error.lower():
|
| 343 |
return {"transcript": None, "error": _format_error(last_error)}
|
| 344 |
|
|
|
|
| 345 |
if attempt < max_retries - 1:
|
| 346 |
+
delay = 2 ** (attempt + 1)
|
| 347 |
logger.info(f"Retrying {video_id} after {delay}s delay (attempt {attempt+1})")
|
| 348 |
time.sleep(delay)
|
| 349 |
|
|
|
|
| 350 |
for api_name, api in apis_to_try:
|
| 351 |
try:
|
| 352 |
logger.info(f"[{api_name}] Trying without language filter for {video_id}")
|
|
|
|
| 355 |
except Exception as e:
|
| 356 |
logger.warning(f"[{api_name}] No-lang fallback failed for {video_id}: {str(e)[:200]}")
|
| 357 |
|
|
|
|
| 358 |
for api_name, api in apis_to_try:
|
| 359 |
try:
|
| 360 |
logger.info(f"[{api_name}] Listing transcripts for {video_id}")
|
|
|
|
| 375 |
|
| 376 |
# ---------------------------------------------------------------------------
|
| 377 |
# Instagram video URL extraction: 2-tier cascade
|
| 378 |
+
# 1. Playwright embed page (cookie-free), waits for <video src> in the DOM
|
| 379 |
# 2. Playwright full page with cookies (fallback for private/restricted)
|
| 380 |
#
|
| 381 |
# Optimizations:
|
| 382 |
# - Dedicated single-thread executor for Playwright (thread-safety)
|
| 383 |
+
# - Persistent browser instance pre-warmed at startup
|
| 384 |
+
# - wait_for_selector('video[src]') reads the CDN URL from the DOM (~1s with a warm browser)
|
| 385 |
+
# - ffmpeg extracts audio directly from URL (skip full video download)
|
|
|
|
| 386 |
# ---------------------------------------------------------------------------
|
| 387 |
|
|
|
|
| 388 |
_pw_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix='playwright')
|
| 389 |
_ig_browser = None
|
| 390 |
_ig_pw = None
|
|
|
|
| 410 |
return _ig_browser
|
| 411 |
|
| 412 |
|
| 413 |
+
# Pre-warm browser at import time
|
| 414 |
_pw_executor.submit(_pw_init_browser)
|
| 415 |
|
| 416 |
|
| 417 |
def _pw_extract_embed(shortcode):
|
| 418 |
+
"""Run inside _pw_executor thread. Extract video URL from embed page via DOM."""
|
| 419 |
browser = _pw_init_browser()
|
| 420 |
ctx = browser.new_context(
|
| 421 |
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
|
|
| 428 |
timeout=15000,
|
| 429 |
)
|
| 430 |
|
| 431 |
+
# Wait for <video src=...> element (typically appears in ~1s with warm browser)
|
| 432 |
video_url = None
|
| 433 |
try:
|
| 434 |
video_el = page.wait_for_selector('video[src]', timeout=5000)
|
|
|
|
| 602 |
return None, None, f"Browser extraction failed: {str(e)[:200]}"
|
| 603 |
|
| 604 |
|
| 605 |
+
def _download_audio(video_url, tmpdir):
    """Fetch the media at *video_url* into *tmpdir* and return a file for Groq.

    If ffmpeg is available, the audio track is extracted directly from the URL
    (5MB video -> ~300KB audio). If ffmpeg is missing, fails, times out, or
    produces an implausibly small file, the full video is downloaded instead.

    Args:
        video_url: Direct URL of the video to fetch.
        tmpdir: Existing directory the output file is written into.

    Returns:
        A ``(path, filename)`` tuple: ``(<tmpdir>/audio.m4a, 'audio.m4a')`` when
        ffmpeg extraction succeeded, otherwise ``(<tmpdir>/video.mp4, 'video.mp4')``.

    Raises:
        ValueError: If the fallback download is smaller than 1 KiB.
        Exception: HTTP/network errors from the fallback download propagate to
            the caller (including a non-2xx status via ``raise_for_status``).
    """
    if _has_ffmpeg:
        audio_path = os.path.join(tmpdir, 'audio.m4a')
        try:
            subprocess.run(
                ['ffmpeg',
                 # ffmpeg applies each option to the NEXT file on the command
                 # line, so input options such as -headers must precede -i.
                 # Placed after -i they are treated as output options and the
                 # headers are never sent with the request for video_url.
                 '-headers', 'User-Agent: Mozilla/5.0\r\nReferer: https://www.instagram.com/\r\n',
                 '-i', video_url,
                 '-vn', '-acodec', 'aac', '-b:a', '64k',
                 '-y', '-loglevel', 'error', audio_path],
                timeout=20, check=True, capture_output=True,
            )
            size = os.path.getsize(audio_path) if os.path.exists(audio_path) else 0
            if size > 100:
                logger.info(f"[instagram] Audio extracted from URL: {size/1024:.0f}KB")
                return audio_path, 'audio.m4a'
            # ffmpeg exited 0 but wrote (almost) nothing: make the fallback visible.
            logger.warning(f"[instagram] ffmpeg produced only {size} bytes; falling back to full download")
        except Exception as e:
            # Deliberate best-effort: any ffmpeg problem falls through to the
            # plain download below. Include captured stderr (when present) so
            # the reason for the failure is not lost.
            stderr = getattr(e, 'stderr', None)
            if isinstance(stderr, bytes):
                stderr = stderr.decode('utf-8', 'replace')
            detail = f" | stderr: {stderr.strip()[:200]}" if stderr else ""
            logger.warning(f"[instagram] ffmpeg URL extraction failed: {e}{detail}")

    # Fallback: download full video
    video_path = os.path.join(tmpdir, 'video.mp4')
    r = _requests_mod.get(video_url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Referer': 'https://www.instagram.com/',
    }, timeout=30, stream=True)
    try:
        # Without this an HTTP error page (403/404) of 1 KiB or more would be
        # written to disk and later uploaded as if it were the video.
        r.raise_for_status()
        with open(video_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=65536):
                f.write(chunk)
    finally:
        # Streamed responses hold the connection until closed.
        r.close()
    size = os.path.getsize(video_path)
    if size < 1024:
        raise ValueError("Downloaded video is too small")
    logger.info(f"[instagram] Downloaded full video: {size/1024:.0f}KB")
    return video_path, 'video.mp4'
|
| 642 |
|
| 643 |
|
| 644 |
def _fetch_instagram_transcript(url, language, denoise_flag, fmt, keep_newlines=False, timestamps=False):
|
|
|
|
| 647 |
|
| 648 |
t0 = time.time()
|
| 649 |
|
| 650 |
+
# Step 1: Extract video URL
|
| 651 |
video_url, title, err = _extract_ig_video_url(url)
|
| 652 |
t1 = time.time()
|
| 653 |
+
logger.info(f"[instagram] Step1 URL extraction: {t1-t0:.1f}s")
|
| 654 |
if err:
|
| 655 |
return {"transcript": None, "error": err, "title": title}
|
| 656 |
|
| 657 |
with tempfile.TemporaryDirectory() as tmpdir:
|
| 658 |
+
# Step 2: Download/extract audio
|
|
|
|
| 659 |
try:
|
| 660 |
+
upload_path, filename = _download_audio(video_url, tmpdir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
except Exception as e:
|
| 662 |
+
return {"transcript": None, "error": f"Audio download failed: {str(e)[:200]}", "title": title}
|
| 663 |
|
| 664 |
t2 = time.time()
|
| 665 |
+
logger.info(f"[instagram] Step2 download/audio: {t2-t1:.1f}s")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 666 |
|
| 667 |
# Step 3: Transcribe with Groq Whisper API
|
|
|
|
|
|
|
| 668 |
try:
|
| 669 |
with open(upload_path, "rb") as audio_file:
|
| 670 |
result = _groq_client.audio.transcriptions.create(
|
|
|
|
| 678 |
return {"transcript": None, "error": f"Transcription failed: {str(e)[:200]}", "title": title}
|
| 679 |
|
| 680 |
t3 = time.time()
|
| 681 |
+
logger.info(f"[instagram] Step3 Groq STT: {t3-t2:.1f}s")
|
| 682 |
+
logger.info(f"[instagram] TOTAL: {t3-t0:.1f}s")
|
| 683 |
|
| 684 |
# Step 4: Build entries from segments
|
| 685 |
entries = []
|
|
|
|
| 696 |
if not entries:
|
| 697 |
return {"transcript": "", "error": None, "title": title}
|
| 698 |
|
|
|
|
| 699 |
if denoise_flag:
|
| 700 |
deduped = []
|
| 701 |
prev_text = None
|
|
|
|
| 711 |
prev_text = txt
|
| 712 |
entries = deduped
|
| 713 |
|
|
|
|
| 714 |
if fmt == "json":
|
| 715 |
return {"transcript": entries, "error": None, "title": title}
|
| 716 |
elif fmt == "srt":
|
|
|
|
| 775 |
"error": result["error"],
|
| 776 |
}
|
| 777 |
|
| 778 |
+
# YouTube
|
| 779 |
async with _fetch_semaphore:
|
| 780 |
result, title = await asyncio.gather(
|
| 781 |
loop.run_in_executor(
|