Spaces: ATInc1 / AIdea-Server

Running

App Files Community

Ahmed Mostafa committed 18 days ago

Commit

4b5adfb

1 Parent(s): 629e216

feat:fix "failed title fetch via yt-dlp, failed transcript retries, failed duration fetch via yt-dlp"

Browse files

Files changed (3) hide show

src/api/__pycache__/notes_routes.cpython-312.pyc +0 -0
src/api/downloader.py +58 -63
src/api/notes_routes.py +179 -157

src/api/__pycache__/notes_routes.cpython-312.pyc CHANGED Viewed

Binary files a/src/api/__pycache__/notes_routes.cpython-312.pyc and b/src/api/__pycache__/notes_routes.cpython-312.pyc differ

src/api/downloader.py CHANGED Viewed

@@ -1,97 +1,100 @@
 import logging
 import os
 import re
 import time
 import urllib.request
-import json
 logger = logging.getLogger(__name__)
-# Browser-like headers to reduce YouTube SSL/rate-limit rejections
-_YT_HEADERS = {
-    "User-Agent": (
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-        "AppleWebKit/537.36 (KHTML, like Gecko) "
-        "Chrome/124.0.0.0 Safari/537.36"
-    ),
-    "Accept-Language": "en-US,en;q=0.9,ar;q=0.8",
-    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-}
 class YouTubeDownloader:
     def __init__(self):
-        # سحب المفاتيح من الـ Environment
         self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
-        self._supadata_key   = os.environ.get("SUPADATA_API_KEY", "").strip()
     def get_transcript(self, url: str) -> str:
         video_id = self._extract_video_id(url)
-        logger.info(f"🔍 Pipeline for video ID: {video_id}")
-        # 1. الخطة أ: YouTube Transcript API (لو فيه ترجمة جاهزة)
-        # Retry up to 3 times with a short back-off to survive transient SSL hiccups
-        for _attempt in range(3):
             try:
                 from youtube_transcript_api import YouTubeTranscriptApi
                 transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
-                # Prefer manual captions; fall back to auto-generated in any language
                 try:
                     transcript = transcript_list.find_manually_created_transcript(["en", "ar", "en-US"])
                 except Exception:
                     try:
                         transcript = transcript_list.find_generated_transcript(["en", "ar", "en-US"])
                     except Exception:
-                        # Accept absolutely any available transcript
                         transcript = next(iter(transcript_list))
                 entries = transcript.fetch()
-                logger.info(f"✅ Plan A Success! (attempt {_attempt + 1})")
-                return " ".join(e["text"] for e in entries).strip()
-            except Exception as e:
-                logger.warning(f"⚠️ Plan A attempt {_attempt + 1} failed: {e}")
-                if _attempt < 2:
-                    time.sleep(1.5 * (_attempt + 1))  # 1.5 s, 3 s back-off
-        # 2. الخطة ب: Supadata (المنقذ الأول - بإصلاح الـ User-Agent)
         if self._supadata_key:
             try:
-                logger.info("🚀 Plan B: Calling Supadata...")
                 clean_url = f"https://www.youtube.com/watch?v={video_id}"
-                # إضافة Headers عشان نهرب من الـ 403 Forbidden
                 headers = {
                     "x-api-key": self._supadata_key,
-                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                 }
                 req = urllib.request.Request(
                     f"https://api.supadata.ai/v1/youtube/transcript?url={clean_url}&text=true",
-                    headers=headers
                 )
                 with urllib.request.urlopen(req, timeout=30) as resp:
                     data = json.loads(resp.read())
                     text = data.get("content", "").strip()
                     if text:
-                        logger.info("✅ Plan B Success!")
                         return text
-            except Exception as e:
-                logger.error(f"❌ Plan B (Supadata) failed: {e}")
-        # 3. الخطة ج: AssemblyAI — raw REST (bypasses SDK serialization bug)
         if self._assemblyai_key:
             try:
                 import httpx
                 import yt_dlp
                 clean_url = f"https://www.youtube.com/watch?v={video_id}"
-                ydl_opts = {'format': 'bestaudio', 'noplaylist': True, 'quiet': True}
                 with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                     info_dict = ydl.extract_info(clean_url, download=False)
-                    direct_audio_url = info_dict.get('url')
-                logger.info(f"Extracted direct audio URL: {str(direct_audio_url)[:50]}...")
                 aai_headers = {
                     "authorization": self._assemblyai_key,
@@ -99,17 +102,14 @@ class YouTubeDownloader:
                 }
                 payload = {
                     "audio_url": direct_audio_url,
-                    "speech_models": ["universal-2"],   # REQUIRED as a list by the API
                     "language_detection": True,
                 }
-                logger.info(
-                    f"ATTEMPTING PLAN C WITH PAYLOAD: {json.dumps(payload)}"
-                )
                 print(f"[PLAN C] PAYLOAD SENT TO ASSEMBLYAI: {json.dumps(payload)}")
                 with httpx.Client(timeout=30) as client:
-                    # Step 1 — submit the job
                     resp = client.post(
                         "https://api.assemblyai.com/v2/transcript",
                         headers=aai_headers,
@@ -117,37 +117,32 @@ class YouTubeDownloader:
                     )
                     resp.raise_for_status()
                     transcript_id = resp.json()["id"]
-                    logger.info(f"✅ Plan C job submitted. ID: {transcript_id}")
                     print(f"[PLAN C] Job ID: {transcript_id}")
-                    # Step 2 — poll until completed (max ~5 min)
                     polling_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"
-                    for _ in range(60):          # 60 × 5 s = 5 min max
                         time.sleep(5)
                         poll = client.get(polling_url, headers=aai_headers)
                         poll.raise_for_status()
                         result = poll.json()
                         status = result.get("status")
                         if status == "completed":
-                            logger.info("✅ Plan C Success!")
                             return result["text"]
-                        elif status == "error":
-                            logger.warning(
-                                f"⚠️ Plan C API Error: {result.get('error')}"
-                            )
                             break
-                        # status == "processing" or "queued" → keep polling
-            except Exception as e:
-                logger.error(f"❌ Plan C (AssemblyAI REST) failed: {e}")
-                print(f"[PLAN C] EXCEPTION: {e}")
-        raise RuntimeError(f"❌ All strategies exhausted for {video_id}. No transcript found.")
     def _extract_video_id(self, url: str) -> str:
-        # يدعم كل أنواع روابط يوتيوب
         match = re.search(r"(?:v=|youtu\.be/|shorts/|embed/)([A-Za-z0-9_-]{11})", str(url))
         return match.group(1) if match else "unknown"
     def cleanup(self, path=None):
-        pass

+import json
 import logging
 import os
 import re
 import time
 import urllib.request
 logger = logging.getLogger(__name__)
+_FAST_FAIL_SSL_MARKERS = (
+    "UNEXPECTED_EOF_WHILE_READING",
+    "SSLEOFError",
+    "EOF occurred in violation of protocol",
+)
+def _is_fast_fail_ssl_error(exc: Exception) -> bool:
+    error_text = str(exc)
+    return any(marker in error_text for marker in _FAST_FAIL_SSL_MARKERS)
 class YouTubeDownloader:
     def __init__(self):
         self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
+        self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
     def get_transcript(self, url: str) -> str:
         video_id = self._extract_video_id(url)
+        logger.info("Pipeline for video ID: %s", video_id)
+        # Plan A: use YouTube Transcript API first, but avoid retrying known SSL EOF failures.
+        for attempt in range(3):
             try:
                 from youtube_transcript_api import YouTubeTranscriptApi
                 transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
                 try:
                     transcript = transcript_list.find_manually_created_transcript(["en", "ar", "en-US"])
                 except Exception:
                     try:
                         transcript = transcript_list.find_generated_transcript(["en", "ar", "en-US"])
                     except Exception:
                         transcript = next(iter(transcript_list))
                 entries = transcript.fetch()
+                logger.info("Plan A success (attempt %s)", attempt + 1)
+                return " ".join(entry["text"] for entry in entries).strip()
+            except Exception as exc:
+                logger.warning("Plan A attempt %s failed: %s", attempt + 1, exc)
+                if _is_fast_fail_ssl_error(exc):
+                    logger.info("Plan A fast-fail: SSL transport error detected, switching to fallback immediately.")
+                    break
+                if attempt < 2:
+                    time.sleep(1.5 * (attempt + 1))
+        # Plan B: Supadata transcript API.
         if self._supadata_key:
             try:
+                logger.info("Plan B: calling Supadata")
                 clean_url = f"https://www.youtube.com/watch?v={video_id}"
                 headers = {
                     "x-api-key": self._supadata_key,
+                    "User-Agent": (
+                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                        "AppleWebKit/537.36 (KHTML, like Gecko) "
+                        "Chrome/120.0.0.0 Safari/537.36"
+                    ),
                 }
                 req = urllib.request.Request(
                     f"https://api.supadata.ai/v1/youtube/transcript?url={clean_url}&text=true",
+                    headers=headers,
                 )
                 with urllib.request.urlopen(req, timeout=30) as resp:
                     data = json.loads(resp.read())
                     text = data.get("content", "").strip()
                     if text:
+                        logger.info("Plan B success")
                         return text
+            except Exception as exc:
+                logger.error("Plan B (Supadata) failed: %s", exc)
+        # Plan C: AssemblyAI raw REST fallback.
         if self._assemblyai_key:
             try:
                 import httpx
                 import yt_dlp
                 clean_url = f"https://www.youtube.com/watch?v={video_id}"
+                ydl_opts = {"format": "bestaudio", "noplaylist": True, "quiet": True}
                 with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                     info_dict = ydl.extract_info(clean_url, download=False)
+                    direct_audio_url = info_dict.get("url")
+                logger.info("Extracted direct audio URL: %s...", str(direct_audio_url)[:50])
                 aai_headers = {
                     "authorization": self._assemblyai_key,
                 }
                 payload = {
                     "audio_url": direct_audio_url,
+                    "speech_models": ["universal-2"],
                     "language_detection": True,
                 }
+                logger.info("Attempting Plan C with payload: %s", json.dumps(payload))
                 print(f"[PLAN C] PAYLOAD SENT TO ASSEMBLYAI: {json.dumps(payload)}")
                 with httpx.Client(timeout=30) as client:
                     resp = client.post(
                         "https://api.assemblyai.com/v2/transcript",
                         headers=aai_headers,
                     )
                     resp.raise_for_status()
                     transcript_id = resp.json()["id"]
+                    logger.info("Plan C job submitted. ID: %s", transcript_id)
                     print(f"[PLAN C] Job ID: {transcript_id}")
                     polling_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"
+                    for _ in range(60):
                         time.sleep(5)
                         poll = client.get(polling_url, headers=aai_headers)
                         poll.raise_for_status()
                         result = poll.json()
                         status = result.get("status")
                         if status == "completed":
+                            logger.info("Plan C success")
                             return result["text"]
+                        if status == "error":
+                            logger.warning("Plan C API error: %s", result.get("error"))
                             break
+            except Exception as exc:
+                logger.error("Plan C (AssemblyAI REST) failed: %s", exc)
+                print(f"[PLAN C] EXCEPTION: {exc}")
+        raise RuntimeError(f"All strategies exhausted for {video_id}. No transcript found.")
     def _extract_video_id(self, url: str) -> str:
         match = re.search(r"(?:v=|youtu\.be/|shorts/|embed/)([A-Za-z0-9_-]{11})", str(url))
         return match.group(1) if match else "unknown"
     def cleanup(self, path=None):
+        pass

src/api/notes_routes.py CHANGED Viewed

@@ -1,57 +1,95 @@
-import uuid
-import re
-import logging
-import os
 import json
 import urllib.request
 from datetime import datetime
-from typing import List, Optional, Dict
-from pathlib import Path
-from fastapi import APIRouter, Depends, HTTPException, status, Query, BackgroundTasks
-from pydantic import BaseModel, HttpUrl, Field
-from src.db.firebase import get_firebase_db
-from src.db.models import User, Note
-from src.auth.dependencies import get_current_user
-from src.utils.logger import setup_logger
-from src.utils.config import settings
-# --- استدعاء أدوات المعالجة (النسخة الجديدة) ---
 from src.api.downloader import YouTubeDownloader
 from src.summarization.note_generator import NoteGenerator
 logger = setup_logger(__name__)
-# تم إزالة الـ prefix لضمان عمل الرابط /generate مباشرة
 router = APIRouter(tags=["Notes"])
-# مخزن المهام المؤقت في الذاكرة
 tasks: Dict[str, Dict] = {}
-# ==========================================
-# ⏱️ YouTube Duration Scraper (robust multi-strategy waterfall)
-# ==========================================
 def _extract_video_id(url: str) -> str:
     """Extract the 11-character YouTube video ID from any URL format."""
-    m = re.search(r"(?:v=|youtu\.be/|shorts/|embed/)([A-Za-z0-9_-]{11})", str(url))
-    return m.group(1) if m else ""
-def _duration_via_ytdlp(url: str) -> int:
     """
-    Strategy 1 — yt-dlp (most reliable).
-    Uses yt-dlp's Python API to extract metadata without downloading.
-    Handles cookies, consent pages, anti-bot, geo-restrictions, etc.
     """
     try:
         import yt_dlp
         ydl_opts = {
             "quiet": True,
             "no_warnings": True,
-            "skip_download": True,          # ← metadata only, zero bandwidth
             "extract_flat": False,
             "socket_timeout": 20,
         }
@@ -59,38 +97,40 @@ def _duration_via_ytdlp(url: str) -> int:
             info = ydl.extract_info(url, download=False)
             duration = info.get("duration")
             if duration and int(duration) > 0:
-                logger.info("⏱️ [S1-ytdlp] duration=%ds", int(duration))
                 return int(duration)
-    except Exception as e:
-        logger.warning("⚠️ [S1-ytdlp] failed: %s", e)
     return 0
 def _duration_via_supadata(video_id: str) -> int:
     """
-    Strategy 2 — Supadata transcript API.
-    If the SUPADATA_API_KEY is set, query their API for the transcript;
-    the last segment's offset gives a close approximation of the duration.
     """
     api_key = os.environ.get("SUPADATA_API_KEY", "").strip()
     if not api_key:
         return 0
     try:
         api_url = (
             f"https://api.supadata.ai/v1/youtube/transcript"
             f"?url=https://www.youtube.com/watch?v={video_id}"
         )
-        req = urllib.request.Request(api_url, headers={
-            "x-api-key": api_key,
-            "User-Agent": (
-                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                "AppleWebKit/537.36 (KHTML, like Gecko) "
-                "Chrome/124.0.0.0 Safari/537.36"
-            ),
-        })
         with urllib.request.urlopen(req, timeout=20) as resp:
             data = json.loads(resp.read())
-            # Supadata returns segments with "offset" in ms — last one ≈ total duration
             segments = data.get("segments") or data.get("content", [])
             if isinstance(segments, list) and segments:
                 last = segments[-1]
@@ -98,20 +138,16 @@ def _duration_via_supadata(video_id: str) -> int:
                 dur_ms = last.get("duration", 0) or last.get("dur", 0)
                 total_s = (int(offset_ms) + int(dur_ms)) // 1000
                 if total_s > 0:
-                    logger.info("⏱️ [S2-supadata] duration≈%ds", total_s)
                     return total_s
-    except Exception as e:
-        logger.warning("⚠️ [S2-supadata] failed: %s", e)
     return 0
 def _duration_via_html_scrape(url: str) -> int:
-    """
-    Strategy 3 — raw HTML regex scraping (original approach, last resort).
-    Scrapes the YouTube page with a browser-like User-Agent and parses
-    lengthSeconds / approxDurationMs / ytInitialPlayerResponse JSON.
-    """
-    _HEADERS = {
         "User-Agent": (
             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
             "AppleWebKit/537.36 (KHTML, like Gecko) "
@@ -128,98 +164,94 @@ def _duration_via_html_scrape(url: str) -> int:
     }
     try:
-        req = urllib.request.Request(url, headers=_HEADERS)
         with urllib.request.urlopen(req, timeout=15) as resp:
             html = resp.read().decode("utf-8", errors="ignore")
-    except Exception as e:
-        logger.warning("⚠️ [S3-scrape] HTTP fetch failed: %s", e)
         return 0
-    # 3a: "lengthSeconds":"3661"
-    m = re.search(r'"lengthSeconds"\s*:\s*"(\d+)"', html)
-    if m:
-        duration = int(m.group(1))
-        logger.info("⏱️ [S3a-regex-quoted] duration=%ds", duration)
         return duration
-    # 3b: "approxDurationMs":"3661000" → ÷1000
-    m = re.search(r'"approxDurationMs"\s*:\s*"(\d+)"', html)
-    if m:
-        duration = int(m.group(1)) // 1000
-        logger.info("⏱️ [S3b-approxMs] duration=%ds", duration)
         return duration
-    # 3c: full JSON parse of ytInitialPlayerResponse
-    m = re.search(
         r"var\s+ytInitialPlayerResponse\s*=\s*(\{.*?\})\s*;",
         html,
         re.DOTALL,
     )
-    if m:
         try:
-            data = json.loads(m.group(1))
             seconds_str = data.get("videoDetails", {}).get("lengthSeconds", "")
             if seconds_str and str(seconds_str).isdigit():
                 duration = int(seconds_str)
-                logger.info("⏱️ [S3c-jsonParse] duration=%ds", duration)
                 return duration
-        except (json.JSONDecodeError, AttributeError) as je:
-            logger.warning("⚠️ [S3c-jsonParse] JSON decode failed: %s", je)
     return 0
-def get_youtube_duration(url: str) -> int:
     """
-    Robustly fetches the YouTube video duration in seconds.
-    Strategy waterfall (tries each in order, stops on first success):
-      1. yt-dlp Python API   — handles anti-bot, cookies, consent walls
-      2. Supadata API         — if SUPADATA_API_KEY is set
-      3. Raw HTML regex       — fast but fragile, kept as last resort
-    Returns the duration as an integer (seconds), or 0 on total failure.
     """
     video_id = _extract_video_id(url)
-    # ── Strategy 1: yt-dlp (most reliable) ──
-    dur = _duration_via_ytdlp(url)
-    if dur > 0:
-        return dur
-    # ── Strategy 2: Supadata API ──
     if video_id:
-        dur = _duration_via_supadata(video_id)
-        if dur > 0:
-            return dur
-    # ── Strategy 3: HTML regex scraping (fallback) ──
-    dur = _duration_via_html_scrape(url)
-    if dur > 0:
-        return dur
-    logger.warning("⚠️ [duration] All strategies exhausted for: %s", url)
     return 0
-# --- Models ---
 class GenerateNotesRequest(BaseModel):
     youtube_url: HttpUrl
     language: str = "en"
 class TaskResponse(BaseModel):
     task_id: str
     status: str
     message: str
 class GeneratedNoteFile(BaseModel):
     filename: str
     title: str
     created_at: float
     size: int
-# ==========================================
-# 🚀 محرك توليد الملاحظات (Generate Engine)
-# ==========================================
 @router.post("/generate", response_model=TaskResponse)
 async def generate_note(
@@ -243,45 +275,40 @@ async def generate_note(
         task_id,
         str(request.youtube_url),
         request.language,
-        user_id
     )
     return TaskResponse(
         task_id=task_id,
         status="pending",
-        message="Generation started successfully."
     )
 @router.get("/status/{task_id}")
 async def get_task_status(task_id: str):
     if task_id not in tasks:
         raise HTTPException(status_code=404, detail="Task not found")
     return tasks[task_id]
-# ==========================================
-# 🛠️ دالة المعالجة الموحدة (Simplified Logic)
-# ==========================================
 async def process_video_task(task_id: str, youtube_url: str, language: str, user_id: str):
     downloader = YouTubeDownloader()
     try:
-        # Extract video ID for thumbnail
-        video_id_match = re.search(r"(?:v=|youtu\.be/)([A-Za-z0-9_-]{11})", youtube_url)
-        video_id = video_id_match.group(1) if video_id_match else ""
         video_title = "YouTube Video"
-        try:
-            import yt_dlp
-            ydl_opts = {"quiet": True, "no_warnings": True, "skip_download": True, "socket_timeout": 10}
-            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-                info = ydl.extract_info(youtube_url, download=False)
-                if info and info.get("title"):
-                    video_title = info["title"]
-                    logger.info(f"✅ Fetched real video title: {video_title}")
-        except Exception as e:
-            logger.warning(f"⚠️ Failed to fetch video title, using fallback: {e}")
-        # Step 1: Generate the summary and key points (keep in memory)
         tasks[task_id]["status"] = "transcribing"
         tasks[task_id]["message"] = "Processing transcript through optimized pipeline..."
         transcript_text = downloader.get_transcript(youtube_url)
@@ -289,7 +316,11 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
         tasks[task_id]["status"] = "generating_notes"
         note_gen = NoteGenerator()
         summary_json = note_gen.generateSummary(transcript_text, video_title)
-        video_duration = get_youtube_duration(youtube_url)
         final_markdown = note_gen.format_final_notes(
             note_gen.format_notes_to_markdown(summary_json),
@@ -300,61 +331,52 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
         )
         segments = summary_json.get("segments", [])
-        key_points_list = []
-        for seg in segments:
-            if isinstance(seg, dict) and seg.get("key_insight"):
-                key_points_list.append(seg["key_insight"])
-        # Step 2: Generate the categories (keep in memory)
         from src.summarization.topic_classifier import classify_topics
         raw_topics = summary_json.get("topics", [])
         categories = classify_topics(raw_topics) if raw_topics else ["Education & Science"]
-        # Step 3: Combine everything into ONE single dictionary
-        note_data = {
-            "userId": user_id,
-            "videoUrl": youtube_url,
-            "videoTitle": video_title,
-            "notes": final_markdown,
-            "thumbnail": f"https://img.youtube.com/vi/{video_id}/mqdefault.jpg" if video_id else "",
-            "category": categories,
-            "keyPoints": key_points_list,
-            "createdAt": datetime.utcnow(),
-            "updatedAt": datetime.utcnow(),
-            "isFavorite": False,
-        }
-        # Step 4: Return data in memory without saving to DB yet
-        # The Flutter app will perform the final save after user edits.
         tasks[task_id]["status"] = "completed"
         tasks[task_id]["notes"] = final_markdown
         tasks[task_id]["topics"] = categories
         tasks[task_id]["category"] = categories
         tasks[task_id]["keyPoints"] = key_points_list
-        logger.info(f"✅ Task {task_id} completed successfully!")
-    except Exception as e:
-        logger.error(f"❌ Task {task_id} failed: {e}")
         tasks[task_id]["status"] = "failed"
-        tasks[task_id]["message"] = str(e)
-# ==========================================
-# 📂 إدارة الملفات (File Management)
-# ==========================================
 @router.get("/generated", response_model=List[GeneratedNoteFile])
 async def list_generated_notes():
     notes = []
     output_dir = settings.output_dir
-    if not output_dir.exists(): return []
     for file_path in output_dir.glob("*_notes.md"):
         stats = file_path.stat()
-        notes.append(GeneratedNoteFile(
-            filename=file_path.name,
-            title=file_path.name.replace("_notes.md", ""),
-            created_at=stats.st_mtime,
-            size=stats.st_size,
-        ))
-    notes.sort(key=lambda x: x.created_at, reverse=True)
-    return notes

 import json
+import os
+import re
 import urllib.request
+import uuid
 from datetime import datetime
+from typing import Dict, List
+from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
+from pydantic import BaseModel, HttpUrl
 from src.api.downloader import YouTubeDownloader
+from src.auth.dependencies import get_current_user
+from src.db.models import User
 from src.summarization.note_generator import NoteGenerator
+from src.utils.config import settings
+from src.utils.logger import setup_logger
 logger = setup_logger(__name__)
 router = APIRouter(tags=["Notes"])
 tasks: Dict[str, Dict] = {}
 def _extract_video_id(url: str) -> str:
     """Extract the 11-character YouTube video ID from any URL format."""
+    match = re.search(r"(?:v=|youtu\.be/|shorts/|embed/)([A-Za-z0-9_-]{11})", str(url))
+    return match.group(1) if match else ""
+def _is_fast_fail_ssl_error(exc: Exception) -> bool:
+    error_text = str(exc)
+    return any(
+        marker in error_text
+        for marker in (
+            "UNEXPECTED_EOF_WHILE_READING",
+            "SSLEOFError",
+            "EOF occurred in violation of protocol",
+        )
+    )
+def _fetch_video_info_via_ytdlp(url: str) -> Dict[str, object]:
     """
+    Fetch title and duration once so the request does not pay for repeated
+    yt-dlp failures when YouTube is temporarily unreachable.
     """
+    result: Dict[str, object] = {
+        "attempted": True,
+        "ok": False,
+        "title": "",
+        "duration": 0,
+        "ssl_failed": False,
+    }
+    try:
+        import yt_dlp
+        ydl_opts = {
+            "quiet": True,
+            "no_warnings": True,
+            "skip_download": True,
+            "extract_flat": False,
+            "socket_timeout": 8,
+        }
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(url, download=False) or {}
+        result["ok"] = True
+        result["title"] = info.get("title") or ""
+        result["duration"] = int(info.get("duration") or 0)
+        if result["title"]:
+            logger.info("Fetched real video title: %s", result["title"])
+        if result["duration"]:
+            logger.info("Reusing yt-dlp duration from metadata fetch: %ss", result["duration"])
+    except Exception as exc:
+        result["ssl_failed"] = _is_fast_fail_ssl_error(exc)
+        logger.warning("Failed to fetch video metadata via yt-dlp: %s", exc)
+    return result
+def _duration_via_ytdlp(url: str) -> int:
+    """Strategy 1: use yt-dlp metadata without downloading the video."""
     try:
         import yt_dlp
         ydl_opts = {
             "quiet": True,
             "no_warnings": True,
+            "skip_download": True,
             "extract_flat": False,
             "socket_timeout": 20,
         }
             info = ydl.extract_info(url, download=False)
             duration = info.get("duration")
             if duration and int(duration) > 0:
+                logger.info("[S1-ytdlp] duration=%ds", int(duration))
                 return int(duration)
+    except Exception as exc:
+        logger.warning("[S1-ytdlp] failed: %s", exc)
     return 0
 def _duration_via_supadata(video_id: str) -> int:
     """
+    Strategy 2: use Supadata transcript segments and estimate duration from the
+    last segment timestamp.
     """
     api_key = os.environ.get("SUPADATA_API_KEY", "").strip()
     if not api_key:
         return 0
     try:
         api_url = (
             f"https://api.supadata.ai/v1/youtube/transcript"
             f"?url=https://www.youtube.com/watch?v={video_id}"
         )
+        req = urllib.request.Request(
+            api_url,
+            headers={
+                "x-api-key": api_key,
+                "User-Agent": (
+                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                    "Chrome/124.0.0.0 Safari/537.36"
+                ),
+            },
+        )
         with urllib.request.urlopen(req, timeout=20) as resp:
             data = json.loads(resp.read())
             segments = data.get("segments") or data.get("content", [])
             if isinstance(segments, list) and segments:
                 last = segments[-1]
                 dur_ms = last.get("duration", 0) or last.get("dur", 0)
                 total_s = (int(offset_ms) + int(dur_ms)) // 1000
                 if total_s > 0:
+                    logger.info("[S2-supadata] duration~%ds", total_s)
                     return total_s
+    except Exception as exc:
+        logger.warning("[S2-supadata] failed: %s", exc)
     return 0
 def _duration_via_html_scrape(url: str) -> int:
+    """Strategy 3: scrape the watch page and parse duration hints."""
     # Returns the duration in whole seconds, or 0 when the page cannot be
     # fetched or none of the three patterns below yields a value.
     # NOTE(review): the headers literal below is unbalanced as shown (the
     # "(" opened after "User-Agent" is not closed before "}"); lines appear
     # to be elided from this diff view - confirm against the full file.
+    headers = {
         "User-Agent": (
             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
             "AppleWebKit/537.36 (KHTML, like Gecko) "
     }
     # Fetch the page with a 15 second timeout. Undecodable bytes are dropped
     # rather than raising, and any exception here is logged and mapped to 0.
     try:
+        req = urllib.request.Request(url, headers=headers)
         with urllib.request.urlopen(req, timeout=15) as resp:
             html = resp.read().decode("utf-8", errors="ignore")
+    except Exception as exc:
+        logger.warning("[S3-scrape] HTTP fetch failed: %s", exc)
         return 0
     # 3a: first occurrence of "lengthSeconds":"<digits>" anywhere in the page.
+    match = re.search(r'"lengthSeconds"\s*:\s*"(\d+)"', html)
+    if match:
+        duration = int(match.group(1))
+        logger.info("[S3a-regex-quoted] duration=%ds", duration)
         return duration
     # 3b: "approxDurationMs" is in milliseconds; floor-divide to whole seconds.
+    match = re.search(r'"approxDurationMs"\s*:\s*"(\d+)"', html)
+    if match:
+        duration = int(match.group(1)) // 1000
+        logger.info("[S3b-approxMs] duration=%ds", duration)
         return duration
     # 3c: capture the object assigned to ytInitialPlayerResponse and parse it
     # as JSON, then read videoDetails.lengthSeconds.
     # NOTE(review): the lazy ".*?" stops at the first "}" that is followed by
     # optional whitespace and ";", which may fall inside the object; in that
     # case json.loads raises and the except below only logs a warning.
+    match = re.search(
         r"var\s+ytInitialPlayerResponse\s*=\s*(\{.*?\})\s*;",
         html,
         re.DOTALL,
     )
+    if match:
         try:
+            data = json.loads(match.group(1))
             seconds_str = data.get("videoDetails", {}).get("lengthSeconds", "")
             # Accept the value only when it is a non-empty run of digits.
             if seconds_str and str(seconds_str).isdigit():
                 duration = int(seconds_str)
+                logger.info("[S3c-jsonParse] duration=%ds", duration)
                 return duration
+        except (json.JSONDecodeError, AttributeError) as exc:
+            logger.warning("[S3c-jsonParse] JSON decode failed: %s", exc)
     # Nothing matched: 0 is this function's "unknown duration" result.
     return 0
+def get_youtube_duration(
+    url: str,
+    preferred_duration: int = 0,
+    skip_ytdlp: bool = False,
+) -> int:
     """
+    Robustly fetch the YouTube video duration in seconds using a waterfall.
     """
     # Order of attempts; the first positive value wins:
     #   1. preferred_duration, when the caller already has a positive value
     #   2. _duration_via_ytdlp(url), unless skip_ytdlp is true
     #   3. _duration_via_supadata(video_id), only when a video id was extracted
     #   4. _duration_via_html_scrape(url)
     # Returns 0 when every strategy fails.
     # video_id is only consumed by the Supadata step further down.
     video_id = _extract_video_id(url)
+    if preferred_duration > 0:
+        return preferred_duration
+    if not skip_ytdlp:
+        duration = _duration_via_ytdlp(url)
+        if duration > 0:
+            return duration
+    else:
+        logger.info("Skipping yt-dlp duration lookup because metadata fetch already failed earlier in this request.")
     if video_id:
+        duration = _duration_via_supadata(video_id)
+        if duration > 0:
+            return duration
+    duration = _duration_via_html_scrape(url)
+    if duration > 0:
+        return duration
     # All strategies returned 0; log the URL so the failure can be traced.
+    logger.warning("[duration] All strategies exhausted for: %s", url)
     return 0
 class GenerateNotesRequest(BaseModel):
     # Request body for the POST /generate endpoint.
     # youtube_url: typed as HttpUrl; the visible /generate fragment converts
     # it with str() before handing it on.
     youtube_url: HttpUrl
     # language: defaults to "en" and is forwarded alongside the URL; how the
     # value is interpreted is not shown in this chunk.
     language: str = "en"
 class TaskResponse(BaseModel):
     # Response model declared for POST /generate.
     # task_id: identifier that GET /status/{task_id} looks up in `tasks`.
     task_id: str
     # status: the /generate handler sets this to "pending".
     status: str
     # message: human-readable text accompanying the status.
     message: str
 class GeneratedNoteFile(BaseModel):
     # One entry of the GET /generated listing.
     # filename: the file's name (file_path.name in the listing endpoint).
     filename: str
     # title: the file name with "_notes.md" removed.
     title: str
     # created_at: filled from st_mtime by the listing endpoint, i.e. the
     # file's modification timestamp rather than a true creation time.
     created_at: float
     # size: filled from st_size by the listing endpoint.
     size: int
 @router.post("/generate", response_model=TaskResponse)
 # NOTE(review): this definition is fragmentary in the diff view. The rest of
 # the signature and the start of the call that receives the four arguments
 # below are elided; presumably that call schedules process_video_task, whose
 # parameters match (task_id, youtube_url, language, user_id) - confirm
 # against the full file.
 async def generate_note(
         task_id,
         str(request.youtube_url),
         request.language,
+        user_id,
     )
     # Respond with status "pending"; progress is then read through
     # GET /status/{task_id}.
     return TaskResponse(
         task_id=task_id,
         status="pending",
+        message="Generation started successfully.",
     )
 @router.get("/status/{task_id}")
 async def get_task_status(task_id: str):
     # Returns the entry stored for task_id in the `tasks` mapping (defined
     # outside this chunk) exactly as stored; unknown ids yield HTTP 404.
     if task_id not in tasks:
         raise HTTPException(status_code=404, detail="Task not found")
     return tasks[task_id]
 async def process_video_task(task_id: str, youtube_url: str, language: str, user_id: str):
     # Runs the whole notes pipeline for one request and records progress in
     # tasks[task_id]: metadata lookup -> transcript -> summary -> markdown ->
     # topic classification. Any exception marks the task as "failed" and
     # stores str(exc) as its message; nothing is re-raised.
     # NOTE(review): no call in the visible body is awaited, so these steps run
     # synchronously inside this coroutine; if the helpers block on network
     # I/O they will hold the event loop - confirm how this task is scheduled.
     # NOTE(review): `language` and `user_id` are not referenced in the visible
     # lines; they may be used by lines elided from this diff view.
     downloader = YouTubeDownloader()
     try:
+        video_id = _extract_video_id(youtube_url)
         # Fallback title, kept when the metadata lookup gives no title.
         video_title = "YouTube Video"
+        prefetched_duration = 0
+        skip_ytdlp_duration = False
         # video_info is read as a mapping with "ok", "attempted", "title" and
         # "duration" keys.
+        video_info = _fetch_video_info_via_ytdlp(youtube_url)
+        if video_info["ok"]:
+            if video_info["title"]:
+                video_title = str(video_info["title"])
+            prefetched_duration = int(video_info["duration"])
+        else:
             # In this branch "ok" is already falsy, so the flag reduces to
             # bool(video_info["attempted"]): a second yt-dlp lookup is skipped
             # only when the first one was actually attempted and failed.
+            skip_ytdlp_duration = bool(video_info["attempted"] and not video_info["ok"])
         tasks[task_id]["status"] = "transcribing"
         tasks[task_id]["message"] = "Processing transcript through optimized pipeline..."
         transcript_text = downloader.get_transcript(youtube_url)
         tasks[task_id]["status"] = "generating_notes"
         note_gen = NoteGenerator()
         summary_json = note_gen.generateSummary(transcript_text, video_title)
         # Reuse the prefetched duration when positive; otherwise fall through
         # the duration waterfall, optionally skipping yt-dlp.
         # NOTE(review): video_duration is not referenced again in the visible
         # lines; presumably it is passed in arguments elided from the
         # format_final_notes call below - confirm.
+        video_duration = get_youtube_duration(
+            youtube_url,
+            preferred_duration=prefetched_duration,
+            skip_ytdlp=skip_ytdlp_duration,
+        )
         final_markdown = note_gen.format_final_notes(
             note_gen.format_notes_to_markdown(summary_json),
         )
         segments = summary_json.get("segments", [])
         # Keep only dict segments that carry a truthy "key_insight".
+        key_points_list = [
+            seg["key_insight"]
+            for seg in segments
+            if isinstance(seg, dict) and seg.get("key_insight")
+        ]
         # Imported at call time rather than at module top.
         from src.summarization.topic_classifier import classify_topics
         raw_topics = summary_json.get("topics", [])
         # With no topics in the summary, fall back to one default category.
         categories = classify_topics(raw_topics) if raw_topics else ["Education & Science"]
         tasks[task_id]["status"] = "completed"
+        tasks[task_id]["message"] = "Generation completed successfully."
         tasks[task_id]["notes"] = final_markdown
         # "topics" and "category" are both set to the same list.
         tasks[task_id]["topics"] = categories
         tasks[task_id]["category"] = categories
         tasks[task_id]["keyPoints"] = key_points_list
+        tasks[task_id]["videoTitle"] = video_title
         # Thumbnail URL is built from the video id; empty string without one.
+        tasks[task_id]["thumbnail"] = (
+            f"https://img.youtube.com/vi/{video_id}/mqdefault.jpg" if video_id else ""
+        )
+        logger.info("Task %s completed successfully", task_id)
+    except Exception as exc:
         # NOTE(review): only the exception text is logged (no traceback);
         # consider logger.exception if stack traces are wanted here.
+        logger.error("Task %s failed: %s", task_id, exc)
         tasks[task_id]["status"] = "failed"
+        tasks[task_id]["message"] = str(exc)
 @router.get("/generated", response_model=List[GeneratedNoteFile])
 async def list_generated_notes():
     # Lists every "*_notes.md" file directly inside settings.output_dir,
     # sorted by created_at in descending order. Returns an empty list when
     # the directory does not exist.
     notes = []
     # output_dir is used as a path object here (.exists() and .glob()).
     output_dir = settings.output_dir
+    if not output_dir.exists():
+        return []
     for file_path in output_dir.glob("*_notes.md"):
         stats = file_path.stat()
         # NOTE(review): str.replace removes every occurrence of "_notes.md"
         # in the name, not only the trailing suffix.
         # created_at is taken from st_mtime (modification time).
+        notes.append(
+            GeneratedNoteFile(
+                filename=file_path.name,
+                title=file_path.name.replace("_notes.md", ""),
+                created_at=stats.st_mtime,
+                size=stats.st_size,
+            )
+        )
+    notes.sort(key=lambda item: item.created_at, reverse=True)
+    return notes