Ahmed Mostafa committed on
Commit
4b5adfb
·
1 Parent(s): 629e216

feat:fix "failed title fetch via yt-dlp, failed transcript retries, failed duration fetch via yt-dlp"

Browse files
src/api/__pycache__/notes_routes.cpython-312.pyc CHANGED
Binary files a/src/api/__pycache__/notes_routes.cpython-312.pyc and b/src/api/__pycache__/notes_routes.cpython-312.pyc differ
 
src/api/downloader.py CHANGED
@@ -1,97 +1,100 @@
 
1
  import logging
2
  import os
3
  import re
4
  import time
5
  import urllib.request
6
- import json
7
 
8
 
9
  logger = logging.getLogger(__name__)
10
 
11
- # Browser-like headers to reduce YouTube SSL/rate-limit rejections
12
- _YT_HEADERS = {
13
- "User-Agent": (
14
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
15
- "AppleWebKit/537.36 (KHTML, like Gecko) "
16
- "Chrome/124.0.0.0 Safari/537.36"
17
- ),
18
- "Accept-Language": "en-US,en;q=0.9,ar;q=0.8",
19
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
20
- }
 
21
 
22
  class YouTubeDownloader:
23
  def __init__(self):
24
- # سحب المفاتيح من الـ Environment
25
  self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
26
- self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
27
 
28
  def get_transcript(self, url: str) -> str:
29
  video_id = self._extract_video_id(url)
30
- logger.info(f"🔍 Pipeline for video ID: {video_id}")
31
 
32
- # 1. الخطة أ: YouTube Transcript API (لو فيه ترجمة جاهزة)
33
- # Retry up to 3 times with a short back-off to survive transient SSL hiccups
34
- for _attempt in range(3):
35
  try:
36
  from youtube_transcript_api import YouTubeTranscriptApi
 
37
  transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
38
- # Prefer manual captions; fall back to auto-generated in any language
39
  try:
40
  transcript = transcript_list.find_manually_created_transcript(["en", "ar", "en-US"])
41
  except Exception:
42
  try:
43
  transcript = transcript_list.find_generated_transcript(["en", "ar", "en-US"])
44
  except Exception:
45
- # Accept absolutely any available transcript
46
  transcript = next(iter(transcript_list))
 
47
  entries = transcript.fetch()
48
- logger.info(f"Plan A Success! (attempt {_attempt + 1})")
49
- return " ".join(e["text"] for e in entries).strip()
50
- except Exception as e:
51
- logger.warning(f"⚠️ Plan A attempt {_attempt + 1} failed: {e}")
52
- if _attempt < 2:
53
- time.sleep(1.5 * (_attempt + 1)) # 1.5 s, 3 s back-off
54
-
55
- # 2. الخطة ب: Supadata (المنقذ الأول - بإصلاح الـ User-Agent)
 
 
 
56
  if self._supadata_key:
57
  try:
58
- logger.info("🚀 Plan B: Calling Supadata...")
59
  clean_url = f"https://www.youtube.com/watch?v={video_id}"
60
-
61
- # إضافة Headers عشان نهرب من الـ 403 Forbidden
62
  headers = {
63
  "x-api-key": self._supadata_key,
64
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
 
 
 
 
65
  }
66
-
67
  req = urllib.request.Request(
68
  f"https://api.supadata.ai/v1/youtube/transcript?url={clean_url}&text=true",
69
- headers=headers
70
  )
71
-
72
  with urllib.request.urlopen(req, timeout=30) as resp:
73
  data = json.loads(resp.read())
74
  text = data.get("content", "").strip()
75
  if text:
76
- logger.info("Plan B Success!")
77
  return text
78
- except Exception as e:
79
- logger.error(f"Plan B (Supadata) failed: {e}")
80
 
81
- # 3. الخطة ج: AssemblyAI raw REST (bypasses SDK serialization bug)
82
  if self._assemblyai_key:
83
  try:
84
  import httpx
85
  import yt_dlp
86
 
87
  clean_url = f"https://www.youtube.com/watch?v={video_id}"
88
-
89
- ydl_opts = {'format': 'bestaudio', 'noplaylist': True, 'quiet': True}
90
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
91
  info_dict = ydl.extract_info(clean_url, download=False)
92
- direct_audio_url = info_dict.get('url')
93
-
94
- logger.info(f"Extracted direct audio URL: {str(direct_audio_url)[:50]}...")
95
 
96
  aai_headers = {
97
  "authorization": self._assemblyai_key,
@@ -99,17 +102,14 @@ class YouTubeDownloader:
99
  }
100
  payload = {
101
  "audio_url": direct_audio_url,
102
- "speech_models": ["universal-2"], # REQUIRED as a list by the API
103
  "language_detection": True,
104
  }
105
 
106
- logger.info(
107
- f"ATTEMPTING PLAN C WITH PAYLOAD: {json.dumps(payload)}"
108
- )
109
  print(f"[PLAN C] PAYLOAD SENT TO ASSEMBLYAI: {json.dumps(payload)}")
110
 
111
  with httpx.Client(timeout=30) as client:
112
- # Step 1 — submit the job
113
  resp = client.post(
114
  "https://api.assemblyai.com/v2/transcript",
115
  headers=aai_headers,
@@ -117,37 +117,32 @@ class YouTubeDownloader:
117
  )
118
  resp.raise_for_status()
119
  transcript_id = resp.json()["id"]
120
- logger.info(f"Plan C job submitted. ID: {transcript_id}")
121
  print(f"[PLAN C] Job ID: {transcript_id}")
122
 
123
- # Step 2 — poll until completed (max ~5 min)
124
  polling_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"
125
- for _ in range(60): # 60 × 5 s = 5 min max
126
  time.sleep(5)
127
  poll = client.get(polling_url, headers=aai_headers)
128
  poll.raise_for_status()
129
  result = poll.json()
130
  status = result.get("status")
131
  if status == "completed":
132
- logger.info("Plan C Success!")
133
  return result["text"]
134
- elif status == "error":
135
- logger.warning(
136
- f"⚠️ Plan C API Error: {result.get('error')}"
137
- )
138
  break
139
- # status == "processing" or "queued" → keep polling
140
 
141
- except Exception as e:
142
- logger.error(f"Plan C (AssemblyAI REST) failed: {e}")
143
- print(f"[PLAN C] EXCEPTION: {e}")
144
 
145
- raise RuntimeError(f"All strategies exhausted for {video_id}. No transcript found.")
146
 
147
  def _extract_video_id(self, url: str) -> str:
148
- # يدعم كل أنواع روابط يوتيوب
149
  match = re.search(r"(?:v=|youtu\.be/|shorts/|embed/)([A-Za-z0-9_-]{11})", str(url))
150
  return match.group(1) if match else "unknown"
151
 
152
  def cleanup(self, path=None):
153
- pass
 
1
+ import json
2
  import logging
3
  import os
4
  import re
5
  import time
6
  import urllib.request
 
7
 
8
 
9
  logger = logging.getLogger(__name__)
10
 
11
+ _FAST_FAIL_SSL_MARKERS = (
12
+ "UNEXPECTED_EOF_WHILE_READING",
13
+ "SSLEOFError",
14
+ "EOF occurred in violation of protocol",
15
+ )
16
+
17
+
18
+ def _is_fast_fail_ssl_error(exc: Exception) -> bool:
19
+ error_text = str(exc)
20
+ return any(marker in error_text for marker in _FAST_FAIL_SSL_MARKERS)
21
+
22
 
23
  class YouTubeDownloader:
24
  def __init__(self):
 
25
  self._assemblyai_key = os.environ.get("ASSEMBLYAI_API_KEY", "").strip()
26
+ self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
27
 
28
  def get_transcript(self, url: str) -> str:
29
  video_id = self._extract_video_id(url)
30
+ logger.info("Pipeline for video ID: %s", video_id)
31
 
32
+ # Plan A: use YouTube Transcript API first, but avoid retrying known SSL EOF failures.
33
+ for attempt in range(3):
 
34
  try:
35
  from youtube_transcript_api import YouTubeTranscriptApi
36
+
37
  transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
 
38
  try:
39
  transcript = transcript_list.find_manually_created_transcript(["en", "ar", "en-US"])
40
  except Exception:
41
  try:
42
  transcript = transcript_list.find_generated_transcript(["en", "ar", "en-US"])
43
  except Exception:
 
44
  transcript = next(iter(transcript_list))
45
+
46
  entries = transcript.fetch()
47
+ logger.info("Plan A success (attempt %s)", attempt + 1)
48
+ return " ".join(entry["text"] for entry in entries).strip()
49
+ except Exception as exc:
50
+ logger.warning("Plan A attempt %s failed: %s", attempt + 1, exc)
51
+ if _is_fast_fail_ssl_error(exc):
52
+ logger.info("Plan A fast-fail: SSL transport error detected, switching to fallback immediately.")
53
+ break
54
+ if attempt < 2:
55
+ time.sleep(1.5 * (attempt + 1))
56
+
57
+ # Plan B: Supadata transcript API.
58
  if self._supadata_key:
59
  try:
60
+ logger.info("Plan B: calling Supadata")
61
  clean_url = f"https://www.youtube.com/watch?v={video_id}"
 
 
62
  headers = {
63
  "x-api-key": self._supadata_key,
64
+ "User-Agent": (
65
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
66
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
67
+ "Chrome/120.0.0.0 Safari/537.36"
68
+ ),
69
  }
 
70
  req = urllib.request.Request(
71
  f"https://api.supadata.ai/v1/youtube/transcript?url={clean_url}&text=true",
72
+ headers=headers,
73
  )
74
+
75
  with urllib.request.urlopen(req, timeout=30) as resp:
76
  data = json.loads(resp.read())
77
  text = data.get("content", "").strip()
78
  if text:
79
+ logger.info("Plan B success")
80
  return text
81
+ except Exception as exc:
82
+ logger.error("Plan B (Supadata) failed: %s", exc)
83
 
84
+ # Plan C: AssemblyAI raw REST fallback.
85
  if self._assemblyai_key:
86
  try:
87
  import httpx
88
  import yt_dlp
89
 
90
  clean_url = f"https://www.youtube.com/watch?v={video_id}"
91
+ ydl_opts = {"format": "bestaudio", "noplaylist": True, "quiet": True}
92
+
93
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
94
  info_dict = ydl.extract_info(clean_url, download=False)
95
+ direct_audio_url = info_dict.get("url")
96
+
97
+ logger.info("Extracted direct audio URL: %s...", str(direct_audio_url)[:50])
98
 
99
  aai_headers = {
100
  "authorization": self._assemblyai_key,
 
102
  }
103
  payload = {
104
  "audio_url": direct_audio_url,
105
+ "speech_models": ["universal-2"],
106
  "language_detection": True,
107
  }
108
 
109
+ logger.info("Attempting Plan C with payload: %s", json.dumps(payload))
 
 
110
  print(f"[PLAN C] PAYLOAD SENT TO ASSEMBLYAI: {json.dumps(payload)}")
111
 
112
  with httpx.Client(timeout=30) as client:
 
113
  resp = client.post(
114
  "https://api.assemblyai.com/v2/transcript",
115
  headers=aai_headers,
 
117
  )
118
  resp.raise_for_status()
119
  transcript_id = resp.json()["id"]
120
+ logger.info("Plan C job submitted. ID: %s", transcript_id)
121
  print(f"[PLAN C] Job ID: {transcript_id}")
122
 
 
123
  polling_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"
124
+ for _ in range(60):
125
  time.sleep(5)
126
  poll = client.get(polling_url, headers=aai_headers)
127
  poll.raise_for_status()
128
  result = poll.json()
129
  status = result.get("status")
130
  if status == "completed":
131
+ logger.info("Plan C success")
132
  return result["text"]
133
+ if status == "error":
134
+ logger.warning("Plan C API error: %s", result.get("error"))
 
 
135
  break
 
136
 
137
+ except Exception as exc:
138
+ logger.error("Plan C (AssemblyAI REST) failed: %s", exc)
139
+ print(f"[PLAN C] EXCEPTION: {exc}")
140
 
141
+ raise RuntimeError(f"All strategies exhausted for {video_id}. No transcript found.")
142
 
143
  def _extract_video_id(self, url: str) -> str:
 
144
  match = re.search(r"(?:v=|youtu\.be/|shorts/|embed/)([A-Za-z0-9_-]{11})", str(url))
145
  return match.group(1) if match else "unknown"
146
 
147
  def cleanup(self, path=None):
148
+ pass
src/api/notes_routes.py CHANGED
@@ -1,57 +1,95 @@
1
- import uuid
2
- import re
3
- import logging
4
- import os
5
  import json
 
 
6
  import urllib.request
 
7
  from datetime import datetime
8
- from typing import List, Optional, Dict
9
- from pathlib import Path
10
 
11
- from fastapi import APIRouter, Depends, HTTPException, status, Query, BackgroundTasks
12
- from pydantic import BaseModel, HttpUrl, Field
13
 
14
- from src.db.firebase import get_firebase_db
15
- from src.db.models import User, Note
16
- from src.auth.dependencies import get_current_user
17
- from src.utils.logger import setup_logger
18
- from src.utils.config import settings
19
-
20
- # --- استدعاء أدوات المعالجة (النسخة الجديدة) ---
21
  from src.api.downloader import YouTubeDownloader
 
 
22
  from src.summarization.note_generator import NoteGenerator
 
 
23
 
24
  logger = setup_logger(__name__)
25
- # تم إزالة الـ prefix لضمان عمل الرابط /generate مباشرة
26
  router = APIRouter(tags=["Notes"])
27
 
28
- # مخزن المهام المؤقت في الذاكرة
29
  tasks: Dict[str, Dict] = {}
30
 
31
 
32
- # ==========================================
33
- # ⏱️ YouTube Duration Scraper (robust multi-strategy waterfall)
34
- # ==========================================
35
-
36
  def _extract_video_id(url: str) -> str:
37
  """Extract the 11-character YouTube video ID from any URL format."""
38
- m = re.search(r"(?:v=|youtu\.be/|shorts/|embed/)([A-Za-z0-9_-]{11})", str(url))
39
- return m.group(1) if m else ""
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
 
42
- def _duration_via_ytdlp(url: str) -> int:
43
  """
44
- Strategy 1 yt-dlp (most reliable).
45
- Uses yt-dlp's Python API to extract metadata without downloading.
46
- Handles cookies, consent pages, anti-bot, geo-restrictions, etc.
47
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  try:
49
  import yt_dlp
50
 
51
  ydl_opts = {
52
  "quiet": True,
53
  "no_warnings": True,
54
- "skip_download": True, # ← metadata only, zero bandwidth
55
  "extract_flat": False,
56
  "socket_timeout": 20,
57
  }
@@ -59,38 +97,40 @@ def _duration_via_ytdlp(url: str) -> int:
59
  info = ydl.extract_info(url, download=False)
60
  duration = info.get("duration")
61
  if duration and int(duration) > 0:
62
- logger.info("⏱️ [S1-ytdlp] duration=%ds", int(duration))
63
  return int(duration)
64
- except Exception as e:
65
- logger.warning("⚠️ [S1-ytdlp] failed: %s", e)
66
  return 0
67
 
68
 
69
  def _duration_via_supadata(video_id: str) -> int:
70
  """
71
- Strategy 2 Supadata transcript API.
72
- If the SUPADATA_API_KEY is set, query their API for the transcript;
73
- the last segment's offset gives a close approximation of the duration.
74
  """
75
  api_key = os.environ.get("SUPADATA_API_KEY", "").strip()
76
  if not api_key:
77
  return 0
 
78
  try:
79
  api_url = (
80
  f"https://api.supadata.ai/v1/youtube/transcript"
81
  f"?url=https://www.youtube.com/watch?v={video_id}"
82
  )
83
- req = urllib.request.Request(api_url, headers={
84
- "x-api-key": api_key,
85
- "User-Agent": (
86
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
87
- "AppleWebKit/537.36 (KHTML, like Gecko) "
88
- "Chrome/124.0.0.0 Safari/537.36"
89
- ),
90
- })
 
 
 
91
  with urllib.request.urlopen(req, timeout=20) as resp:
92
  data = json.loads(resp.read())
93
- # Supadata returns segments with "offset" in ms — last one ≈ total duration
94
  segments = data.get("segments") or data.get("content", [])
95
  if isinstance(segments, list) and segments:
96
  last = segments[-1]
@@ -98,20 +138,16 @@ def _duration_via_supadata(video_id: str) -> int:
98
  dur_ms = last.get("duration", 0) or last.get("dur", 0)
99
  total_s = (int(offset_ms) + int(dur_ms)) // 1000
100
  if total_s > 0:
101
- logger.info("⏱️ [S2-supadata] duration%ds", total_s)
102
  return total_s
103
- except Exception as e:
104
- logger.warning("⚠️ [S2-supadata] failed: %s", e)
105
  return 0
106
 
107
 
108
  def _duration_via_html_scrape(url: str) -> int:
109
- """
110
- Strategy 3 — raw HTML regex scraping (original approach, last resort).
111
- Scrapes the YouTube page with a browser-like User-Agent and parses
112
- lengthSeconds / approxDurationMs / ytInitialPlayerResponse JSON.
113
- """
114
- _HEADERS = {
115
  "User-Agent": (
116
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
117
  "AppleWebKit/537.36 (KHTML, like Gecko) "
@@ -128,98 +164,94 @@ def _duration_via_html_scrape(url: str) -> int:
128
  }
129
 
130
  try:
131
- req = urllib.request.Request(url, headers=_HEADERS)
132
  with urllib.request.urlopen(req, timeout=15) as resp:
133
  html = resp.read().decode("utf-8", errors="ignore")
134
- except Exception as e:
135
- logger.warning("⚠️ [S3-scrape] HTTP fetch failed: %s", e)
136
  return 0
137
 
138
- # 3a: "lengthSeconds":"3661"
139
- m = re.search(r'"lengthSeconds"\s*:\s*"(\d+)"', html)
140
- if m:
141
- duration = int(m.group(1))
142
- logger.info("⏱️ [S3a-regex-quoted] duration=%ds", duration)
143
  return duration
144
 
145
- # 3b: "approxDurationMs":"3661000" → ÷1000
146
- m = re.search(r'"approxDurationMs"\s*:\s*"(\d+)"', html)
147
- if m:
148
- duration = int(m.group(1)) // 1000
149
- logger.info("⏱️ [S3b-approxMs] duration=%ds", duration)
150
  return duration
151
 
152
- # 3c: full JSON parse of ytInitialPlayerResponse
153
- m = re.search(
154
  r"var\s+ytInitialPlayerResponse\s*=\s*(\{.*?\})\s*;",
155
  html,
156
  re.DOTALL,
157
  )
158
- if m:
159
  try:
160
- data = json.loads(m.group(1))
161
  seconds_str = data.get("videoDetails", {}).get("lengthSeconds", "")
162
  if seconds_str and str(seconds_str).isdigit():
163
  duration = int(seconds_str)
164
- logger.info("⏱️ [S3c-jsonParse] duration=%ds", duration)
165
  return duration
166
- except (json.JSONDecodeError, AttributeError) as je:
167
- logger.warning("⚠️ [S3c-jsonParse] JSON decode failed: %s", je)
168
 
169
  return 0
170
 
171
 
172
- def get_youtube_duration(url: str) -> int:
 
 
 
 
173
  """
174
- Robustly fetches the YouTube video duration in seconds.
175
-
176
- Strategy waterfall (tries each in order, stops on first success):
177
- 1. yt-dlp Python API — handles anti-bot, cookies, consent walls
178
- 2. Supadata API — if SUPADATA_API_KEY is set
179
- 3. Raw HTML regex — fast but fragile, kept as last resort
180
-
181
- Returns the duration as an integer (seconds), or 0 on total failure.
182
  """
183
  video_id = _extract_video_id(url)
184
 
185
- # ── Strategy 1: yt-dlp (most reliable) ──
186
- dur = _duration_via_ytdlp(url)
187
- if dur > 0:
188
- return dur
 
 
 
 
 
189
 
190
- # ── Strategy 2: Supadata API ──
191
  if video_id:
192
- dur = _duration_via_supadata(video_id)
193
- if dur > 0:
194
- return dur
195
 
196
- # ── Strategy 3: HTML regex scraping (fallback) ──
197
- dur = _duration_via_html_scrape(url)
198
- if dur > 0:
199
- return dur
200
 
201
- logger.warning("⚠️ [duration] All strategies exhausted for: %s", url)
202
  return 0
203
 
204
- # --- Models ---
205
  class GenerateNotesRequest(BaseModel):
206
  youtube_url: HttpUrl
207
  language: str = "en"
208
 
 
209
  class TaskResponse(BaseModel):
210
  task_id: str
211
  status: str
212
  message: str
213
 
 
214
  class GeneratedNoteFile(BaseModel):
215
  filename: str
216
  title: str
217
  created_at: float
218
  size: int
219
 
220
- # ==========================================
221
- # 🚀 محرك توليد الملاحظات (Generate Engine)
222
- # ==========================================
223
 
224
  @router.post("/generate", response_model=TaskResponse)
225
  async def generate_note(
@@ -243,45 +275,40 @@ async def generate_note(
243
  task_id,
244
  str(request.youtube_url),
245
  request.language,
246
- user_id
247
  )
248
 
249
  return TaskResponse(
250
  task_id=task_id,
251
  status="pending",
252
- message="Generation started successfully."
253
  )
254
 
 
255
  @router.get("/status/{task_id}")
256
  async def get_task_status(task_id: str):
257
  if task_id not in tasks:
258
  raise HTTPException(status_code=404, detail="Task not found")
259
  return tasks[task_id]
260
 
261
- # ==========================================
262
- # 🛠️ دالة المعالجة الموحدة (Simplified Logic)
263
- # ==========================================
264
 
265
  async def process_video_task(task_id: str, youtube_url: str, language: str, user_id: str):
266
  downloader = YouTubeDownloader()
267
-
268
  try:
269
- # Extract video ID for thumbnail
270
- video_id_match = re.search(r"(?:v=|youtu\.be/)([A-Za-z0-9_-]{11})", youtube_url)
271
- video_id = video_id_match.group(1) if video_id_match else ""
272
  video_title = "YouTube Video"
273
- try:
274
- import yt_dlp
275
- ydl_opts = {"quiet": True, "no_warnings": True, "skip_download": True, "socket_timeout": 10}
276
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
277
- info = ydl.extract_info(youtube_url, download=False)
278
- if info and info.get("title"):
279
- video_title = info["title"]
280
- logger.info(f"✅ Fetched real video title: {video_title}")
281
- except Exception as e:
282
- logger.warning(f"⚠️ Failed to fetch video title, using fallback: {e}")
283
-
284
- # Step 1: Generate the summary and key points (keep in memory)
285
  tasks[task_id]["status"] = "transcribing"
286
  tasks[task_id]["message"] = "Processing transcript through optimized pipeline..."
287
  transcript_text = downloader.get_transcript(youtube_url)
@@ -289,7 +316,11 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
289
  tasks[task_id]["status"] = "generating_notes"
290
  note_gen = NoteGenerator()
291
  summary_json = note_gen.generateSummary(transcript_text, video_title)
292
- video_duration = get_youtube_duration(youtube_url)
 
 
 
 
293
 
294
  final_markdown = note_gen.format_final_notes(
295
  note_gen.format_notes_to_markdown(summary_json),
@@ -300,61 +331,52 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
300
  )
301
 
302
  segments = summary_json.get("segments", [])
303
- key_points_list = []
304
- for seg in segments:
305
- if isinstance(seg, dict) and seg.get("key_insight"):
306
- key_points_list.append(seg["key_insight"])
 
307
 
308
- # Step 2: Generate the categories (keep in memory)
309
  from src.summarization.topic_classifier import classify_topics
 
310
  raw_topics = summary_json.get("topics", [])
311
  categories = classify_topics(raw_topics) if raw_topics else ["Education & Science"]
312
 
313
- # Step 3: Combine everything into ONE single dictionary
314
- note_data = {
315
- "userId": user_id,
316
- "videoUrl": youtube_url,
317
- "videoTitle": video_title,
318
- "notes": final_markdown,
319
- "thumbnail": f"https://img.youtube.com/vi/{video_id}/mqdefault.jpg" if video_id else "",
320
- "category": categories,
321
- "keyPoints": key_points_list,
322
- "createdAt": datetime.utcnow(),
323
- "updatedAt": datetime.utcnow(),
324
- "isFavorite": False,
325
- }
326
-
327
- # Step 4: Return data in memory without saving to DB yet
328
- # The Flutter app will perform the final save after user edits.
329
  tasks[task_id]["status"] = "completed"
 
330
  tasks[task_id]["notes"] = final_markdown
331
  tasks[task_id]["topics"] = categories
332
  tasks[task_id]["category"] = categories
333
  tasks[task_id]["keyPoints"] = key_points_list
334
- logger.info(f" Task {task_id} completed successfully!")
 
 
 
 
335
 
336
- except Exception as e:
337
- logger.error(f"Task {task_id} failed: {e}")
338
  tasks[task_id]["status"] = "failed"
339
- tasks[task_id]["message"] = str(e)
340
 
341
- # ==========================================
342
- # 📂 إدارة الملفات (File Management)
343
- # ==========================================
344
 
345
  @router.get("/generated", response_model=List[GeneratedNoteFile])
346
  async def list_generated_notes():
347
  notes = []
348
  output_dir = settings.output_dir
349
- if not output_dir.exists(): return []
 
350
 
351
  for file_path in output_dir.glob("*_notes.md"):
352
  stats = file_path.stat()
353
- notes.append(GeneratedNoteFile(
354
- filename=file_path.name,
355
- title=file_path.name.replace("_notes.md", ""),
356
- created_at=stats.st_mtime,
357
- size=stats.st_size,
358
- ))
359
- notes.sort(key=lambda x: x.created_at, reverse=True)
360
- return notes
 
 
 
 
 
 
 
 
1
  import json
2
+ import os
3
+ import re
4
  import urllib.request
5
+ import uuid
6
  from datetime import datetime
7
+ from typing import Dict, List
 
8
 
9
+ from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
10
+ from pydantic import BaseModel, HttpUrl
11
 
 
 
 
 
 
 
 
12
  from src.api.downloader import YouTubeDownloader
13
+ from src.auth.dependencies import get_current_user
14
+ from src.db.models import User
15
  from src.summarization.note_generator import NoteGenerator
16
+ from src.utils.config import settings
17
+ from src.utils.logger import setup_logger
18
 
19
  logger = setup_logger(__name__)
 
20
  router = APIRouter(tags=["Notes"])
21
 
 
22
  tasks: Dict[str, Dict] = {}
23
 
24
 
 
 
 
 
25
  def _extract_video_id(url: str) -> str:
26
  """Extract the 11-character YouTube video ID from any URL format."""
27
+ match = re.search(r"(?:v=|youtu\.be/|shorts/|embed/)([A-Za-z0-9_-]{11})", str(url))
28
+ return match.group(1) if match else ""
29
+
30
+
31
+ def _is_fast_fail_ssl_error(exc: Exception) -> bool:
32
+ error_text = str(exc)
33
+ return any(
34
+ marker in error_text
35
+ for marker in (
36
+ "UNEXPECTED_EOF_WHILE_READING",
37
+ "SSLEOFError",
38
+ "EOF occurred in violation of protocol",
39
+ )
40
+ )
41
 
42
 
43
def _fetch_video_info_via_ytdlp(url: str) -> Dict[str, object]:
    """Fetch the video's title and duration with a single yt-dlp metadata call.

    One lookup serves both fields so the request does not pay for repeated
    yt-dlp failures when YouTube is temporarily unreachable.

    Args:
        url: The video URL handed to yt-dlp.

    Returns:
        A dict with the keys ``attempted`` (always True), ``ok`` (True once
        yt-dlp returned without raising), ``title`` (empty string when
        missing), ``duration`` (whole seconds, 0 when missing) and
        ``ssl_failed`` (True when the failure matched an SSL EOF marker).
        Exceptions are caught and logged rather than propagated.
    """
    outcome: Dict[str, object] = dict(
        attempted=True,
        ok=False,
        title="",
        duration=0,
        ssl_failed=False,
    )

    try:
        import yt_dlp

        # Metadata only: nothing is downloaded, and the socket timeout is short.
        options = {
            "quiet": True,
            "no_warnings": True,
            "skip_download": True,
            "extract_flat": False,
            "socket_timeout": 8,
        }
        with yt_dlp.YoutubeDL(options) as ydl:
            metadata = ydl.extract_info(url, download=False) or {}

        outcome["ok"] = True
        outcome["title"] = metadata.get("title") or ""
        outcome["duration"] = int(metadata.get("duration") or 0)

        title = outcome["title"]
        seconds = outcome["duration"]
        if title:
            logger.info("Fetched real video title: %s", title)
        if seconds:
            logger.info("Reusing yt-dlp duration from metadata fetch: %ss", seconds)
    except Exception as error:
        outcome["ssl_failed"] = _is_fast_fail_ssl_error(error)
        logger.warning("Failed to fetch video metadata via yt-dlp: %s", error)

    return outcome
82
+
83
+
84
+ def _duration_via_ytdlp(url: str) -> int:
85
+ """Strategy 1: use yt-dlp metadata without downloading the video."""
86
  try:
87
  import yt_dlp
88
 
89
  ydl_opts = {
90
  "quiet": True,
91
  "no_warnings": True,
92
+ "skip_download": True,
93
  "extract_flat": False,
94
  "socket_timeout": 20,
95
  }
 
97
  info = ydl.extract_info(url, download=False)
98
  duration = info.get("duration")
99
  if duration and int(duration) > 0:
100
+ logger.info("[S1-ytdlp] duration=%ds", int(duration))
101
  return int(duration)
102
+ except Exception as exc:
103
+ logger.warning("[S1-ytdlp] failed: %s", exc)
104
  return 0
105
 
106
 
107
  def _duration_via_supadata(video_id: str) -> int:
108
  """
109
+ Strategy 2: use Supadata transcript segments and estimate duration from the
110
+ last segment timestamp.
 
111
  """
112
  api_key = os.environ.get("SUPADATA_API_KEY", "").strip()
113
  if not api_key:
114
  return 0
115
+
116
  try:
117
  api_url = (
118
  f"https://api.supadata.ai/v1/youtube/transcript"
119
  f"?url=https://www.youtube.com/watch?v={video_id}"
120
  )
121
+ req = urllib.request.Request(
122
+ api_url,
123
+ headers={
124
+ "x-api-key": api_key,
125
+ "User-Agent": (
126
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
127
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
128
+ "Chrome/124.0.0.0 Safari/537.36"
129
+ ),
130
+ },
131
+ )
132
  with urllib.request.urlopen(req, timeout=20) as resp:
133
  data = json.loads(resp.read())
 
134
  segments = data.get("segments") or data.get("content", [])
135
  if isinstance(segments, list) and segments:
136
  last = segments[-1]
 
138
  dur_ms = last.get("duration", 0) or last.get("dur", 0)
139
  total_s = (int(offset_ms) + int(dur_ms)) // 1000
140
  if total_s > 0:
141
+ logger.info("[S2-supadata] duration~%ds", total_s)
142
  return total_s
143
+ except Exception as exc:
144
+ logger.warning("[S2-supadata] failed: %s", exc)
145
  return 0
146
 
147
 
148
  def _duration_via_html_scrape(url: str) -> int:
149
+ """Strategy 3: scrape the watch page and parse duration hints."""
150
+ headers = {
 
 
 
 
151
  "User-Agent": (
152
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
153
  "AppleWebKit/537.36 (KHTML, like Gecko) "
 
164
  }
165
 
166
  try:
167
+ req = urllib.request.Request(url, headers=headers)
168
  with urllib.request.urlopen(req, timeout=15) as resp:
169
  html = resp.read().decode("utf-8", errors="ignore")
170
+ except Exception as exc:
171
+ logger.warning("[S3-scrape] HTTP fetch failed: %s", exc)
172
  return 0
173
 
174
+ match = re.search(r'"lengthSeconds"\s*:\s*"(\d+)"', html)
175
+ if match:
176
+ duration = int(match.group(1))
177
+ logger.info("[S3a-regex-quoted] duration=%ds", duration)
 
178
  return duration
179
 
180
+ match = re.search(r'"approxDurationMs"\s*:\s*"(\d+)"', html)
181
+ if match:
182
+ duration = int(match.group(1)) // 1000
183
+ logger.info("[S3b-approxMs] duration=%ds", duration)
 
184
  return duration
185
 
186
+ match = re.search(
 
187
  r"var\s+ytInitialPlayerResponse\s*=\s*(\{.*?\})\s*;",
188
  html,
189
  re.DOTALL,
190
  )
191
+ if match:
192
  try:
193
+ data = json.loads(match.group(1))
194
  seconds_str = data.get("videoDetails", {}).get("lengthSeconds", "")
195
  if seconds_str and str(seconds_str).isdigit():
196
  duration = int(seconds_str)
197
+ logger.info("[S3c-jsonParse] duration=%ds", duration)
198
  return duration
199
+ except (json.JSONDecodeError, AttributeError) as exc:
200
+ logger.warning("[S3c-jsonParse] JSON decode failed: %s", exc)
201
 
202
  return 0
203
 
204
 
205
def get_youtube_duration(
    url: str,
    preferred_duration: int = 0,
    skip_ytdlp: bool = False,
) -> int:
    """Fetch the YouTube video duration in seconds using a strategy waterfall.

    The first strategy that yields a positive value wins:
      1. ``preferred_duration`` supplied by the caller;
      2. yt-dlp metadata (unless ``skip_ytdlp`` is set);
      3. Supadata (only when a video ID can be extracted from ``url``);
      4. scraping the watch page.

    Args:
        url: The video URL.
        preferred_duration: An already-known duration; returned as-is when
            positive, before any lookup is attempted.
        skip_ytdlp: Skip the yt-dlp strategy, e.g. because an earlier yt-dlp
            call in the same request already failed.

    Returns:
        The duration in seconds, or 0 when every strategy fails.
    """
    # Short-circuit before any other work; a falsy value (0 or None) simply
    # falls through to the lookups.
    if preferred_duration and preferred_duration > 0:
        return preferred_duration

    if skip_ytdlp:
        logger.info("Skipping yt-dlp duration lookup because metadata fetch already failed earlier in this request.")
    else:
        duration = _duration_via_ytdlp(url)
        if duration > 0:
            return duration

    # The ID is only needed by the Supadata strategy, so extract it here.
    video_id = _extract_video_id(url)
    if video_id:
        duration = _duration_via_supadata(video_id)
        if duration > 0:
            return duration

    duration = _duration_via_html_scrape(url)
    if duration > 0:
        return duration

    logger.warning("[duration] All strategies exhausted for: %s", url)
    return 0
236
 
237
+
238
# Request payload carrying the video URL and a language code.
# NOTE(review): presumably the body of POST /generate — confirm against the handler.
class GenerateNotesRequest(BaseModel):
    youtube_url: HttpUrl  # validated as a URL by pydantic
    language: str = "en"  # defaults to "en" when omitted
241
 
242
+
243
# Response model of POST /generate: identifies the background task that was
# queued and reports its initial state.
class TaskResponse(BaseModel):
    task_id: str  # key used to poll /status/{task_id}
    status: str  # task state at response time
    message: str  # human-readable status text
247
 
248
+
249
# One entry in the GET /generated listing, describing a notes file on disk.
class GeneratedNoteFile(BaseModel):
    filename: str  # file name within the output directory
    title: str  # file name with the "_notes.md" part removed
    created_at: float  # populated from the file's st_mtime
    size: int  # populated from the file's st_size (bytes)
254
 
 
 
 
255
 
256
  @router.post("/generate", response_model=TaskResponse)
257
  async def generate_note(
 
275
  task_id,
276
  str(request.youtube_url),
277
  request.language,
278
+ user_id,
279
  )
280
 
281
  return TaskResponse(
282
  task_id=task_id,
283
  status="pending",
284
+ message="Generation started successfully.",
285
  )
286
 
287
+
288
# Return the in-memory record for a task, or respond 404 when the ID is unknown.
@router.get("/status/{task_id}")
async def get_task_status(task_id: str):
    try:
        return tasks[task_id]
    except KeyError:
        # Unknown task ID: report it as a missing resource.
        raise HTTPException(status_code=404, detail="Task not found") from None
293
 
 
 
 
294
 
295
  async def process_video_task(task_id: str, youtube_url: str, language: str, user_id: str):
296
  downloader = YouTubeDownloader()
297
+
298
  try:
299
+ video_id = _extract_video_id(youtube_url)
 
 
300
  video_title = "YouTube Video"
301
+ prefetched_duration = 0
302
+ skip_ytdlp_duration = False
303
+
304
+ video_info = _fetch_video_info_via_ytdlp(youtube_url)
305
+ if video_info["ok"]:
306
+ if video_info["title"]:
307
+ video_title = str(video_info["title"])
308
+ prefetched_duration = int(video_info["duration"])
309
+ else:
310
+ skip_ytdlp_duration = bool(video_info["attempted"] and not video_info["ok"])
311
+
 
312
  tasks[task_id]["status"] = "transcribing"
313
  tasks[task_id]["message"] = "Processing transcript through optimized pipeline..."
314
  transcript_text = downloader.get_transcript(youtube_url)
 
316
  tasks[task_id]["status"] = "generating_notes"
317
  note_gen = NoteGenerator()
318
  summary_json = note_gen.generateSummary(transcript_text, video_title)
319
+ video_duration = get_youtube_duration(
320
+ youtube_url,
321
+ preferred_duration=prefetched_duration,
322
+ skip_ytdlp=skip_ytdlp_duration,
323
+ )
324
 
325
  final_markdown = note_gen.format_final_notes(
326
  note_gen.format_notes_to_markdown(summary_json),
 
331
  )
332
 
333
  segments = summary_json.get("segments", [])
334
+ key_points_list = [
335
+ seg["key_insight"]
336
+ for seg in segments
337
+ if isinstance(seg, dict) and seg.get("key_insight")
338
+ ]
339
 
 
340
  from src.summarization.topic_classifier import classify_topics
341
+
342
  raw_topics = summary_json.get("topics", [])
343
  categories = classify_topics(raw_topics) if raw_topics else ["Education & Science"]
344
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  tasks[task_id]["status"] = "completed"
346
+ tasks[task_id]["message"] = "Generation completed successfully."
347
  tasks[task_id]["notes"] = final_markdown
348
  tasks[task_id]["topics"] = categories
349
  tasks[task_id]["category"] = categories
350
  tasks[task_id]["keyPoints"] = key_points_list
351
+ tasks[task_id]["videoTitle"] = video_title
352
+ tasks[task_id]["thumbnail"] = (
353
+ f"https://img.youtube.com/vi/{video_id}/mqdefault.jpg" if video_id else ""
354
+ )
355
+ logger.info("Task %s completed successfully", task_id)
356
 
357
+ except Exception as exc:
358
+ logger.error("Task %s failed: %s", task_id, exc)
359
  tasks[task_id]["status"] = "failed"
360
+ tasks[task_id]["message"] = str(exc)
361
 
 
 
 
362
 
363
@router.get("/generated", response_model=List[GeneratedNoteFile])
async def list_generated_notes():
    """List the generated ``*_notes.md`` files, most recently modified first.

    Returns:
        A list of ``GeneratedNoteFile`` entries, or an empty list when the
        configured output directory does not exist.
    """
    output_dir = settings.output_dir
    if not output_dir.exists():
        return []

    suffix = "_notes.md"
    notes = []
    for file_path in output_dir.glob(f"*{suffix}"):
        stats = file_path.stat()
        notes.append(
            GeneratedNoteFile(
                filename=file_path.name,
                # Strip only the trailing suffix; str.replace would also remove
                # an earlier "_notes.md" occurring inside the name.
                title=file_path.name.removesuffix(suffix),
                created_at=stats.st_mtime,
                size=stats.st_size,
            )
        )

    notes.sort(key=lambda item: item.created_at, reverse=True)
    return notes