Ahmed Mostafa committed on
Commit
bc3aca1
·
2 Parent(s): bd357e4c342700

feat: implement FastAPI structure, Supadata integration, and summarization schemas

Browse files
Dockerfile CHANGED
@@ -1,44 +1,25 @@
1
  # 1. اختيار النسخة الأساسية
2
  FROM python:3.10-slim
3
 
4
- # 2. تسطيب برامج النظام (ffmpeg للتعامل مع الصوت و curl لتحميل Node.js)
5
  RUN apt-get update && apt-get install -y --no-install-recommends \
6
  ffmpeg \
7
  curl \
8
- git \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- # 3. تسطيب Node.js 20 (مهم جداً عشان حل شفرات يوتيوب)
12
- RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
13
- && apt-get install -y nodejs \
14
- && rm -rf /var/lib/apt/lists/*
15
-
16
- # 4. تجهيز فولدر المشروع
17
  WORKDIR /app
18
 
19
- # 5. تسطيب مكتبات بايثون
20
  COPY requirements.txt .
21
  RUN pip install --no-cache-dir -r requirements.txt
22
 
23
- # 6. تحميل وبناء سيرفر bgutil (خبير فك الشفرات)
24
- ARG BGUTIL_VERSION=1.3.1
25
- RUN git clone --depth 1 --branch ${BGUTIL_VERSION} \
26
- https://github.com/Brainicism/bgutil-ytdlp-pot-provider.git \
27
- /opt/bgutil-provider \
28
- && cd /opt/bgutil-provider/server \
29
- && npm ci \
30
- && npx tsc \
31
- && echo "✅ bgutil POT server compiled successfully"
32
-
33
- # 7. تسطيب الـ Plugin اللي بيربط yt-dlp بالسيرفر اللي فوق
34
- RUN pip install --no-cache-dir "bgutil-ytdlp-pot-provider==${BGUTIL_VERSION}"
35
-
36
- # 8. نسخ باقي ملفات المشروع
37
  COPY . .
38
 
39
- # 9. تضبيط الصلاحيات عشان Hugging Face (مهم جداً عشان السيرفر ميدي لكش Error)
40
- RUN chown -R 1000:1000 /app /opt/bgutil-provider
41
  USER 1000
42
 
43
- # 10. أمر تشغيل السيرفر الأساسي
44
  CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  # 1. اختيار النسخة الأساسية
2
  FROM python:3.10-slim
3
 
4
+ # 2. تسطيب برامج النظام (ffmpeg للتعامل مع الصوت)
5
  RUN apt-get update && apt-get install -y --no-install-recommends \
6
  ffmpeg \
7
  curl \
 
8
  && rm -rf /var/lib/apt/lists/*
9
 
10
+ # 3. تجهيز فولدر المشروع
 
 
 
 
 
11
  WORKDIR /app
12
 
13
+ # 4. تسطيب مكتبات بايثون
14
  COPY requirements.txt .
15
  RUN pip install --no-cache-dir -r requirements.txt
16
 
17
+ # 5. نسخ باقي ملفات المشروع
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  COPY . .
19
 
20
+ # 6. تضبيط الصلاحيات عشان Hugging Face (مهم جداً عشان السيرفر ميدي لكش Error)
21
+ RUN chown -R 1000:1000 /app
22
  USER 1000
23
 
24
+ # 7. أمر تشغيل السيرفر الأساسي
25
  CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py CHANGED
@@ -1,16 +1,13 @@
1
  from contextlib import asynccontextmanager
2
  from fastapi import FastAPI
3
- from src.api.pot_server import pot_server # استدعاء المدير اللي عملناه
4
 
5
  @asynccontextmanager
6
  async def lifespan(app: FastAPI):
7
  # الجزء ده بيتنفذ أول ما السيرفر يفتح
8
- print("🚀 Starting POT solver server...")
9
- pot_server.start()
10
  yield
11
  # الجزء ده بيتنفذ لما السيرفر يقفل
12
- print("🛑 Stopping POT solver server...")
13
- pot_server.stop()
14
 
15
  # تعريف الـ app مع إضافة الـ lifespan
16
  app = FastAPI(lifespan=lifespan)
 
1
  from contextlib import asynccontextmanager
2
  from fastapi import FastAPI
 
3
 
4
  @asynccontextmanager
5
  async def lifespan(app: FastAPI):
6
  # الجزء ده بيتنفذ أول ما السيرفر يفتح
7
+ print("🚀 AIdea API starting up...")
 
8
  yield
9
  # الجزء ده بيتنفذ لما السيرفر يقفل
10
+ print("🛑 AIdea API shutting down...")
 
11
 
12
  # تعريف الـ app مع إضافة الـ lifespan
13
  app = FastAPI(lifespan=lifespan)
pyproject.toml CHANGED
@@ -6,7 +6,6 @@ readme = "README.md"
6
  requires-python = ">=3.10"
7
  dependencies = [
8
  "aiofiles==23.2.1",
9
- "assemblyai>=0.30.0",
10
  "asyncpg==0.31.0",
11
  "bcrypt==4.1.2",
12
  "email-validator>=2.3.0",
@@ -32,5 +31,10 @@ dependencies = [
32
  "torch>=2.10.0",
33
  "torchaudio>=2.10.0",
34
  "uvicorn[standard]==0.27.0",
35
- "yt-dlp==2024.12.23",
36
  ]
 
 
 
 
 
 
 
6
  requires-python = ">=3.10"
7
  dependencies = [
8
  "aiofiles==23.2.1",
 
9
  "asyncpg==0.31.0",
10
  "bcrypt==4.1.2",
11
  "email-validator>=2.3.0",
 
31
  "torch>=2.10.0",
32
  "torchaudio>=2.10.0",
33
  "uvicorn[standard]==0.27.0",
 
34
  ]
35
+
36
+ [tool.pyright]
37
+ # The project uses `src.xxx` imports resolved from the repo root,
38
+ # NOT from inside `src/`. Tell Pyright to add "." as an extra
39
+ # search path so it finds `src/` as a package.
40
+ extraPaths = ["."]
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  # --- YouTube Transcription Pipeline (The Waterfall Strategy) ---
2
  assemblyai>=0.30.0
3
  yt-dlp>=2025.05.22
@@ -6,6 +7,9 @@ youtube-transcript-api==0.6.2
6
  curl_cffi
7
 
8
  # --- AI, LLMs & Transcription Fallback ---
 
 
 
9
  openai-whisper==20250625
10
  torch
11
  torchaudio
@@ -41,4 +45,9 @@ firebase-admin==6.5.0
41
  dnspython
42
  pydub==0.25.1
43
  ffmpeg-python
44
- groq>=0.9.0
 
 
 
 
 
 
1
+ <<<<<<< HEAD
2
  # --- YouTube Transcription Pipeline (The Waterfall Strategy) ---
3
  assemblyai>=0.30.0
4
  yt-dlp>=2025.05.22
 
7
  curl_cffi
8
 
9
  # --- AI, LLMs & Transcription Fallback ---
10
+ =======
11
+ # --- AI, LLMs & Transcription ---
12
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
13
  openai-whisper==20250625
14
  torch
15
  torchaudio
 
45
  dnspython
46
  pydub==0.25.1
47
  ffmpeg-python
48
+ groq>=0.9.0
49
+ pytubefix
50
+
51
+ # --- ML & Recommendations ---
52
+ # keybert
53
+ # sentence-transformers
run.py CHANGED
@@ -17,25 +17,7 @@ logger = setup_logger(__name__)
17
 
18
  def check_environment():
19
  """Log key dependency versions to confirm runtime environment."""
20
- # Check Node.js
21
- try:
22
- node_version = subprocess.check_output(
23
- ["node", "--version"], stderr=subprocess.STDOUT
24
- ).decode().strip()
25
- logger.info(f"✅ Node.js available: {node_version} — yt-dlp JS challenges will be solved")
26
- except (subprocess.CalledProcessError, FileNotFoundError):
27
- logger.warning("❌ Node.js NOT found — yt-dlp will fail to solve JS challenges. Add 'nodejs' to Dockerfile.")
28
-
29
- # Check yt-dlp
30
- try:
31
- ytdlp_version = subprocess.check_output(
32
- ["yt-dlp", "--version"], stderr=subprocess.STDOUT
33
- ).decode().strip()
34
- logger.info(f"✅ yt-dlp version: {ytdlp_version}")
35
- except (subprocess.CalledProcessError, FileNotFoundError):
36
- logger.warning("❌ yt-dlp not found in PATH")
37
-
38
- # Check ffmpeg
39
  try:
40
  ffmpeg_out = subprocess.check_output(
41
  ["ffmpeg", "-version"], stderr=subprocess.STDOUT
@@ -44,6 +26,13 @@ def check_environment():
44
  except (subprocess.CalledProcessError, FileNotFoundError):
45
  logger.warning("❌ ffmpeg NOT found — audio extraction will fail")
46
 
 
 
 
 
 
 
 
47
 
48
  def run_server():
49
  """Start the FastAPI server with CORS enabled for Flutter Web."""
@@ -71,22 +60,13 @@ def run_server():
71
 
72
 
73
  def run_cli(youtube_url: str, output_file: str = None):
74
- from src.api.pot_server import pot_server
75
- from src.api.downloader import YouTubeDownloader
76
- # ... باقي الـ imports
77
 
78
  check_environment()
79
-
80
- # تشغيل خبير الشفرات قبل التحميل
81
- pot_server.start()
82
 
83
- try:
84
- # كود التحميل بتاعك هنا
85
- downloader = YouTubeDownloader()
86
- # ...
87
- finally:
88
- # قفل السيرفر بعد ما يخلص
89
- pot_server.stop()
90
 
91
 
92
  def main():
 
17
 
18
  def check_environment():
19
  """Log key dependency versions to confirm runtime environment."""
20
+ # Check ffmpeg (still used by audio processing utilities)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  try:
22
  ffmpeg_out = subprocess.check_output(
23
  ["ffmpeg", "-version"], stderr=subprocess.STDOUT
 
26
  except (subprocess.CalledProcessError, FileNotFoundError):
27
  logger.warning("❌ ffmpeg NOT found — audio extraction will fail")
28
 
29
+ # Verify Supadata API key is configured
30
+ supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
31
+ if supadata_key:
32
+ logger.info("✅ SUPADATA_API_KEY is set")
33
+ else:
34
+ logger.warning("❌ SUPADATA_API_KEY is NOT set — transcript extraction will fail")
35
+
36
 
37
  def run_server():
38
  """Start the FastAPI server with CORS enabled for Flutter Web."""
 
60
 
61
 
62
  def run_cli(youtube_url: str, output_file: str = None):
63
+ from src.transcription.downloader import YouTubeDownloader
 
 
64
 
65
  check_environment()
 
 
 
66
 
67
+ downloader = YouTubeDownloader()
68
+ transcript = downloader.get_transcript(youtube_url)
69
+ print(transcript)
 
 
 
 
70
 
71
 
72
  def main():
src/api/main.py CHANGED
@@ -4,8 +4,6 @@ from contextlib import asynccontextmanager
4
  from fastapi import FastAPI
5
  from fastapi.middleware.cors import CORSMiddleware
6
 
7
- # POT Server and Routers
8
- from src.api.pot_server import pot_server
9
  from src.api.auth_routes import router as auth_router
10
  from src.api.notes_routes import router as notes_router
11
  from src.api.recommendation_routes import router as recommendation_router
@@ -16,11 +14,9 @@ logger = setup_logger(__name__)
16
 
17
  @asynccontextmanager
18
  async def lifespan(app: FastAPI):
19
- print("🚀 Lifespan: Starting POT solver server (bgutil v1.3.1)...")
20
- pot_server.start()
21
  yield
22
- print("🛑 Lifespan: Stopping POT solver server...")
23
- pot_server.stop()
24
 
25
  app = FastAPI(
26
  title="AIdea API",
@@ -45,7 +41,6 @@ def read_root():
45
  return {
46
  "status": "online",
47
  "message": "Welcome to AIdea API! Everything is working perfectly.",
48
- "pot_server": "running"
49
  }
50
 
51
  @app.get("/health")
@@ -86,11 +81,9 @@ async def health_check():
86
  connectivity[url] = f"Failed: {repr(e)}"
87
 
88
  return {
89
- "status": "v6-online",
90
  "dnspython": has_dnspython,
91
  "dns": dns_results,
92
  "connectivity": connectivity,
93
- "pot_running": pot_server.is_running(),
94
  "timestamp": datetime.now()
95
  }
96
-
 
4
  from fastapi import FastAPI
5
  from fastapi.middleware.cors import CORSMiddleware
6
 
 
 
7
  from src.api.auth_routes import router as auth_router
8
  from src.api.notes_routes import router as notes_router
9
  from src.api.recommendation_routes import router as recommendation_router
 
14
 
15
  @asynccontextmanager
16
  async def lifespan(app: FastAPI):
17
+ logger.info("🚀 AIdea API starting up...")
 
18
  yield
19
+ logger.info("🛑 AIdea API shutting down...")
 
20
 
21
  app = FastAPI(
22
  title="AIdea API",
 
41
  return {
42
  "status": "online",
43
  "message": "Welcome to AIdea API! Everything is working perfectly.",
 
44
  }
45
 
46
  @app.get("/health")
 
81
  connectivity[url] = f"Failed: {repr(e)}"
82
 
83
  return {
84
+ "status": "v7-supadata-only",
85
  "dnspython": has_dnspython,
86
  "dns": dns_results,
87
  "connectivity": connectivity,
 
88
  "timestamp": datetime.now()
89
  }
 
src/api/notes_routes.py CHANGED
@@ -11,9 +11,20 @@ from typing import Dict, List
11
  from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
12
  from pydantic import BaseModel, HttpUrl
13
 
 
14
  from src.api.downloader import YouTubeDownloader
15
  from src.auth.dependencies import get_current_user
16
  from src.db.models import User
 
 
 
 
 
 
 
 
 
 
17
  from src.summarization.note_generator import NoteGenerator
18
  from src.transcription.whisper_transcriber import WhisperTranscriber
19
  from src.utils.config import settings
@@ -25,10 +36,16 @@ router = APIRouter(tags=["Notes"])
25
  tasks: Dict[str, Dict] = {}
26
 
27
 
 
28
  def _set_task_status(task_id: str, status: str, message: str) -> None:
29
  tasks[task_id]["status"] = status
30
  tasks[task_id]["message"] = message
31
 
 
 
 
 
 
32
 
33
  def _extract_video_id(url: str) -> str:
34
  """Extract the 11-character YouTube video ID from any URL format."""
@@ -36,6 +53,7 @@ def _extract_video_id(url: str) -> str:
36
  return match.group(1) if match else ""
37
 
38
 
 
39
  def _use_supadata_first_strategy() -> bool:
40
  return settings.youtube_transcript_strategy == "supadata_first"
41
 
@@ -50,12 +68,46 @@ def _is_fast_fail_ssl_error(exc: Exception) -> bool:
50
  "EOF occurred in violation of protocol",
51
  )
52
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
 
55
  def _duration_via_supadata(video_id: str) -> int:
56
  """
 
57
  Strategy 2: use Supadata transcript segments and estimate duration from the
58
  last segment timestamp.
 
 
 
 
 
59
  """
60
  api_key = os.environ.get("SUPADATA_API_KEY", "").strip()
61
  if not api_key:
@@ -66,6 +118,7 @@ def _duration_via_supadata(video_id: str) -> int:
66
  f"https://api.supadata.ai/v1/youtube/transcript"
67
  f"?url=https://www.youtube.com/watch?v={video_id}"
68
  )
 
69
  resp = curl_requests.get(
70
  api_url,
71
  headers={
@@ -149,6 +202,30 @@ def _duration_via_html_scrape(url: str) -> int:
149
  except (json.JSONDecodeError, AttributeError) as exc:
150
  logger.warning("[S3c-jsonParse] JSON decode failed: %s", exc)
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  return 0
153
 
154
 
@@ -158,6 +235,7 @@ def get_youtube_duration(
158
  strategy: str | None = None,
159
  ) -> int:
160
  """
 
161
  Robustly fetch the YouTube video duration in seconds using a waterfall (Supadata -> Scraping).
162
  """
163
  video_id = _extract_video_id(url)
@@ -175,12 +253,23 @@ def get_youtube_duration(
175
  return duration
176
 
177
  logger.warning("[duration] All strategies exhausted for: %s", url)
 
 
 
 
 
 
 
 
 
 
178
  return 0
179
 
180
 
181
  class GenerateNotesRequest(BaseModel):
182
  youtube_url: HttpUrl
183
  language: str = "en"
 
184
 
185
 
186
  class TaskResponse(BaseModel):
@@ -220,6 +309,10 @@ async def generate_note(
220
  str(request.youtube_url),
221
  request.language,
222
  user_id,
 
 
 
 
223
  )
224
 
225
  return TaskResponse(
@@ -236,10 +329,17 @@ async def get_task_status(task_id: str):
236
  return tasks[task_id]
237
 
238
 
239
- async def process_video_task(task_id: str, youtube_url: str, language: str, user_id: str):
 
 
 
 
 
 
240
  downloader = YouTubeDownloader()
241
 
242
  try:
 
243
  video_id = _extract_video_id(youtube_url)
244
  video_title = "YouTube Video"
245
 
@@ -280,6 +380,30 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
280
  "ai_processing",
281
  "Generating intelligent summary...",
282
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  note_gen = NoteGenerator()
284
  summary_json = note_gen.generateSummary(transcript_text, video_title)
285
  resolved_video_title = video_title
@@ -305,6 +429,10 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
305
  if isinstance(seg, dict) and seg.get("key_insight")
306
  ]
307
 
 
 
 
 
308
  from src.summarization.topic_classifier import classify_topics
309
 
310
  _set_task_status(
@@ -315,11 +443,17 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
315
  raw_topics = summary_json.get("topics", [])
316
  categories = classify_topics(raw_topics) if raw_topics else ["Education & Science"]
317
 
 
318
  _set_task_status(task_id, "complete", "Generation completed successfully.")
 
 
 
 
319
  tasks[task_id]["notes"] = final_markdown
320
  tasks[task_id]["topics"] = categories
321
  tasks[task_id]["category"] = categories
322
  tasks[task_id]["keyPoints"] = key_points_list
 
323
  tasks[task_id]["videoTitle"] = resolved_video_title
324
  tasks[task_id]["thumbnail"] = (
325
  f"https://img.youtube.com/vi/{video_id}/mqdefault.jpg" if video_id else ""
@@ -370,6 +504,25 @@ def _transcribe_audio_fallback(
370
  finally:
371
  if audio_path is not None:
372
  downloader.cleanup(audio_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
 
375
  @router.get("/generated", response_model=List[GeneratedNoteFile])
 
11
  from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
12
  from pydantic import BaseModel, HttpUrl
13
 
14
+ <<<<<<< HEAD
15
  from src.api.downloader import YouTubeDownloader
16
  from src.auth.dependencies import get_current_user
17
  from src.db.models import User
18
+ =======
19
+ from src.db.firebase import get_firebase_db
20
+ from src.db.models import User, Note
21
+ from src.auth.dependencies import get_current_user
22
+ from src.utils.logger import setup_logger
23
+ from src.utils.config import settings
24
+
25
+ # --- استدعاء أدوات المعالجة (النسخة الجديدة) ---
26
+ from src.transcription.downloader import YouTubeDownloader, NoTranscriptError
27
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
28
  from src.summarization.note_generator import NoteGenerator
29
  from src.transcription.whisper_transcriber import WhisperTranscriber
30
  from src.utils.config import settings
 
36
  tasks: Dict[str, Dict] = {}
37
 
38
 
39
+ <<<<<<< HEAD
40
  def _set_task_status(task_id: str, status: str, message: str) -> None:
41
  tasks[task_id]["status"] = status
42
  tasks[task_id]["message"] = message
43
 
44
+ =======
45
+ # ==========================================
46
+ # ⏱️ YouTube Duration & Metadata (Supadata-only)
47
+ # ==========================================
48
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
49
 
50
  def _extract_video_id(url: str) -> str:
51
  """Extract the 11-character YouTube video ID from any URL format."""
 
53
  return match.group(1) if match else ""
54
 
55
 
56
+ <<<<<<< HEAD
57
  def _use_supadata_first_strategy() -> bool:
58
  return settings.youtube_transcript_strategy == "supadata_first"
59
 
 
68
  "EOF occurred in violation of protocol",
69
  )
70
  )
71
+ =======
72
def _fetch_video_title(url: str) -> str:
    """
    Fetch the real video title via YouTube's oEmbed API.

    The video URL is percent-encoded before it is placed in the oEmbed query
    string, so URLs that carry their own query parameters (``&t=``,
    ``&list=``, ...) are forwarded intact instead of being split into
    separate oEmbed parameters.

    Args:
        url: The YouTube video URL.

    Returns:
        The stripped title reported by oEmbed, or ``"YouTube Video"`` when the
        request fails, the response cannot be parsed, or the title is empty.
    """
    # Local import so this block does not depend on a new file-level import.
    from urllib.parse import urlencode

    try:
        query = urlencode({"url": url, "format": "json"})
        oembed_url = f"https://www.youtube.com/oembed?{query}"
        req = urllib.request.Request(oembed_url, headers={
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            ),
        })
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        # `or ""` guards against an explicit JSON null title.
        title = (data.get("title") or "").strip()
        if title:
            logger.info("✅ Fetched video title via oEmbed: %s", title)
            return title
    except Exception as e:
        # Deliberately best-effort: any failure falls back to the generic title.
        logger.warning("⚠️ oEmbed title fetch failed, using fallback: %s", e)
    return "YouTube Video"
98
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
99
 
100
 
101
  def _duration_via_supadata(video_id: str) -> int:
102
  """
103
+ <<<<<<< HEAD
104
  Strategy 2: use Supadata transcript segments and estimate duration from the
105
  last segment timestamp.
106
+ =======
107
+ Fetch approximate video duration via the Supadata transcript API.
108
+ The last segment's offset gives a close approximation of the duration.
109
+ Returns duration in seconds, or 0 on failure.
110
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
111
  """
112
  api_key = os.environ.get("SUPADATA_API_KEY", "").strip()
113
  if not api_key:
 
118
  f"https://api.supadata.ai/v1/youtube/transcript"
119
  f"?url=https://www.youtube.com/watch?v={video_id}"
120
  )
121
+ <<<<<<< HEAD
122
  resp = curl_requests.get(
123
  api_url,
124
  headers={
 
202
  except (json.JSONDecodeError, AttributeError) as exc:
203
  logger.warning("[S3c-jsonParse] JSON decode failed: %s", exc)
204
 
205
+ =======
206
+ req = urllib.request.Request(api_url, headers={
207
+ "x-api-key": api_key,
208
+ "User-Agent": (
209
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
210
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
211
+ "Chrome/124.0.0.0 Safari/537.36"
212
+ ),
213
+ })
214
+ with urllib.request.urlopen(req, timeout=20) as resp:
215
+ data = json.loads(resp.read())
216
+ # Supadata returns segments with "offset" in ms — last one ≈ total duration
217
+ segments = data.get("segments") or data.get("content", [])
218
+ if isinstance(segments, list) and segments:
219
+ last = segments[-1]
220
+ offset_ms = last.get("offset", 0) or last.get("start", 0)
221
+ dur_ms = last.get("duration", 0) or last.get("dur", 0)
222
+ total_s = (int(offset_ms) + int(dur_ms)) // 1000
223
+ if total_s > 0:
224
+ logger.info("⏱️ [supadata] duration≈%ds", total_s)
225
+ return total_s
226
+ except Exception as e:
227
+ logger.warning("⚠️ [supadata] duration fetch failed: %s", e)
228
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
229
  return 0
230
 
231
 
 
235
  strategy: str | None = None,
236
  ) -> int:
237
  """
238
+ <<<<<<< HEAD
239
  Robustly fetch the YouTube video duration in seconds using a waterfall (Supadata -> Scraping).
240
  """
241
  video_id = _extract_video_id(url)
 
253
  return duration
254
 
255
  logger.warning("[duration] All strategies exhausted for: %s", url)
256
+ =======
257
+ Fetch the YouTube video duration in seconds via Supadata.
258
+ Returns 0 if the duration cannot be determined.
259
+ """
260
+ video_id = _extract_video_id(url)
261
+ if video_id:
262
+ return _duration_via_supadata(video_id)
263
+
264
+ logger.warning("⚠️ [duration] Could not extract video ID from: %s", url)
265
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
266
  return 0
267
 
268
 
269
  class GenerateNotesRequest(BaseModel):
270
  youtube_url: HttpUrl
271
  language: str = "en"
272
+ deep_scan: bool = False
273
 
274
 
275
  class TaskResponse(BaseModel):
 
309
  str(request.youtube_url),
310
  request.language,
311
  user_id,
312
+ <<<<<<< HEAD
313
+ =======
314
+ request.deep_scan,
315
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
316
  )
317
 
318
  return TaskResponse(
 
329
  return tasks[task_id]
330
 
331
 
332
+ async def process_video_task(
333
+ task_id: str,
334
+ youtube_url: str,
335
+ language: str,
336
+ user_id: str,
337
+ deep_scan: bool = False,
338
+ ):
339
  downloader = YouTubeDownloader()
340
 
341
  try:
342
+ <<<<<<< HEAD
343
  video_id = _extract_video_id(youtube_url)
344
  video_title = "YouTube Video"
345
 
 
380
  "ai_processing",
381
  "Generating intelligent summary...",
382
  )
383
+ =======
384
+ # Extract video ID for thumbnail
385
+ video_id_match = re.search(r"(?:v=|youtu\.be/)([A-Za-z0-9_-]{11})", youtube_url)
386
+ video_id = video_id_match.group(1) if video_id_match else ""
387
+
388
+ # Fetch real video title via YouTube oEmbed API
389
+ video_title = _fetch_video_title(youtube_url)
390
+
391
+ # ── TRANSCRIPT EXTRACTION ───────────────────────────────────
392
+ if deep_scan:
393
+ # Deep Scan: download audio → Groq Whisper
394
+ tasks[task_id]["status"] = "transcribing"
395
+ tasks[task_id]["message"] = "Deep Scan: downloading audio..."
396
+ transcript_text = downloader.deep_scan_transcript(youtube_url)
397
+ else:
398
+ # Default: fast Supadata subtitle extraction
399
+ tasks[task_id]["status"] = "transcribing"
400
+ tasks[task_id]["message"] = "Fetching transcript via Supadata..."
401
+ transcript_text = downloader.get_transcript(youtube_url)
402
+
403
+ # ── AI SUMMARIZATION ────────────────────────────────────────
404
+ tasks[task_id]["status"] = "generating_notes"
405
+ tasks[task_id]["message"] = "AI is generating your notes..."
406
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
407
  note_gen = NoteGenerator()
408
  summary_json = note_gen.generateSummary(transcript_text, video_title)
409
  resolved_video_title = video_title
 
429
  if isinstance(seg, dict) and seg.get("key_insight")
430
  ]
431
 
432
+ <<<<<<< HEAD
433
+ =======
434
+ # ── CATEGORIZATION ──────────────────────────────────────────
435
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
436
  from src.summarization.topic_classifier import classify_topics
437
 
438
  _set_task_status(
 
443
  raw_topics = summary_json.get("topics", [])
444
  categories = classify_topics(raw_topics) if raw_topics else ["Education & Science"]
445
 
446
+ <<<<<<< HEAD
447
  _set_task_status(task_id, "complete", "Generation completed successfully.")
448
+ =======
449
+ # ── RETURN RESULTS ──────────────────────────────────────────
450
+ tasks[task_id]["status"] = "completed"
451
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
452
  tasks[task_id]["notes"] = final_markdown
453
  tasks[task_id]["topics"] = categories
454
  tasks[task_id]["category"] = categories
455
  tasks[task_id]["keyPoints"] = key_points_list
456
+ <<<<<<< HEAD
457
  tasks[task_id]["videoTitle"] = resolved_video_title
458
  tasks[task_id]["thumbnail"] = (
459
  f"https://img.youtube.com/vi/{video_id}/mqdefault.jpg" if video_id else ""
 
504
  finally:
505
  if audio_path is not None:
506
  downloader.cleanup(audio_path)
507
+ =======
508
+ tasks[task_id]["suggestedCategory"] = summary_json.get("suggested_category", "")
509
+ logger.info("✅ Task %s completed successfully!", task_id)
510
+
511
+ except NoTranscriptError as e:
512
+ # Video has no subtitles — signal the frontend to offer Deep Scan
513
+ logger.warning("⚠️ Task %s: no transcript available — %s", task_id, e)
514
+ tasks[task_id]["status"] = "failed"
515
+ tasks[task_id]["error_code"] = "NO_TRANSCRIPT"
516
+ tasks[task_id]["message"] = (
517
+ "This video does not have subtitles. "
518
+ "Use Deep Scan to extract text from the audio."
519
+ )
520
+
521
+ except Exception as e:
522
+ logger.error("❌ Task %s failed: %s", task_id, e)
523
+ tasks[task_id]["status"] = "failed"
524
+ tasks[task_id]["message"] = str(e)
525
+ >>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
526
 
527
 
528
  @router.get("/generated", response_model=List[GeneratedNoteFile])
src/audio/__pycache__/__init__.cpython-312.pyc DELETED
Binary file (168 Bytes)
 
src/audio/__pycache__/__init__.cpython-314.pyc DELETED
Binary file (170 Bytes)
 
src/audio/__pycache__/downloader.cpython-312.pyc DELETED
Binary file (7.2 kB)
 
src/audio/__pycache__/downloader.cpython-314.pyc DELETED
Binary file (8.06 kB)
 
src/recommendation/recommender.py CHANGED
@@ -1,86 +1,146 @@
1
  import asyncio
 
2
  from typing import List, Dict, Optional
3
  from googleapiclient.discovery import build
4
- from src import db
5
  from src.utils.logger import setup_logger
6
  import random
7
- import os
8
- from dotenv import load_dotenv
9
- load_dotenv()
10
 
11
  logger = setup_logger(__name__)
12
 
13
 
 
 
14
  class RecommendationService:
15
  """
16
  Service for suggesting videos based on user's saved notes.
17
- Uses YouTube Search API for recommendations.
 
 
 
18
  """
19
 
20
  def __init__(self, api_key: Optional[str] = None):
21
  self.api_key = "AIzaSyA3erB-Lxd5SOoBOXaumOCVaEr3TcgYG60"
22
  self.youtube = build("youtube", "v3", developerKey=self.api_key)
23
-
24
- async def get_recommendations_for_user(
25
- self, db, user_id: str, limit: int = 5
26
- ) -> List[Dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  """
28
- Get recommendations based on user's note history in Firebase.
29
  """
30
- logger.info(f"📚 Fetching notes for user: {user_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  try:
33
- notes_ref = (
34
- db.collection("notes")
35
- .where("userId", "==", user_id)
36
- .limit(10)
 
 
 
 
 
37
  )
38
- notes_docs = notes_ref.stream()
39
- notes = sorted(
40
- [doc.to_dict() for doc in notes_docs],
41
- key=lambda x: x.get("createdAt", 0),
42
- reverse=True
43
- )[:5]
44
- logger.info(f"📝 Found {len(notes)} notes for user")
 
45
  except Exception as e:
46
- logger.error(f" Failed to fetch notes from Firebase: {e}")
47
- notes = []
48
 
49
- if not notes:
50
- logger.info("⚠️ No notes found, returning general recommendations")
51
- return await self.get_youtube_recommendations("educational tutorials", limit)
52
-
53
- # Extract topics from note categories
54
- topics = []
55
- for n in notes[:5]:
56
- cat = n.get("category")
57
- if not cat:
58
- continue
59
- # check if cat is a list or a string
60
- if isinstance(cat, list):
61
- topics.extend([c for c in cat if c and c != "Uncategorized"])
62
- elif cat != "Uncategorized":
63
- topics.append(cat)
64
-
65
- if not topics:
66
- topics = [n.get("videoTitle", "") for n in notes[:3]]
67
-
68
- search_query = " ".join(topics[:3])
69
- logger.info(f"🔍 Search query built: {search_query}")
70
 
71
- return await self.get_youtube_recommendations(search_query, limit)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  async def get_youtube_recommendations(
74
  self, query: str, limit: int = 5
75
  ) -> List[Dict]:
76
- """
77
- Search YouTube for videos based on a query.
78
- """
79
  if not query:
80
  return []
81
 
82
- enhanced_query = f"{query} educational lecture tutorial"
83
- logger.info(f"🎬 Searching YouTube for: {enhanced_query}")
84
 
85
  try:
86
  loop = asyncio.get_event_loop()
@@ -90,10 +150,11 @@ class RecommendationService:
90
  .list(
91
  q=enhanced_query,
92
  part="snippet",
93
- maxResults=limit*3, # fetch more to filter later
94
  type="video",
95
  relevanceLanguage="en",
96
  videoEmbeddable="true",
 
97
  )
98
  .execute(),
99
  )
@@ -112,12 +173,82 @@ class RecommendationService:
112
  "type": "youtube_video",
113
  }
114
  )
115
- logger.info(f"✅ Found video: {snippet['title']}")
116
 
117
- logger.info(f"🚀 Total videos fetched: {len(videos)}")
118
  random.shuffle(videos)
119
- return videos[:limit]
 
 
120
 
121
  except Exception as e:
122
  logger.error(f"❌ YouTube search failed: {e}")
123
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import asyncio
2
+ from collections import Counter
3
  from typing import List, Dict, Optional
4
  from googleapiclient.discovery import build
 
5
  from src.utils.logger import setup_logger
6
  import random
7
+ # import anthropic
8
+ from groq import Groq
 
9
 
10
  logger = setup_logger(__name__)
11
 
12
 
13
+
14
+
15
  class RecommendationService:
16
  """
17
  Service for suggesting videos based on user's saved notes.
18
+ Pipeline:
19
+ 1. Top 3 most-repeated categories across all user notes
20
+ 2. Extract key keywords from the latest notes per category (via a Groq-hosted Llama model)
21
+ 3. Build a YouTube search query and return recommendations
22
  """
23
 
24
def __init__(self, api_key: Optional[str] = None):
    """
    Create the YouTube Data API client and the Groq client.

    Credentials are never embedded in source code:

    * the YouTube key is ``api_key`` when given, otherwise the
      ``YOUTUBE_API_KEY`` environment variable;
    * the Groq key is read from ``GROQ_API_KEY`` (the same variable the
      summarization module reads).

    When a key is missing the matching client is set to ``None`` and an
    error is logged instead of raising.

    NOTE: API keys that were previously committed in this method must be
    treated as leaked and rotated with the providers.

    Args:
        api_key: Optional YouTube Data API key overriding the environment.
    """
    import os  # local import: this module does not import os at file level

    self.api_key = (api_key or os.environ.get("YOUTUBE_API_KEY", "")).strip() or None
    if self.api_key:
        self.youtube = build("youtube", "v3", developerKey=self.api_key)
    else:
        logger.error("❌ YOUTUBE_API_KEY is not configured — YouTube search is disabled")
        self.youtube = None

    groq_key = os.environ.get("GROQ_API_KEY", "").strip()
    if groq_key:
        self.groq_client = Groq(api_key=groq_key)
    else:
        logger.error("❌ GROQ_API_KEY is not configured — keyword extraction is disabled")
        self.groq_client = None
28
+
29
+ # ──────────────────────────────────────────────
30
+ # Step 1: top 3 categories
31
+ # ──────────────────────────────────────────────
32
def _get_top_categories(self, notes: List[Dict], top_n: int = 3) -> List[str]:
    """
    Return the ``top_n`` most frequent categories across ``notes``.

    A note's ``"category"`` may be a single value or a list of values.
    Falsy entries and the literal ``"Uncategorized"`` are not counted.
    """

    def _labels(note: Dict) -> List:
        # Normalise the category field to a list; missing/empty -> nothing.
        raw = note.get("category")
        if not raw:
            return []
        return raw if isinstance(raw, list) else [raw]

    tally: Counter = Counter(
        label
        for note in notes
        for label in _labels(note)
        if label and label != "Uncategorized"
    )

    top = [label for label, _count in tally.most_common(top_n)]
    logger.info(f"🏆 Top categories: {top}")
    return top
47
+
48
+ # ──────────────────────────────────────────────
49
+ # Step 2: keywords from latest note per category
50
+ # ──────────────────────────────────────────────
51
+ def _latest_notes_per_category(
52
+ self, notes: List[Dict], categories: List[str], top_n: int = 2
53
+ ) -> Dict[str, List[Dict]]:
54
  """
55
+ return a dict mapping each category to its latest N notes, sorted by createdAt.
56
  """
57
+ buckets: Dict[str, List[Dict]] = {cat: [] for cat in categories}
58
+
59
+ for note in notes:
60
+ cat = note.get("category")
61
+ cats = cat if isinstance(cat, list) else [cat] if cat else []
62
+ for c in cats:
63
+ if c in buckets:
64
+ buckets[c].append(note)
65
+
66
+ # sort each category's notes by createdAt and keep top N
67
+ return {
68
+ cat: sorted(notes_list, key=lambda n: n.get("createdAt", 0), reverse=True)[:top_n]
69
+ for cat, notes_list in buckets.items()
70
+ }
71
+
72
+ async def _extract_keywords_with_claude(
73
+ self, notes: List[Dict], category: str # ← List بدل Dict
74
+ ) -> List[str]:
75
+
76
+ # combine all relevant text fields from the notes into one string for context
77
+ combined_content = "\n---\n".join([
78
+ note.get("content") or note.get("text") or note.get("videoTitle") or ""
79
+ for note in notes
80
+ ]).strip()
81
+
82
+ if not combined_content:
83
+ return [category]
84
+
85
+ prompt = (
86
+ f"You are a search-query assistant. "
87
+ f"Given the notes below (category: {category}), "
88
+ f"extract 3 to 5 concise English keywords or short phrases that best "
89
+ f"represent the core topic for a YouTube educational search. "
90
+ f"Reply with ONLY a JSON array of strings, no explanation.\n\n"
91
+ f"Notes:\n{combined_content[:2000]}" # ← زودي الحد شوية
92
+ )
93
 
94
  try:
95
+ loop = asyncio.get_event_loop()
96
+ # groq_client = Groq(api_key="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")
97
+ response = await loop.run_in_executor(
98
+ None,
99
+ lambda: self.groq_client.chat.completions.create(
100
+ model="llama-3.3-70b-versatile",
101
+ messages=[{"role": "user", "content": prompt}],
102
+ max_tokens=120,
103
+ )
104
  )
105
+ raw = response.choices[0].message.content.strip()
106
+ import json, re
107
+ # strip accidental markdown fences
108
+ raw = re.sub(r"```json|```", "", raw).strip()
109
+ keywords = json.loads(raw)
110
+ if isinstance(keywords, list):
111
+ logger.info(f"🔑 Keywords for '{category}': {keywords}")
112
+ return [str(k) for k in keywords[:5]]
113
  except Exception as e:
114
+ logger.warning(f"⚠️ Claude keyword extraction failed for '{category}': {e}")
 
115
 
116
+ return [category] # fallback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
+ # ──────────────────────────────────────────────
119
+ # Step 3: build query & search YouTube
120
+ # ──────────────────────────────────────────────
121
async def _build_search_query(
    self, category_keywords: Dict[str, List[str]]
) -> str:
    """
    Combine the keywords of every category into a single query string.

    The first two keywords of each category are collected in dict order,
    the combined list is capped at six terms, and the terms are joined
    with the literal text " OR ".
    """
    # NOTE(review): the YouTube Data API documents `|` as its OR operator
    # for the `q` parameter — confirm the literal " OR " behaves as intended.
    terms = [
        keyword
        for keywords in category_keywords.values()
        for keyword in keywords[:2]
    ]
    query = " OR ".join(terms[:6])  # cap is six terms, not a character count
    logger.info(f"🔍 Final search query: {query}")
    return query
134
 
135
  async def get_youtube_recommendations(
136
  self, query: str, limit: int = 5
137
  ) -> List[Dict]:
138
+ """Search YouTube for educational videos matching the query."""
 
 
139
  if not query:
140
  return []
141
 
142
+ enhanced_query = f"{query} tutorial "
143
+ logger.info(f"🎬 Searching YouTube: {enhanced_query}")
144
 
145
  try:
146
  loop = asyncio.get_event_loop()
 
150
  .list(
151
  q=enhanced_query,
152
  part="snippet",
153
+ maxResults=limit * 3,
154
  type="video",
155
  relevanceLanguage="en",
156
  videoEmbeddable="true",
157
+ videoDuration="medium",
158
  )
159
  .execute(),
160
  )
 
173
  "type": "youtube_video",
174
  }
175
  )
 
176
 
 
177
  random.shuffle(videos)
178
+ result = videos[:limit]
179
+ logger.info(f"✅ Returning {len(result)} recommendations")
180
+ return result
181
 
182
  except Exception as e:
183
  logger.error(f"❌ YouTube search failed: {e}")
184
+ return []
185
+
186
+ # ──────────────────────────────────────────────
187
+ # Main entry point
188
+ # ──────────────────────────────────────────────
189
async def get_recommendations_for_user(
    self, db, user_id: str, limit: int = 5
) -> List[Dict]:
    """
    Recommend YouTube videos based on the notes a user has saved.

    Pipeline:
      1. Load every document of the ``notes`` collection whose ``userId``
         equals ``user_id``.
      2. Pick the three most frequent categories.
      3. Extract keywords from the two latest notes of each category.
      4. Search YouTube once per category (two videos each), tag every
         video with its ``"category"``, shuffle and return them.

    When there are no notes or no usable categories, a generic
    "educational tutorials" search limited to ``limit`` is returned.

    Args:
        db: Firestore-style client exposing ``collection().where().stream()``.
        user_id: Identifier matched against the ``userId`` field.
        limit: Requested number of recommendations.

    Returns:
        A list of video dicts (possibly empty).
    """
    logger.info(f"📚 Fetching notes for user: {user_id}")

    def _load_notes() -> List[Dict]:
        # Synchronous, network-bound read; executed on a worker thread so
        # the event loop is not blocked while documents stream in.
        docs = (
            db.collection("notes")
            .where("userId", "==", user_id)
            .stream()
        )
        return [doc.to_dict() for doc in docs]

    # ── Fetch notes ──────────────────────────
    try:
        loop = asyncio.get_running_loop()
        notes = await loop.run_in_executor(None, _load_notes)
        logger.info(f"📝 Found {len(notes)} notes")
    except Exception as e:
        logger.error(f"❌ Firebase fetch failed: {e}")
        notes = []

    if not notes:
        logger.info("⚠️ No notes — falling back to general recommendations")
        return await self.get_youtube_recommendations("educational tutorials", limit)

    # ── Step 1: top 3 categories ─────────────
    top_categories = self._get_top_categories(notes, top_n=3)

    if not top_categories:
        logger.info("⚠️ No valid categories — falling back")
        return await self.get_youtube_recommendations("educational tutorials", limit)

    # ── Step 2: keywords via the LLM ─────────
    latest_notes = self._latest_notes_per_category(notes, top_categories, top_n=2)

    valid_categories = [
        cat for cat in top_categories
        if cat in latest_notes and latest_notes[cat]
    ]

    # The keyword requests are independent, so they run concurrently.
    keyword_results = await asyncio.gather(
        *(
            self._extract_keywords_with_claude(latest_notes[cat], cat)
            for cat in valid_categories
        )
    )

    # zip over the same list the tasks were built from keeps pairs aligned.
    category_keywords: Dict[str, List[str]] = dict(
        zip(valid_categories, keyword_results)
    )

    # ── Step 3: build query & recommend ──────
    # Searches stay sequential on purpose: they share one googleapiclient
    # service object, which is not safe to use from several threads at once.
    all_videos: List[Dict] = []

    for category, keywords in category_keywords.items():
        query = " ".join(keywords[:3])

        logger.info(f"🎯 Searching category: {category} | Query: {query}")

        videos = await self.get_youtube_recommendations(query, limit=2)

        for v in videos:
            v["category"] = category

        all_videos.extend(videos)

    random.shuffle(all_videos)

    # NOTE(review): the cap is `limit * 2`, so more than `limit` videos can
    # be returned (e.g. 6 for the default limit of 5) — confirm intended.
    return all_videos[:limit * 2]
src/services/__pycache__/categorizer.cpython-312.pyc DELETED
Binary file (2.53 kB)
 
src/summarization/note_generator.py CHANGED
@@ -1,6 +1,8 @@
1
  import json
2
  import os
3
- from typing import Dict, Optional
 
 
4
 
5
  from groq import Groq
6
  from pydantic import ValidationError
@@ -13,7 +15,27 @@ logger = setup_logger(__name__)
13
 
14
 
15
  # ─────────────────────────────────────────────────────────────────────────────
16
- # PROMPT TEMPLATES
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # ─────────────────────────────────────────────────────────────────────────────
18
 
19
  _SUMMARY_SYSTEM = """
@@ -24,7 +46,7 @@ LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
24
  - Detect the primary language of the transcript.
25
  - Every content field (title, summary, segments, conclusion) MUST be written entirely in that SAME detected language.
26
  - Do NOT mix languages. Arabic transcript -> everything in Arabic.
27
- - Only the "detected_language" field itself is stated in English (e.g. "Arabic").
28
 
29
  TIMELINE RULES — STRICTLY ENFORCED:
30
  - Divide the transcript into chronological segments that follow its natural progression.
@@ -42,6 +64,12 @@ TOPICS RULE:
42
  - Topics should be specific and descriptive (e.g. "Python", "Machine Learning", "Neural Networks").
43
  - Do NOT use generic fixed categories.
44
 
 
 
 
 
 
 
45
  CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
46
  DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
47
  {
@@ -57,7 +85,8 @@ DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
57
  }
58
  ],
59
  "conclusion": "Final overall takeaway / closing conclusion",
60
- "topics": ["Topic1", "Topic2", "Topic3"]
 
61
  }
62
 
63
  OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
@@ -76,6 +105,109 @@ Return ONLY the exact JSON structure requested.
76
  """.strip()
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  # ─────────────────────────────────────────────────────────────────────────────
80
  # LANGUAGE LABELS (simplified)
81
  # ─────────────────────────────────────────────────────────────────────────────
@@ -105,23 +237,121 @@ def _labels(language: str) -> dict:
105
  return _LABELS.get(language, _LABELS["English"])
106
 
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  # ─────────────────────────────────────────────────────────────────────────────
109
  # NOTE GENERATOR
110
  # ─────────────────────────────────────────────────────────────────────────────
111
 
112
  class NoteGenerator:
113
- """Generates structured study notes using Groq (Llama-3.3-70b-versatile)."""
 
 
 
 
 
 
 
 
 
 
114
 
115
  def __init__(self):
116
  self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
117
  self.client = Groq(api_key=self.api_key) if self.api_key else None
118
- self.model_id = "llama-3.3-70b-versatile"
119
- logger.info(f"🚀 NoteGenerator v4.0 initialized — model: {self.model_id}")
 
 
 
 
 
 
 
 
120
 
121
- def _chat(self, system: str, user: str, max_tokens: int = 4096) -> Optional[str]:
 
 
 
 
 
 
122
  try:
123
  response = self.client.chat.completions.create(
124
- model=self.model_id,
125
  max_tokens=max_tokens,
126
  temperature=0.3,
127
  response_format={"type": "json_object"},
@@ -132,9 +362,11 @@ class NoteGenerator:
132
  )
133
  return response.choices[0].message.content
134
  except Exception as e:
135
- logger.error(f"❌ Groq API call failed: {e}")
136
  return None
137
 
 
 
138
  def _get_error_json(self, error_msg: str) -> Dict:
139
  return {
140
  "title": "Error in Generation",
@@ -143,31 +375,208 @@ class NoteGenerator:
143
  "segments": [],
144
  "conclusion": "",
145
  "topics": [],
 
146
  }
147
 
148
- def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
149
- """Generate structured JSON summary from transcript."""
150
- if not self.client:
151
- return self._get_error_json("Groq API Key missing.")
 
152
 
153
- logger.info(f"📝 Summary generation started via {self.model_id}")
154
  user_prompt = _SUMMARY_USER.format(
155
  video_title=video_title,
156
- transcript=transcript_text[:30000],
157
  )
158
 
159
  raw = self._chat(_SUMMARY_SYSTEM, user_prompt, max_tokens=4096)
160
  if raw is None:
161
- return self._get_error_json("Groq API call failed.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
 
 
 
 
 
 
163
  try:
164
- data = json.loads(raw)
165
  validated = SummarySchema(**data)
166
  return validated.model_dump()
167
  except (json.JSONDecodeError, ValidationError) as e:
168
- logger.error(f"❌ Schema validation failed: {e}")
169
  return self._get_error_json(f"Validation Error: {str(e)}")
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  def format_notes_to_markdown(self, json_notes: Dict) -> str:
172
  """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
173
  lang = json_notes.get("detected_language", "English")
 
1
  import json
2
  import os
3
+ import re
4
+ import time
5
+ from typing import Dict, List, Optional
6
 
7
  from groq import Groq
8
  from pydantic import ValidationError
 
15
 
16
 
17
  # ─────────────────────────────────────────────────────────────────────────────
18
+ # CONFIGURATION
19
+ # ─────────────────────────────────────────────────────────────────────────────
20
+
21
+ # Token threshold: below this, a single API call is used.
22
+ _SINGLE_PASS_TOKEN_LIMIT = 8_000
23
+
24
+ # Target chunk size for MAP phase (tokens). Kept small so that
25
+ # prompt + chunk + response stays well under the 12K TPM free-tier limit.
26
+ _CHUNK_TARGET_TOKENS = 2_500
27
+
28
+ # Model — unified for both MAP and REDUCE phases.
29
+ # llama-3.3-70b-versatile has 12K TPM on the free tier (the highest).
30
+ _MODEL_PRIMARY = "llama-3.3-70b-versatile"
31
+
32
+ # Maximum retries when a rate-limit (413 / 429) is hit.
33
+ _RATE_LIMIT_MAX_RETRIES = 3
34
+ _RATE_LIMIT_SLEEP_SECONDS = 60
35
+
36
+
37
+ # ─────────────────────────────────────────────────────────────────────────────
38
+ # PROMPT TEMPLATES — SINGLE-PASS (unchanged)
39
  # ─────────────────────────────────────────────────────────────────────────────
40
 
41
  _SUMMARY_SYSTEM = """
 
46
  - Detect the primary language of the transcript.
47
  - Every content field (title, summary, segments, conclusion) MUST be written entirely in that SAME detected language.
48
  - Do NOT mix languages. Arabic transcript -> everything in Arabic.
49
+ - Only the "detected_language" and "suggested_category" fields are stated in English.
50
 
51
  TIMELINE RULES — STRICTLY ENFORCED:
52
  - Divide the transcript into chronological segments that follow its natural progression.
 
64
  - Topics should be specific and descriptive (e.g. "Python", "Machine Learning", "Neural Networks").
65
  - Do NOT use generic fixed categories.
66
 
67
+ CATEGORY RULE:
68
+ - Provide a single, concise category label (1-2 words max) in English.
69
+ - This should be the most accurate high-level category for the video content.
70
+ - Examples: "Programming", "Finance", "History", "Psychology", "Mathematics", "Cooking".
71
+ - The suggested_category MUST always be in English regardless of the transcript language.
72
+
73
  CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
74
  DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
75
  {
 
85
  }
86
  ],
87
  "conclusion": "Final overall takeaway / closing conclusion",
88
+ "topics": ["Topic1", "Topic2", "Topic3"],
89
+ "suggested_category": "Programming"
90
  }
91
 
92
  OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
 
105
  """.strip()
106
 
107
 
108
+ # ─────────────────────────────────────────────────────────────────────────────
109
+ # PROMPT TEMPLATES — MAP PHASE
110
+ # ─────────────────────────────────────────────────────────────────────────────
111
+
112
+ _MAP_SYSTEM = """
113
+ You are an expert educational content analyst.
114
+ You will receive ONE CHUNK of a longer video transcript.
115
+ Extract the key information from this chunk ONLY.
116
+
117
+ LANGUAGE RULE — CRITICAL:
118
+ - Detect the primary language of the text.
119
+ - Write ALL content fields in that SAME detected language.
120
+ - Only "detected_language" is stated in English.
121
+
122
+ Return a JSON object with this EXACT structure:
123
+ {
124
+ "detected_language": "English (or Arabic, etc.)",
125
+ "chunk_summary": "Concise summary of this chunk (3-5 sentences)",
126
+ "key_points": [
127
+ {
128
+ "title": "Short title for this point",
129
+ "detail": "1-2 sentence explanation",
130
+ "insight": "Key takeaway"
131
+ }
132
+ ],
133
+ "topics": ["Topic1", "Topic2"]
134
+ }
135
+
136
+ RULES:
137
+ - Extract 2-4 key points from this chunk.
138
+ - Topics should be specific (e.g. "Python", "Neural Networks"), not generic.
139
+ - OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
140
+ """.strip()
141
+
142
+ _MAP_USER = """
143
+ Video Title: {video_title}
144
+ Chunk {chunk_index} of {total_chunks}:
145
+
146
+ {chunk_text}
147
+
148
+ Extract the key information from this chunk. Return ONLY the JSON.
149
+ """.strip()
150
+
151
+
152
+ # ─────────────────────────────────────────────────────────────────────────────
153
+ # PROMPT TEMPLATES — REDUCE PHASE
154
+ # ─────────────────────────────────────────────────────────────────────────────
155
+
156
+ _REDUCE_SYSTEM = """
157
+ You are an expert educational content analyst and structured note-taking specialist.
158
+ You will receive INTERMEDIATE SUMMARIES from multiple chunks of a single video transcript.
159
+ Your job is to MERGE them into ONE final, cohesive, structured summary.
160
+
161
+ LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
162
+ - Use the detected language from the intermediate summaries.
163
+ - Every content field MUST be in that SAME language.
164
+ - Only "detected_language" and "suggested_category" are stated in English.
165
+
166
+ TIMELINE RULES — STRICTLY ENFORCED:
167
+ - Merge the chunk summaries into 3-7 chronological segments.
168
+ - Each segment MUST cover a distinct phase or theme; do NOT repeat topics.
169
+ - Segments must follow the natural progression of the video.
170
+ - Each segment must include: title, summary, key_insight, why_it_matters.
171
+
172
+ CATEGORY RULE:
173
+ - Provide a single, concise category label (1-2 words max) in English.
174
+ - This should be the most accurate high-level category for the video content.
175
+ - Examples: "Programming", "Finance", "History", "Psychology", "Mathematics", "Cooking".
176
+ - The suggested_category MUST always be in English regardless of the transcript language.
177
+
178
+ CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
179
+ {
180
+ "title": "Inferred video title in transcript language",
181
+ "detected_language": "English (or Arabic, etc.)",
182
+ "summary": "Concise overall summary (3-5 sentences)",
183
+ "segments": [
184
+ {
185
+ "title": "Segment title",
186
+ "summary": "What this section covers (2-3 sentences)",
187
+ "key_insight": "Most important point from this section",
188
+ "why_it_matters": "Why this is valuable (1-2 sentences)"
189
+ }
190
+ ],
191
+ "conclusion": "Final overall takeaway / closing conclusion",
192
+ "topics": ["Topic1", "Topic2", "Topic3"],
193
+ "suggested_category": "Programming"
194
+ }
195
+
196
+ OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
197
+ """.strip()
198
+
199
+ _REDUCE_USER = """
200
+ Video Title: {video_title}
201
+
202
+ The following are intermediate summaries extracted from {total_chunks} consecutive chunks
203
+ of the video transcript. Merge them into ONE cohesive final summary.
204
+
205
+ {merged_summaries}
206
+
207
+ Merge into 3-7 chronological segments. Return ONLY the final JSON structure.
208
+ """.strip()
209
+
210
+
211
  # ─────────────────────────────────────────────────────────────────────────────
212
  # LANGUAGE LABELS (simplified)
213
  # ─────────────────────────────────────────────────────────────────────────────
 
237
  return _LABELS.get(language, _LABELS["English"])
238
 
239
 
240
+ # ─────────────────────────────────────────────────────────────────────────────
241
+ # TOKEN UTILITIES
242
+ # ─────────────────────────────────────────────────────────────────────────────
243
+
244
+ def _estimate_tokens(text: str) -> int:
245
+ """
246
+ Lightweight token estimation using a word-count heuristic.
247
+
248
+ Production logs show that Groq's tokenizer produces ~2.5 tokens per
249
+ whitespace-delimited word for Arabic / mixed-script transcripts.
250
+ Using 2.5× as a conservative multiplier to avoid underestimation.
251
+ """
252
+ word_count = len(text.split())
253
+ return int(word_count * 2.5)
254
+
255
+
256
def _split_into_chunks(text: str, target_tokens: int = _CHUNK_TARGET_TOKENS) -> List[str]:
    """
    Split ``text`` into chunks of roughly ``target_tokens`` estimated tokens.

    The text is first cut at sentence boundaries (whitespace after ``.``,
    ``!``, ``?`` or the Arabic question mark ``؟``) and at newlines, and
    whole sentences are packed greedily into chunks. A single sentence
    that is larger than the target — typical for unpunctuated
    auto-generated captions — is split into equal-sized word slices.

    Args:
        text: Transcript text to split.
        target_tokens: Approximate upper bound per chunk, measured with
            ``_estimate_tokens``.

    Returns:
        The chunks in original order; an empty list for blank input.
    """
    sentences = [
        piece.strip()
        for piece in re.split(r"(?<=[.!?؟])\s+|\n+", text)
        if piece.strip()
    ]

    chunks: List[str] = []
    pending: List[str] = []
    pending_tokens = 0

    for sentence in sentences:
        sentence_tokens = _estimate_tokens(sentence)

        if sentence_tokens > target_tokens:
            # Oversized sentence: flush what is pending, then slice by words.
            if pending:
                chunks.append(" ".join(pending))
                pending = []
                pending_tokens = 0

            words = sentence.split()
            # Size the slices from the sentence-level estimate. Estimating
            # word by word truncates each word's fractional share and lets
            # slices overshoot the target by ~25%.
            words_per_chunk = max(
                1, len(words) * target_tokens // max(sentence_tokens, 1)
            )
            chunks.extend(
                " ".join(words[start:start + words_per_chunk])
                for start in range(0, len(words), words_per_chunk)
            )
            continue

        if pending and pending_tokens + sentence_tokens > target_tokens:
            chunks.append(" ".join(pending))
            pending = [sentence]
            pending_tokens = sentence_tokens
        else:
            pending.append(sentence)
            pending_tokens += sentence_tokens

    # Don't forget the last, partially filled chunk.
    if pending:
        chunks.append(" ".join(pending))

    return chunks
312
+
313
+
314
  # ─────────────────────────────────────────────────────────────────────────────
315
  # NOTE GENERATOR
316
  # ─────────────────────────────────────────────────────────────────────────────
317
 
318
  class NoteGenerator:
319
+ """
320
+ Generates structured study notes using Groq.
321
+
322
+ Automatically selects between:
323
+ - **Single-pass**: for short transcripts (< 8K tokens)
324
+ - **Map-Reduce**: for long transcripts (≥ 8K tokens), splitting into
325
+ chunks, summarizing each individually, then merging in a REDUCE pass.
326
+
327
+ Uses a single model (llama-3.3-70b-versatile) for all phases and
328
+ includes adaptive rate-limit retry (60s backoff on 413/429).
329
+ """
330
 
331
def __init__(self):
    """
    Configure the Groq client, the model and the inter-call delay.

    Environment variables read:
      * ``GROQ_API_KEY`` — when empty, ``self.client`` is ``None``.
      * ``GROQ_CHUNK_DELAY_SECONDS`` — pause between consecutive API
        calls (default 3). A value that is not a finite number falls
        back to the default instead of breaking construction or a later
        ``time.sleep`` call.
    """
    self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
    self.client = Groq(api_key=self.api_key) if self.api_key else None
    self.model = _MODEL_PRIMARY

    default_delay = 3.0
    raw_delay = os.environ.get("GROQ_CHUNK_DELAY_SECONDS", "3")
    try:
        delay = float(raw_delay)
    except ValueError:
        delay = float("nan")
    # The chained comparison is False for NaN and for ±infinity.
    if not (float("-inf") < delay < float("inf")):
        logger.warning(
            "⚠️ Invalid GROQ_CHUNK_DELAY_SECONDS=%r — using %.1fs",
            raw_delay, default_delay,
        )
        delay = default_delay
    self.chunk_delay = delay

    logger.info(
        "🚀 NoteGenerator v5.1 initialized — model: %s, delay: %.1fs",
        self.model, self.chunk_delay,
    )
342
+
343
+ # ── Low-level API call ──────────────────────────────────────────────
344
 
345
+ def _chat(
346
+ self,
347
+ system: str,
348
+ user: str,
349
+ max_tokens: int = 4096,
350
+ ) -> Optional[str]:
351
+ """Send a chat completion request to Groq."""
352
  try:
353
  response = self.client.chat.completions.create(
354
+ model=self.model,
355
  max_tokens=max_tokens,
356
  temperature=0.3,
357
  response_format={"type": "json_object"},
 
362
  )
363
  return response.choices[0].message.content
364
  except Exception as e:
365
+ logger.error("❌ Groq API call failed (model=%s): %s", self.model, e)
366
  return None
367
 
368
+ # ── Error fallback ──────────────────────────────────────────────────
369
+
370
  def _get_error_json(self, error_msg: str) -> Dict:
371
  return {
372
  "title": "Error in Generation",
 
375
  "segments": [],
376
  "conclusion": "",
377
  "topics": [],
378
+ "suggested_category": "",
379
  }
380
 
381
+ # ── Single-pass summarization (short transcripts) ───────────────────
382
+
383
def _single_pass(self, transcript_text: str, video_title: str) -> Dict:
    """
    Summarize the whole transcript with a single chat completion.

    Returns the validated summary dict, or the error-shaped dict when the
    API call yields no response.
    """
    logger.info("📝 Single-pass summarization via %s", self.model)

    reply = self._chat(
        _SUMMARY_SYSTEM,
        _SUMMARY_USER.format(
            video_title=video_title,
            transcript=transcript_text,
        ),
        max_tokens=4096,
    )

    if reply is not None:
        return self._parse_and_validate(reply)
    return self._get_error_json("Groq API call failed (single-pass).")
397
+
398
+ # ── Map-Reduce summarization (long transcripts) ─────────────────────
399
+
400
def _map_reduce(self, transcript_text: str, video_title: str) -> Dict:
    """
    Summarize a long transcript: one MAP call per chunk, then one REDUCE call.

    MAP: the transcript is split with ``_split_into_chunks`` and each chunk
    is summarized on its own. A failed call is retried up to
    ``_RATE_LIMIT_MAX_RETRIES`` times with ``_RATE_LIMIT_SLEEP_SECONDS``
    between attempts; chunks that still fail, or that return something
    other than a JSON object, are skipped.

    REDUCE: the intermediate summaries are rendered as text and merged by
    a final call (same retry policy), whose reply is schema-validated.

    Returns:
        The validated summary dict, or the error-shaped dict when no chunk
        could be summarized or the REDUCE call keeps failing.
    """
    chunks = _split_into_chunks(transcript_text)
    total = len(chunks)
    logger.info(
        "🗺️ Map-Reduce activated: %d chunks (delay=%.1fs between calls)",
        total, self.chunk_delay,
    )

    # ── MAP PHASE ───────────────────────────────────────────────────
    intermediate_results: List[Dict] = []

    for i, chunk in enumerate(chunks, start=1):
        logger.info(
            " 📦 MAP chunk %d/%d (~%d est. tokens)...",
            i, total, _estimate_tokens(chunk),
        )

        user_prompt = _MAP_USER.format(
            video_title=video_title,
            chunk_index=i,
            total_chunks=total,
            chunk_text=chunk,
        )

        # _chat() returns None on ANY failure (it does not expose the
        # exception), so every failure is retried as if rate-limited.
        raw = None
        for attempt in range(1, _RATE_LIMIT_MAX_RETRIES + 1):
            raw = self._chat(_MAP_SYSTEM, user_prompt, max_tokens=2048)
            if raw is not None:
                break  # success
            if attempt == _RATE_LIMIT_MAX_RETRIES:
                break  # nothing left to retry — do not waste a sleep
            logger.warning(
                " ⚠️ MAP chunk %d/%d attempt %d/%d failed. "
                "Sleeping %ds for TPM window reset...",
                i, total, attempt, _RATE_LIMIT_MAX_RETRIES,
                _RATE_LIMIT_SLEEP_SECONDS,
            )
            time.sleep(_RATE_LIMIT_SLEEP_SECONDS)

        if raw:
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError as e:
                logger.warning(
                    " ⚠️ MAP chunk %d/%d returned invalid JSON: %s", i, total, e,
                )
            else:
                if isinstance(parsed, dict):
                    intermediate_results.append(parsed)
                    logger.info(" ✅ MAP chunk %d/%d done.", i, total)
                else:
                    # Valid JSON, but not the object the prompt asks for.
                    logger.warning(
                        " ⚠️ MAP chunk %d/%d returned invalid JSON: %s",
                        i, total, "expected a JSON object",
                    )
        else:
            logger.error(
                " ❌ MAP chunk %d/%d failed after %d retries. Skipping.",
                i, total, _RATE_LIMIT_MAX_RETRIES,
            )

        # Respect TPM limits — delay between consecutive API calls
        if i < total and self.chunk_delay > 0:
            logger.info(" ⏳ Sleeping %.1fs (TPM cooldown)...", self.chunk_delay)
            time.sleep(self.chunk_delay)

    if not intermediate_results:
        return self._get_error_json(
            "Map-Reduce failed: no chunks were successfully summarized."
        )

    # ── REDUCE PHASE ────────────────────────────────────────────────
    logger.info("🔗 REDUCE phase: merging %d intermediate summaries...", len(intermediate_results))

    merged_text = self._render_chunk_summaries(intermediate_results)
    logger.info("🔗 REDUCE input: ~%d tokens", _estimate_tokens(merged_text))

    user_prompt = _REDUCE_USER.format(
        video_title=video_title,
        total_chunks=len(intermediate_results),
        merged_summaries=merged_text,
    )

    # Sleep before REDUCE to ensure TPM cooldown from last MAP call
    if self.chunk_delay > 0:
        logger.info(" ⏳ Sleeping %.1fs before REDUCE call...", self.chunk_delay)
        time.sleep(self.chunk_delay)

    # REDUCE with the same retry policy as MAP
    raw = None
    for attempt in range(1, _RATE_LIMIT_MAX_RETRIES + 1):
        raw = self._chat(_REDUCE_SYSTEM, user_prompt, max_tokens=4096)
        if raw is not None:
            break
        if attempt == _RATE_LIMIT_MAX_RETRIES:
            break  # give up immediately after the final attempt
        logger.warning(
            " ⚠️ REDUCE attempt %d/%d failed. Sleeping %ds...",
            attempt, _RATE_LIMIT_MAX_RETRIES, _RATE_LIMIT_SLEEP_SECONDS,
        )
        time.sleep(_RATE_LIMIT_SLEEP_SECONDS)

    if raw is None:
        return self._get_error_json("Groq API call failed (REDUCE phase after retries).")

    return self._parse_and_validate(raw)

def _render_chunk_summaries(self, results: List[Dict]) -> str:
    """Render the MAP results as the plain-text input of the REDUCE prompt."""
    rendered: List[str] = []

    for idx, result in enumerate(results, start=1):
        key_points = result.get("key_points", [])
        topics = result.get("topics", [])
        # The model output is untrusted: ignore fields of the wrong shape.
        if not isinstance(key_points, list):
            key_points = []
        if not isinstance(topics, list):
            topics = []

        lines = [
            f"--- Chunk {idx} ---\n",
            f"Summary: {result.get('chunk_summary', '')}\n",
        ]
        for kp in key_points:
            if isinstance(kp, dict):
                lines.append(
                    f"- {kp.get('title', '')}: {kp.get('detail', '')} "
                    f"(Insight: {kp.get('insight', '')})\n"
                )
        lines.append(f"Topics: {', '.join(str(t) for t in topics)}\n")
        rendered.append("".join(lines))

    return "\n".join(rendered)
531
+
532
+ # ── JSON parsing + schema validation ────────────────────────────────
533
+
534
def _parse_and_validate(self, raw_json: str) -> Dict:
    """
    Parse the model reply and validate it against ``SummarySchema``.

    Args:
        raw_json: JSON text returned by the model.

    Returns:
        ``SummarySchema(...).model_dump()`` on success; otherwise the
        error-shaped dict from ``_get_error_json`` describing the failure.
        Nothing is raised for malformed JSON, for JSON that is not an
        object, or for schema violations.
    """
    try:
        data = json.loads(raw_json)
        if not isinstance(data, dict):
            # e.g. a bare list or string: `SummarySchema(**data)` would
            # otherwise raise a TypeError that escaped this handler.
            raise TypeError(
                f"expected a JSON object, got {type(data).__name__}"
            )
        validated = SummarySchema(**data)
        return validated.model_dump()
    except (json.JSONDecodeError, ValidationError, TypeError) as e:
        logger.error("❌ Schema validation failed: %s", e)
        return self._get_error_json(f"Validation Error: {str(e)}")
543
 
544
+ # ── Public API (unchanged signature) ────────────────────────────────
545
+
546
def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
    """
    Build the structured JSON summary for a transcript.

    The full single-pass prompt (system + user) is measured with
    ``_estimate_tokens``. Below ``_SINGLE_PASS_TOKEN_LIMIT`` the transcript
    is summarized in one call; otherwise the Map-Reduce pipeline is used.
    Without a configured client the error-shaped dict is returned.
    """
    if not self.client:
        return self._get_error_json("Groq API Key missing.")

    # Measure the prompt exactly as the single-pass path would send it.
    estimate = _estimate_tokens(
        _SUMMARY_SYSTEM
        + _SUMMARY_USER.format(
            video_title=video_title,
            transcript=transcript_text,
        )
    )

    logger.info(
        "📊 Token estimate: ~%d tokens (threshold: %d)",
        estimate, _SINGLE_PASS_TOKEN_LIMIT,
    )

    if estimate >= _SINGLE_PASS_TOKEN_LIMIT:
        logger.info(
            "⚡ Transcript too large for single-pass (%d ≥ %d). "
            "Activating Map-Reduce pipeline...",
            estimate, _SINGLE_PASS_TOKEN_LIMIT,
        )
        return self._map_reduce(transcript_text, video_title)

    return self._single_pass(transcript_text, video_title)
577
+
578
+ # ── Markdown formatting (unchanged) ─────────────────────────────────
579
+
580
  def format_notes_to_markdown(self, json_notes: Dict) -> str:
581
  """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
582
  lang = json_notes.get("detected_language", "English")
src/summarization/schemas.py CHANGED
@@ -81,4 +81,13 @@ class SummarySchema(BaseModel):
81
  "Dynamically extracted topics discussed in the video."
82
  " Examples: ['Python', 'Machine Learning', 'Neural Networks']."
83
  ),
 
 
 
 
 
 
 
 
 
84
  )
 
81
  "Dynamically extracted topics discussed in the video."
82
  " Examples: ['Python', 'Machine Learning', 'Neural Networks']."
83
  ),
84
+ )
85
+
86
+ suggested_category: str = Field(
87
+ ...,
88
+ description=(
89
+ "A single, concise category label (1-2 words max) that best"
90
+ " describes the video content. Must always be in English."
91
+ " Examples: 'Programming', 'Finance', 'History', 'Psychology'."
92
+ ),
93
  )
src/transcription/downloader.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import re
4
+ import json
5
+ import tempfile
6
+ import urllib.request
7
+
8
+ from groq import Groq
9
+ from pydub import AudioSegment
10
+
11
+
12
# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)

# Upload-size ceiling (bytes) for audio sent to Groq Whisper: 24 MB,
# a safe margin under the 25 MB free-tier limit.
_WHISPER_MAX_BYTES = 24 << 20
# Whisper model requested for every transcription call.
_WHISPER_MODEL = "whisper-large-v3-turbo"
17
+
18
+
19
+ # ─────────────────────────────────────────────────────────────────────────────
20
+ # Custom Exceptions
21
+ # ─────────────────────────────────────────────────────────────────────────────
22
+
23
class NoTranscriptError(RuntimeError):
    """Signal that a video exposes no subtitles or captions to fetch."""
26
+
27
+
28
+ # ─────────────────────────────────────────────────────────────────────────────
29
+ # YouTubeDownloader
30
+ # ─────────────────────────────────────────────────────────────────────────────
31
+
32
class YouTubeDownloader:
    """
    Obtain the transcript of a YouTube video.

    Two paths are offered:

    * ``get_transcript`` asks the Supadata API for existing subtitles.
    * ``deep_scan_transcript`` downloads the audio with pytubefix and
      transcribes it through Groq Whisper, splitting oversized files
      with pydub.

    API keys are read from the ``SUPADATA_API_KEY`` and ``GROQ_API_KEY``
    environment variables when the instance is created.
    """

    # Bitrate (kbit/s) used when re-encoding audio chunks to MP3.
    _CHUNK_BITRATE_KBPS = 64

    # Recognised URL shapes that are followed by the 11-character video ID.
    _VIDEO_ID_RE = re.compile(
        r"(?:v=|youtu\.be/|shorts/|embed/|live/)([A-Za-z0-9_-]{11})"
    )

    def __init__(self):
        self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
        self._groq_key = os.environ.get("GROQ_API_KEY", "").strip()

    # ── Primary path: Supadata transcript ─────────────────────────────

    def get_transcript(self, url: str) -> str:
        """
        Fetch the full transcript for a YouTube video via Supadata.

        Parameters
        ----------
        url : str
            Any YouTube URL from which a video ID can be extracted.

        Returns
        -------
        str
            The non-empty transcript text, stripped of surrounding
            whitespace.

        Raises
        ------
        NoTranscriptError
            If the video has no subtitles (Supadata returns empty content).
        RuntimeError
            If the URL holds no video ID, the API key is missing, the
            request fails, or the response is invalid.
        """
        # Local imports keep this method self-contained; the module only
        # imports ``urllib.request`` explicitly.
        from urllib.error import HTTPError, URLError
        from urllib.parse import urlencode

        video_id = self._require_video_id(url)
        logger.info("🔍 Fetching transcript for video ID: %s", video_id)

        if not self._supadata_key:
            raise RuntimeError(
                "SUPADATA_API_KEY is not set. "
                "Cannot fetch transcript without a valid API key."
            )

        clean_url = f"https://www.youtube.com/watch?v={video_id}"

        headers = {
            "x-api-key": self._supadata_key,
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            ),
        }

        # The nested URL contains "?", "=", ":" and "/", so it must be
        # percent-encoded to survive as a single query-string value.
        query = urlencode({"url": clean_url, "text": "true"})
        api_url = f"https://api.supadata.ai/v1/youtube/transcript?{query}"

        try:
            req = urllib.request.Request(api_url, headers=headers)
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.loads(resp.read())
        except HTTPError as e:
            logger.error("❌ Supadata HTTP error %d: %s", e.code, e.reason)
            raise RuntimeError(
                f"Supadata API returned HTTP {e.code} ({e.reason}) "
                f"for video {video_id}."
            ) from e
        except URLError as e:
            logger.error("❌ Supadata connection error: %s", e.reason)
            raise RuntimeError(
                f"Could not reach Supadata API: {e.reason}"
            ) from e
        except OSError as e:
            # Timeouts or resets while reading the body are not wrapped
            # in URLError by urllib.
            logger.error("❌ Supadata connection error: %s", e)
            raise RuntimeError(
                f"Could not reach Supadata API: {e}"
            ) from e
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            logger.error("❌ Supadata returned invalid JSON: %s", e)
            raise RuntimeError(
                "Supadata API returned a non-JSON response."
            ) from e

        if not isinstance(data, dict):
            raise RuntimeError(
                "Supadata API returned an unexpected response shape."
            )

        content = data.get("content")
        if content is not None and not isinstance(content, str):
            raise RuntimeError(
                "Supadata API returned a non-text 'content' field."
            )

        text = (content or "").strip()
        if not text:
            # Video exists but has no subtitles
            raise NoTranscriptError(
                f"No subtitles found for video {video_id}. "
                "Deep scan required to extract audio."
            )

        logger.info("✅ Supadata transcript fetched (%d chars)", len(text))
        return text

    # ── Deep Scan path: pytubefix + Groq Whisper ──────────────────────

    def deep_scan_transcript(self, url: str) -> str:
        """
        Download the video's audio and transcribe it via Groq Whisper.

        Uses pytubefix to download audio, pydub to chunk large files,
        and the Groq Whisper API for speech-to-text.

        Parameters
        ----------
        url : str
            Any YouTube URL from which a video ID can be extracted.

        Returns
        -------
        str
            The non-empty transcript text.

        Raises
        ------
        RuntimeError
            If the URL holds no video ID, the Groq key is missing, or
            download or transcription fails or yields no text.
        """
        video_id = self._require_video_id(url)
        logger.info("🎙️ Deep Scan started for video ID: %s", video_id)

        if not self._groq_key:
            raise RuntimeError(
                "GROQ_API_KEY is not set. Cannot perform deep scan."
            )

        groq_client = Groq(api_key=self._groq_key)

        # Everything downloaded or exported lives in a directory that is
        # removed when the block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            # Step 1: Download audio via pytubefix
            audio_path = self._download_audio(url, tmpdir)
            file_size = os.path.getsize(audio_path)
            logger.info(
                "📥 Audio downloaded: %s (%.1f MB)",
                audio_path, file_size / (1024 * 1024),
            )

            # Step 2: Chunk if needed, then transcribe
            if file_size <= _WHISPER_MAX_BYTES:
                transcript = self._transcribe_file(groq_client, audio_path)
            else:
                transcript = self._transcribe_chunked(
                    groq_client, audio_path, tmpdir
                )

        if not transcript.strip():
            raise RuntimeError(
                f"Deep scan produced an empty transcript for {video_id}."
            )

        logger.info("✅ Deep Scan complete (%d chars)", len(transcript))
        return transcript

    def _download_audio(self, url: str, output_dir: str) -> str:
        """
        Download the audio-only stream via pytubefix.

        Returns the path reported by the stream's ``download`` call.
        Raises RuntimeError when the URL holds no video ID, no audio
        stream exists, or the download fails.
        """
        video_id = self._require_video_id(url)
        try:
            # Imported lazily, as in the rest of the deep-scan path.
            from pytubefix import YouTube

            clean_url = f"https://www.youtube.com/watch?v={video_id}"
            yt = YouTube(clean_url)
            stream = yt.streams.get_audio_only()

            if stream is None:
                raise RuntimeError("No audio stream available for this video.")

            logger.info("⬇️ Downloading audio stream: %s", stream)
            return stream.download(output_path=output_dir)
        except Exception as e:
            # Boundary: every failure here is surfaced as RuntimeError.
            logger.error("❌ Audio download failed: %s", e)
            raise RuntimeError(
                f"Failed to download audio: {e}"
            ) from e

    def _transcribe_file(self, client: Groq, file_path: str) -> str:
        """
        Transcribe a single audio file via Groq Whisper.

        Returns the transcription as a string; any failure is re-raised
        as RuntimeError.
        """
        logger.info("🎤 Transcribing file: %s", os.path.basename(file_path))
        try:
            with open(file_path, "rb") as f:
                result = client.audio.transcriptions.create(
                    file=(os.path.basename(file_path), f.read()),
                    model=_WHISPER_MODEL,
                    response_format="text",
                    temperature=0.0,
                )
            return result if isinstance(result, str) else str(result)
        except Exception as e:
            logger.error("❌ Whisper transcription failed: %s", e)
            raise RuntimeError(
                f"Groq Whisper transcription failed: {e}"
            ) from e

    def _transcribe_chunked(
        self, client: Groq, file_path: str, tmpdir: str
    ) -> str:
        """
        Split a large audio file into MP3 chunks that stay under the
        Whisper size limit, transcribe each, and join the results.

        Parameters
        ----------
        client : Groq
            Client used for the transcription calls.
        file_path : str
            Path of the oversized audio file.
        tmpdir : str
            Directory in which the chunk files are written.

        Returns
        -------
        str
            Chunk transcripts joined with single spaces.
        """
        logger.info("✂️ Audio file too large — splitting into chunks...")

        # Load audio with pydub
        audio = AudioSegment.from_file(file_path)
        total_ms = len(audio)

        # The size of an exported chunk is governed by the MP3 export
        # bitrate, not by the source file's own bitrate, so the chunk
        # duration is derived from the export bitrate.
        bytes_per_ms = self._CHUNK_BITRATE_KBPS / 8  # kbit/s -> bytes/ms
        chunk_duration_ms = int(
            _WHISPER_MAX_BYTES / bytes_per_ms * 0.9  # 10% safety margin
        )
        export_bitrate = f"{self._CHUNK_BITRATE_KBPS}k"

        chunks_text = []
        chunk_index = 0
        offset = 0

        while offset < total_ms:
            chunk_end = min(offset + chunk_duration_ms, total_ms)
            chunk = audio[offset:chunk_end]
            chunk_index += 1

            chunk_path = os.path.join(tmpdir, f"chunk_{chunk_index}.mp3")
            chunk.export(chunk_path, format="mp3", bitrate=export_bitrate)
            chunk_size = os.path.getsize(chunk_path)

            logger.info(
                " 📦 Chunk %d: %d-%ds (%.1f MB)",
                chunk_index,
                offset // 1000,
                chunk_end // 1000,
                chunk_size / (1024 * 1024),
            )

            # Surrounding whitespace is dropped so the join below does
            # not produce doubled spaces; silent chunks are skipped.
            text = self._transcribe_file(client, chunk_path).strip()
            if text:
                chunks_text.append(text)

            offset = chunk_end

        logger.info(
            "✅ Transcribed %d chunks, total %d chars",
            len(chunks_text),
            sum(len(t) for t in chunks_text),
        )
        return " ".join(chunks_text)

    # ── Helpers ───────────────────────────────────────────────────────

    def _extract_video_id(self, url: str) -> str:
        """
        Extract the 11-character video ID from a YouTube URL.

        Returns the literal string "unknown" when no ID is found.
        """
        match = self._VIDEO_ID_RE.search(str(url))
        return match.group(1) if match else "unknown"

    def _require_video_id(self, url: str) -> str:
        """Return the video ID for ``url`` or raise RuntimeError if none is found."""
        video_id = self._extract_video_id(url)
        # "unknown" is the sentinel from _extract_video_id; it is only
        # 7 characters, so it can never be a real 11-character ID.
        if video_id == "unknown":
            raise RuntimeError(
                f"Could not extract a YouTube video ID from URL: {url!r}"
            )
        return video_id

    def cleanup(self, path=None):
        """Do nothing; ``path`` is accepted and ignored."""
        pass