Spaces:

ATInc1
/

AIdea-Server

Running

App Files Community

Ahmed Mostafa committed on 16 days ago

Commit

bc3aca1

2 Parent(s): bd357e4 c342700

feat: implement FastAPI structure, Supadata integration, and summarization schemas

Browse files

Files changed (16) hide show

Dockerfile +7 -26
main.py +2 -5
pyproject.toml +6 -2
requirements.txt +10 -1
run.py +12 -32
src/api/main.py +3 -10
src/api/notes_routes.py +154 -1
src/audio/__pycache__/__init__.cpython-312.pyc +0 -0
src/audio/__pycache__/__init__.cpython-314.pyc +0 -0
src/audio/__pycache__/downloader.cpython-312.pyc +0 -0
src/audio/__pycache__/downloader.cpython-314.pyc +0 -0
src/recommendation/recommender.py +187 -56
src/services/__pycache__/categorizer.cpython-312.pyc +0 -0
src/summarization/note_generator.py +428 -19
src/summarization/schemas.py +9 -0
src/transcription/downloader.py +265 -0

Dockerfile CHANGED Viewed

@@ -1,44 +1,25 @@
 # 1. اختيار النسخة الأساسية
 FROM python:3.10-slim
-# 2. تسطيب برامج النظام (ffmpeg للتعامل مع الصوت و curl لتحميل Node.js)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ffmpeg \
     curl \
-    git \
     && rm -rf /var/lib/apt/lists/*
-# 3. تسطيب Node.js 20 (مهم جداً عشان حل شفرات يوتيوب)
-RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
-    && apt-get install -y nodejs \
-    && rm -rf /var/lib/apt/lists/*
-# 4. تجهيز فولدر المشروع
 WORKDIR /app
-# 5. تسطيب مكتبات بايثون
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-# 6. تحميل وبناء سيرفر bgutil (خبير فك الشفرات)
-ARG BGUTIL_VERSION=1.3.1
-RUN git clone --depth 1 --branch ${BGUTIL_VERSION} \
-        https://github.com/Brainicism/bgutil-ytdlp-pot-provider.git \
-        /opt/bgutil-provider \
-    && cd /opt/bgutil-provider/server \
-    && npm ci \
-    && npx tsc \
-    && echo "✅ bgutil POT server compiled successfully"
-# 7. تسطيب الـ Plugin اللي بيربط yt-dlp بالسيرفر اللي فوق
-RUN pip install --no-cache-dir "bgutil-ytdlp-pot-provider==${BGUTIL_VERSION}"
-# 8. نسخ باقي ملفات المشروع
 COPY . .
-# 9. تضبيط الصلاحيات عشان Hugging Face (مهم جداً عشان السيرفر ميدي لكش Error)
-RUN chown -R 1000:1000 /app /opt/bgutil-provider
 USER 1000
-# 10. أمر تشغيل السيرفر الأساسي
 CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "7860"]

 # 1. اختيار النسخة الأساسية
 FROM python:3.10-slim
+# 2. تسطيب برامج النظام (ffmpeg للتعامل مع الصوت)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ffmpeg \
     curl \
     && rm -rf /var/lib/apt/lists/*
+# 3. تجهيز فولدر المشروع
 WORKDIR /app
+# 4. تسطيب مكتبات بايثون
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+# 5. نسخ باقي ملفات المشروع
 COPY . .
+# 6. تضبيط الصلاحيات عشان Hugging Face (مهم جداً عشان السيرفر ميدي لكش Error)
+RUN chown -R 1000:1000 /app
 USER 1000
+# 7. أمر تشغيل السيرفر الأساسي
 CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "7860"]

main.py CHANGED Viewed

@@ -1,16 +1,13 @@
 from contextlib import asynccontextmanager
 from fastapi import FastAPI
-from src.api.pot_server import pot_server # استدعاء المدير اللي عملناه
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     # الجزء ده بيتنفذ أول ما السيرفر يفتح
-    print("🚀 Starting POT solver server...")
-    pot_server.start()
     yield
     # الجزء ده بيتنفذ لما السيرفر يقفل
-    print("🛑 Stopping POT solver server...")
-    pot_server.stop()
 # تعريف الـ app مع إضافة الـ lifespan
 app = FastAPI(lifespan=lifespan)

 from contextlib import asynccontextmanager
 from fastapi import FastAPI
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     # الجزء ده بيتنفذ أول ما السيرفر يفتح
+    print("🚀 AIdea API starting up...")
     yield
     # الجزء ده بيتنفذ لما السيرفر يقفل
+    print("🛑 AIdea API shutting down...")
 # تعريف الـ app مع إضافة الـ lifespan
 app = FastAPI(lifespan=lifespan)

pyproject.toml CHANGED Viewed

@@ -6,7 +6,6 @@ readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
     "aiofiles==23.2.1",
-    "assemblyai>=0.30.0",
     "asyncpg==0.31.0",
     "bcrypt==4.1.2",
     "email-validator>=2.3.0",
@@ -32,5 +31,10 @@ dependencies = [
     "torch>=2.10.0",
     "torchaudio>=2.10.0",
     "uvicorn[standard]==0.27.0",
-    "yt-dlp==2024.12.23",
 ]

 requires-python = ">=3.10"
 dependencies = [
     "aiofiles==23.2.1",
     "asyncpg==0.31.0",
     "bcrypt==4.1.2",
     "email-validator>=2.3.0",
     "torch>=2.10.0",
     "torchaudio>=2.10.0",
     "uvicorn[standard]==0.27.0",
 ]
+[tool.pyright]
+# The project uses `src.xxx` imports resolved from the repo root,
+# NOT from inside `src/`.  Tell Pyright to add "." as an extra
+# search path so it finds `src/` as a package.
+extraPaths = ["."]

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 # --- YouTube Transcription Pipeline (The Waterfall Strategy) ---
 assemblyai>=0.30.0
 yt-dlp>=2025.05.22
@@ -6,6 +7,9 @@ youtube-transcript-api==0.6.2
 curl_cffi
 # --- AI, LLMs & Transcription Fallback ---
 openai-whisper==20250625
 torch
 torchaudio
@@ -41,4 +45,9 @@ firebase-admin==6.5.0
 dnspython
 pydub==0.25.1
 ffmpeg-python
-groq>=0.9.0

+<<<<<<< HEAD
 # --- YouTube Transcription Pipeline (The Waterfall Strategy) ---
 assemblyai>=0.30.0
 yt-dlp>=2025.05.22
 curl_cffi
 # --- AI, LLMs & Transcription Fallback ---
+=======
+# --- AI, LLMs & Transcription ---
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
 openai-whisper==20250625
 torch
 torchaudio
 dnspython
 pydub==0.25.1
 ffmpeg-python
+groq>=0.9.0
+pytubefix
+# --- ML & Recommendations ---
+# keybert
+# sentence-transformers

run.py CHANGED Viewed

@@ -17,25 +17,7 @@ logger = setup_logger(__name__)
 def check_environment():
     """Log key dependency versions to confirm runtime environment."""
-    # Check Node.js
-    try:
-        node_version = subprocess.check_output(
-            ["node", "--version"], stderr=subprocess.STDOUT
-        ).decode().strip()
-        logger.info(f"✅ Node.js available: {node_version} — yt-dlp JS challenges will be solved")
-    except (subprocess.CalledProcessError, FileNotFoundError):
-        logger.warning("❌ Node.js NOT found — yt-dlp will fail to solve JS challenges. Add 'nodejs' to Dockerfile.")
-    # Check yt-dlp
-    try:
-        ytdlp_version = subprocess.check_output(
-            ["yt-dlp", "--version"], stderr=subprocess.STDOUT
-        ).decode().strip()
-        logger.info(f"✅ yt-dlp version: {ytdlp_version}")
-    except (subprocess.CalledProcessError, FileNotFoundError):
-        logger.warning("❌ yt-dlp not found in PATH")
-    # Check ffmpeg
     try:
         ffmpeg_out = subprocess.check_output(
             ["ffmpeg", "-version"], stderr=subprocess.STDOUT
@@ -44,6 +26,13 @@ def check_environment():
     except (subprocess.CalledProcessError, FileNotFoundError):
         logger.warning("❌ ffmpeg NOT found — audio extraction will fail")
 def run_server():
     """Start the FastAPI server with CORS enabled for Flutter Web."""
@@ -71,22 +60,13 @@ def run_server():
 def run_cli(youtube_url: str, output_file: str = None):
-    from src.api.pot_server import pot_server
-    from src.api.downloader import YouTubeDownloader
-    # ... باقي الـ imports
     check_environment()
-    # تشغيل خبير الشفرات قبل التحميل
-    pot_server.start()
-    try:
-        # كود التحميل بتاعك هنا
-        downloader = YouTubeDownloader()
-        # ...
-    finally:
-        # قفل السيرفر بعد ما يخلص
-        pot_server.stop()
 def main():

 def check_environment():
     """Log key dependency versions to confirm runtime environment."""
+    # Check ffmpeg (still used by audio processing utilities)
     try:
         ffmpeg_out = subprocess.check_output(
             ["ffmpeg", "-version"], stderr=subprocess.STDOUT
     except (subprocess.CalledProcessError, FileNotFoundError):
         logger.warning("❌ ffmpeg NOT found — audio extraction will fail")
+    # Verify Supadata API key is configured
+    supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
+    if supadata_key:
+        logger.info("✅ SUPADATA_API_KEY is set")
+    else:
+        logger.warning("❌ SUPADATA_API_KEY is NOT set — transcript extraction will fail")
 def run_server():
     """Start the FastAPI server with CORS enabled for Flutter Web."""
 def run_cli(youtube_url: str, output_file: str = None):
+    from src.transcription.downloader import YouTubeDownloader
     check_environment()
+    downloader = YouTubeDownloader()
+    transcript = downloader.get_transcript(youtube_url)
+    print(transcript)
 def main():

src/api/main.py CHANGED Viewed

@@ -4,8 +4,6 @@ from contextlib import asynccontextmanager
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
-# POT Server and Routers
-from src.api.pot_server import pot_server
 from src.api.auth_routes import router as auth_router
 from src.api.notes_routes import router as notes_router
 from src.api.recommendation_routes import router as recommendation_router
@@ -16,11 +14,9 @@ logger = setup_logger(__name__)
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    print("🚀 Lifespan: Starting POT solver server (bgutil v1.3.1)...")
-    pot_server.start()
     yield
-    print("🛑 Lifespan: Stopping POT solver server...")
-    pot_server.stop()
 app = FastAPI(
     title="AIdea API",
@@ -45,7 +41,6 @@ def read_root():
     return {
         "status": "online",
         "message": "Welcome to AIdea API! Everything is working perfectly.",
-        "pot_server": "running"
     }
 @app.get("/health")
@@ -86,11 +81,9 @@ async def health_check():
                 connectivity[url] = f"Failed: {repr(e)}"
     return {
-        "status": "v6-online",
         "dnspython": has_dnspython,
         "dns": dns_results,
         "connectivity": connectivity,
-        "pot_running": pot_server.is_running(),
         "timestamp": datetime.now()
     }

 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from src.api.auth_routes import router as auth_router
 from src.api.notes_routes import router as notes_router
 from src.api.recommendation_routes import router as recommendation_router
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    logger.info("🚀 AIdea API starting up...")
     yield
+    logger.info("🛑 AIdea API shutting down...")
 app = FastAPI(
     title="AIdea API",
     return {
         "status": "online",
         "message": "Welcome to AIdea API! Everything is working perfectly.",
     }
 @app.get("/health")
                 connectivity[url] = f"Failed: {repr(e)}"
     return {
+        "status": "v7-supadata-only",
         "dnspython": has_dnspython,
         "dns": dns_results,
         "connectivity": connectivity,
         "timestamp": datetime.now()
     }

src/api/notes_routes.py CHANGED Viewed

@@ -11,9 +11,20 @@ from typing import Dict, List
 from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
 from pydantic import BaseModel, HttpUrl
 from src.api.downloader import YouTubeDownloader
 from src.auth.dependencies import get_current_user
 from src.db.models import User
 from src.summarization.note_generator import NoteGenerator
 from src.transcription.whisper_transcriber import WhisperTranscriber
 from src.utils.config import settings
@@ -25,10 +36,16 @@ router = APIRouter(tags=["Notes"])
 tasks: Dict[str, Dict] = {}
 def _set_task_status(task_id: str, status: str, message: str) -> None:
     tasks[task_id]["status"] = status
     tasks[task_id]["message"] = message
 def _extract_video_id(url: str) -> str:
     """Extract the 11-character YouTube video ID from any URL format."""
@@ -36,6 +53,7 @@ def _extract_video_id(url: str) -> str:
     return match.group(1) if match else ""
 def _use_supadata_first_strategy() -> bool:
     return settings.youtube_transcript_strategy == "supadata_first"
@@ -50,12 +68,46 @@ def _is_fast_fail_ssl_error(exc: Exception) -> bool:
             "EOF occurred in violation of protocol",
         )
     )
 def _duration_via_supadata(video_id: str) -> int:
     """
     Strategy 2: use Supadata transcript segments and estimate duration from the
     last segment timestamp.
     """
     api_key = os.environ.get("SUPADATA_API_KEY", "").strip()
     if not api_key:
@@ -66,6 +118,7 @@ def _duration_via_supadata(video_id: str) -> int:
             f"https://api.supadata.ai/v1/youtube/transcript"
             f"?url=https://www.youtube.com/watch?v={video_id}"
         )
         resp = curl_requests.get(
             api_url,
             headers={
@@ -149,6 +202,30 @@ def _duration_via_html_scrape(url: str) -> int:
         except (json.JSONDecodeError, AttributeError) as exc:
             logger.warning("[S3c-jsonParse] JSON decode failed: %s", exc)
     return 0
@@ -158,6 +235,7 @@ def get_youtube_duration(
     strategy: str | None = None,
 ) -> int:
     """
     Robustly fetch the YouTube video duration in seconds using a waterfall (Supadata -> Scraping).
     """
     video_id = _extract_video_id(url)
@@ -175,12 +253,23 @@ def get_youtube_duration(
         return duration
     logger.warning("[duration] All strategies exhausted for: %s", url)
     return 0
 class GenerateNotesRequest(BaseModel):
     youtube_url: HttpUrl
     language: str = "en"
 class TaskResponse(BaseModel):
@@ -220,6 +309,10 @@ async def generate_note(
         str(request.youtube_url),
         request.language,
         user_id,
     )
     return TaskResponse(
@@ -236,10 +329,17 @@ async def get_task_status(task_id: str):
     return tasks[task_id]
-async def process_video_task(task_id: str, youtube_url: str, language: str, user_id: str):
     downloader = YouTubeDownloader()
     try:
         video_id = _extract_video_id(youtube_url)
         video_title = "YouTube Video"
@@ -280,6 +380,30 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
             "ai_processing",
             "Generating intelligent summary...",
         )
         note_gen = NoteGenerator()
         summary_json = note_gen.generateSummary(transcript_text, video_title)
         resolved_video_title = video_title
@@ -305,6 +429,10 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
             if isinstance(seg, dict) and seg.get("key_insight")
         ]
         from src.summarization.topic_classifier import classify_topics
         _set_task_status(
@@ -315,11 +443,17 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
         raw_topics = summary_json.get("topics", [])
         categories = classify_topics(raw_topics) if raw_topics else ["Education & Science"]
         _set_task_status(task_id, "complete", "Generation completed successfully.")
         tasks[task_id]["notes"] = final_markdown
         tasks[task_id]["topics"] = categories
         tasks[task_id]["category"] = categories
         tasks[task_id]["keyPoints"] = key_points_list
         tasks[task_id]["videoTitle"] = resolved_video_title
         tasks[task_id]["thumbnail"] = (
             f"https://img.youtube.com/vi/{video_id}/mqdefault.jpg" if video_id else ""
@@ -370,6 +504,25 @@ def _transcribe_audio_fallback(
     finally:
         if audio_path is not None:
             downloader.cleanup(audio_path)
 @router.get("/generated", response_model=List[GeneratedNoteFile])

 from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
 from pydantic import BaseModel, HttpUrl
+<<<<<<< HEAD
 from src.api.downloader import YouTubeDownloader
 from src.auth.dependencies import get_current_user
 from src.db.models import User
+=======
+from src.db.firebase import get_firebase_db
+from src.db.models import User, Note
+from src.auth.dependencies import get_current_user
+from src.utils.logger import setup_logger
+from src.utils.config import settings
+# --- استدعاء أدوات المعالجة (النسخة الجديدة) ---
+from src.transcription.downloader import YouTubeDownloader, NoTranscriptError
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
 from src.summarization.note_generator import NoteGenerator
 from src.transcription.whisper_transcriber import WhisperTranscriber
 from src.utils.config import settings
 tasks: Dict[str, Dict] = {}
+<<<<<<< HEAD
 def _set_task_status(task_id: str, status: str, message: str) -> None:
     tasks[task_id]["status"] = status
     tasks[task_id]["message"] = message
+=======
+# ==========================================
+# ⏱️ YouTube Duration & Metadata (Supadata-only)
+# ==========================================
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
 def _extract_video_id(url: str) -> str:
     """Extract the 11-character YouTube video ID from any URL format."""
     return match.group(1) if match else ""
+<<<<<<< HEAD
 def _use_supadata_first_strategy() -> bool:
     return settings.youtube_transcript_strategy == "supadata_first"
             "EOF occurred in violation of protocol",
         )
     )
+=======
+def _fetch_video_title(url: str) -> str:
+    """
+    Fetch the real video title via YouTube's oEmbed API.
+    Falls back to 'YouTube Video' on any failure.
+    """
+    try:
+        oembed_url = (
+            f"https://www.youtube.com/oembed"
+            f"?url={url}&format=json"
+        )
+        req = urllib.request.Request(oembed_url, headers={
+            "User-Agent": (
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/124.0.0.0 Safari/537.36"
+            ),
+        })
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            data = json.loads(resp.read())
+            title = data.get("title", "").strip()
+            if title:
+                logger.info("✅ Fetched video title via oEmbed: %s", title)
+                return title
+    except Exception as e:
+        logger.warning("⚠️ oEmbed title fetch failed, using fallback: %s", e)
+    return "YouTube Video"
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
 def _duration_via_supadata(video_id: str) -> int:
     """
+<<<<<<< HEAD
     Strategy 2: use Supadata transcript segments and estimate duration from the
     last segment timestamp.
+=======
+    Fetch approximate video duration via the Supadata transcript API.
+    The last segment's offset gives a close approximation of the duration.
+    Returns duration in seconds, or 0 on failure.
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
     """
     api_key = os.environ.get("SUPADATA_API_KEY", "").strip()
     if not api_key:
             f"https://api.supadata.ai/v1/youtube/transcript"
             f"?url=https://www.youtube.com/watch?v={video_id}"
         )
+<<<<<<< HEAD
         resp = curl_requests.get(
             api_url,
             headers={
         except (json.JSONDecodeError, AttributeError) as exc:
             logger.warning("[S3c-jsonParse] JSON decode failed: %s", exc)
+=======
+        req = urllib.request.Request(api_url, headers={
+            "x-api-key": api_key,
+            "User-Agent": (
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/124.0.0.0 Safari/537.36"
+            ),
+        })
+        with urllib.request.urlopen(req, timeout=20) as resp:
+            data = json.loads(resp.read())
+            # Supadata returns segments with "offset" in ms — last one ≈ total duration
+            segments = data.get("segments") or data.get("content", [])
+            if isinstance(segments, list) and segments:
+                last = segments[-1]
+                offset_ms = last.get("offset", 0) or last.get("start", 0)
+                dur_ms = last.get("duration", 0) or last.get("dur", 0)
+                total_s = (int(offset_ms) + int(dur_ms)) // 1000
+                if total_s > 0:
+                    logger.info("⏱️ [supadata] duration≈%ds", total_s)
+                    return total_s
+    except Exception as e:
+        logger.warning("⚠️ [supadata] duration fetch failed: %s", e)
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
     return 0
     strategy: str | None = None,
 ) -> int:
     """
+<<<<<<< HEAD
     Robustly fetch the YouTube video duration in seconds using a waterfall (Supadata -> Scraping).
     """
     video_id = _extract_video_id(url)
         return duration
     logger.warning("[duration] All strategies exhausted for: %s", url)
+=======
+    Fetch the YouTube video duration in seconds via Supadata.
+    Returns 0 if the duration cannot be determined.
+    """
+    video_id = _extract_video_id(url)
+    if video_id:
+        return _duration_via_supadata(video_id)
+    logger.warning("⚠️ [duration] Could not extract video ID from: %s", url)
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
     return 0
 class GenerateNotesRequest(BaseModel):
     youtube_url: HttpUrl
     language: str = "en"
+    deep_scan: bool = False
 class TaskResponse(BaseModel):
         str(request.youtube_url),
         request.language,
         user_id,
+<<<<<<< HEAD
+=======
+        request.deep_scan,
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
     )
     return TaskResponse(
     return tasks[task_id]
+async def process_video_task(
+    task_id: str,
+    youtube_url: str,
+    language: str,
+    user_id: str,
+    deep_scan: bool = False,
+):
     downloader = YouTubeDownloader()
     try:
+<<<<<<< HEAD
         video_id = _extract_video_id(youtube_url)
         video_title = "YouTube Video"
             "ai_processing",
             "Generating intelligent summary...",
         )
+=======
+        # Extract video ID for thumbnail
+        video_id_match = re.search(r"(?:v=|youtu\.be/)([A-Za-z0-9_-]{11})", youtube_url)
+        video_id = video_id_match.group(1) if video_id_match else ""
+        # Fetch real video title via YouTube oEmbed API
+        video_title = _fetch_video_title(youtube_url)
+        # ── TRANSCRIPT EXTRACTION ───────────────────────────────────
+        if deep_scan:
+            # Deep Scan: download audio → Groq Whisper
+            tasks[task_id]["status"] = "transcribing"
+            tasks[task_id]["message"] = "Deep Scan: downloading audio..."
+            transcript_text = downloader.deep_scan_transcript(youtube_url)
+        else:
+            # Default: fast Supadata subtitle extraction
+            tasks[task_id]["status"] = "transcribing"
+            tasks[task_id]["message"] = "Fetching transcript via Supadata..."
+            transcript_text = downloader.get_transcript(youtube_url)
+        # ── AI SUMMARIZATION ────────────────────────────────────────
+        tasks[task_id]["status"] = "generating_notes"
+        tasks[task_id]["message"] = "AI is generating your notes..."
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
         note_gen = NoteGenerator()
         summary_json = note_gen.generateSummary(transcript_text, video_title)
         resolved_video_title = video_title
             if isinstance(seg, dict) and seg.get("key_insight")
         ]
+<<<<<<< HEAD
+=======
+        # ── CATEGORIZATION ──────────────────────────────────────────
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
         from src.summarization.topic_classifier import classify_topics
         _set_task_status(
         raw_topics = summary_json.get("topics", [])
         categories = classify_topics(raw_topics) if raw_topics else ["Education & Science"]
+<<<<<<< HEAD
         _set_task_status(task_id, "complete", "Generation completed successfully.")
+=======
+        # ── RETURN RESULTS ──────────────────────────────────────────
+        tasks[task_id]["status"] = "completed"
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
         tasks[task_id]["notes"] = final_markdown
         tasks[task_id]["topics"] = categories
         tasks[task_id]["category"] = categories
         tasks[task_id]["keyPoints"] = key_points_list
+<<<<<<< HEAD
         tasks[task_id]["videoTitle"] = resolved_video_title
         tasks[task_id]["thumbnail"] = (
             f"https://img.youtube.com/vi/{video_id}/mqdefault.jpg" if video_id else ""
     finally:
         if audio_path is not None:
             downloader.cleanup(audio_path)
+=======
+        tasks[task_id]["suggestedCategory"] = summary_json.get("suggested_category", "")
+        logger.info("✅ Task %s completed successfully!", task_id)
+    except NoTranscriptError as e:
+        # Video has no subtitles — signal the frontend to offer Deep Scan
+        logger.warning("⚠️ Task %s: no transcript available — %s", task_id, e)
+        tasks[task_id]["status"] = "failed"
+        tasks[task_id]["error_code"] = "NO_TRANSCRIPT"
+        tasks[task_id]["message"] = (
+            "This video does not have subtitles. "
+            "Use Deep Scan to extract text from the audio."
+        )
+    except Exception as e:
+        logger.error("❌ Task %s failed: %s", task_id, e)
+        tasks[task_id]["status"] = "failed"
+        tasks[task_id]["message"] = str(e)
+>>>>>>> c34270025bb017af990e7cf5ae0f19dfed0aaaf0
 @router.get("/generated", response_model=List[GeneratedNoteFile])

src/audio/__pycache__/__init__.cpython-312.pyc DELETED Viewed

Binary file (168 Bytes)

src/audio/__pycache__/__init__.cpython-314.pyc DELETED Viewed

Binary file (170 Bytes)

src/audio/__pycache__/downloader.cpython-312.pyc DELETED Viewed

Binary file (7.2 kB)

src/audio/__pycache__/downloader.cpython-314.pyc DELETED Viewed

Binary file (8.06 kB)

src/recommendation/recommender.py CHANGED Viewed

@@ -1,86 +1,146 @@
 import asyncio
 from typing import List, Dict, Optional
 from googleapiclient.discovery import build
-from src import db
 from src.utils.logger import setup_logger
 import random
-import os
-from dotenv import load_dotenv
-load_dotenv()
 logger = setup_logger(__name__)
 class RecommendationService:
     """
     Service for suggesting videos based on user's saved notes.
-    Uses YouTube Search API for recommendations.
     """
     def __init__(self, api_key: Optional[str] = None):
         self.api_key = "AIzaSyA3erB-Lxd5SOoBOXaumOCVaEr3TcgYG60"
         self.youtube = build("youtube", "v3", developerKey=self.api_key)
-    async def get_recommendations_for_user(
-        self, db, user_id: str, limit: int = 5
-    ) -> List[Dict]:
         """
-        Get recommendations based on user's note history in Firebase.
         """
-        logger.info(f"📚 Fetching notes for user: {user_id}")
         try:
-            notes_ref = (
-                db.collection("notes")
-                .where("userId", "==", user_id)
-                .limit(10)
             )
-            notes_docs = notes_ref.stream()
-            notes = sorted(
-                [doc.to_dict() for doc in notes_docs],
-                key=lambda x: x.get("createdAt", 0),
-                reverse=True
-            )[:5]
-            logger.info(f"📝 Found {len(notes)} notes for user")
         except Exception as e:
-            logger.error(f"❌ Failed to fetch notes from Firebase: {e}")
-            notes = []
-        if not notes:
-            logger.info("⚠️ No notes found, returning general recommendations")
-            return await self.get_youtube_recommendations("educational tutorials", limit)
-        # Extract topics from note categories
-        topics = []
-        for n in notes[:5]:
-            cat = n.get("category")
-            if not cat:
-                continue
-            # check if cat is a list or a string
-            if isinstance(cat, list):
-                topics.extend([c for c in cat if c and c != "Uncategorized"])
-            elif cat != "Uncategorized":
-                topics.append(cat)
-        if not topics:
-            topics = [n.get("videoTitle", "") for n in notes[:3]]
-        search_query = " ".join(topics[:3])
-        logger.info(f"🔍 Search query built: {search_query}")
-        return await self.get_youtube_recommendations(search_query, limit)
     async def get_youtube_recommendations(
         self, query: str, limit: int = 5
     ) -> List[Dict]:
-        """
-        Search YouTube for videos based on a query.
-        """
         if not query:
             return []
-        enhanced_query = f"{query} educational lecture tutorial"
-        logger.info(f"🎬 Searching YouTube for: {enhanced_query}")
         try:
             loop = asyncio.get_event_loop()
@@ -90,10 +150,11 @@ class RecommendationService:
                 .list(
                     q=enhanced_query,
                     part="snippet",
-                    maxResults=limit*3,  # fetch more to filter later
                     type="video",
                     relevanceLanguage="en",
                     videoEmbeddable="true",
                 )
                 .execute(),
             )
@@ -112,12 +173,82 @@ class RecommendationService:
                         "type": "youtube_video",
                     }
                 )
-                logger.info(f"✅ Found video: {snippet['title']}")
-            logger.info(f"🚀 Total videos fetched: {len(videos)}")
             random.shuffle(videos)
-            return videos[:limit]
         except Exception as e:
             logger.error(f"❌ YouTube search failed: {e}")
-            return []

 import asyncio
+from collections import Counter
 from typing import List, Dict, Optional
 from googleapiclient.discovery import build
 from src.utils.logger import setup_logger
 import random
+# import anthropic
+from groq import Groq
 logger = setup_logger(__name__)
 class RecommendationService:
     """
     Service for suggesting videos based on user's saved notes.
+    Pipeline:
+      1. Top 3 most-repeated categories across all user notes
+      2. Extract key keywords from the latest note per category (via Claude)
+      3. Build a YouTube search query and return recommendations
     """
     def __init__(self, api_key: Optional[str] = None):
         self.api_key = "AIzaSyA3erB-Lxd5SOoBOXaumOCVaEr3TcgYG60"
         self.youtube = build("youtube", "v3", developerKey=self.api_key)
+        self.groq_client = Groq(api_key="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")  # هنا
+    # ──────────────────────────────────────────────
+    # Step 1: top 3 categories
+    # ──────────────────────────────────────────────
+    def _get_top_categories(self, notes: List[Dict], top_n: int = 3) -> List[str]:
+        """Count category frequency across all notes and return the top N."""
+        counter: Counter = Counter()
+        for note in notes:
+            cat = note.get("category")
+            if not cat:
+                continue
+            cats = cat if isinstance(cat, list) else [cat]
+            for c in cats:
+                if c and c != "Uncategorized":
+                    counter[c] += 1
+        top = [cat for cat, _ in counter.most_common(top_n)]
+        logger.info(f"🏆 Top categories: {top}")
+        return top
+    # ──────────────────────────────────────────────
+    # Step 2: keywords from latest note per category
+    # ──────────────────────────────────────────────
+    def _latest_notes_per_category(
+        self, notes: List[Dict], categories: List[str], top_n: int = 2
+    ) -> Dict[str, List[Dict]]:
         """
+        return a dict mapping each category to its latest N notes, sorted by createdAt.
         """
+        buckets: Dict[str, List[Dict]] = {cat: [] for cat in categories}
+        for note in notes:
+            cat = note.get("category")
+            cats = cat if isinstance(cat, list) else [cat] if cat else []
+            for c in cats:
+                if c in buckets:
+                    buckets[c].append(note)
+        # sort each category's notes by createdAt and keep top N
+        return {
+            cat: sorted(notes_list, key=lambda n: n.get("createdAt", 0), reverse=True)[:top_n]
+            for cat, notes_list in buckets.items()
+        }
+    async def _extract_keywords_with_claude(
+        self, notes: List[Dict], category: str  # ← List بدل Dict
+    ) -> List[str]:
+        # combine all relevant text fields from the notes into one string for context
+        combined_content = "\n---\n".join([
+            note.get("content") or note.get("text") or note.get("videoTitle") or ""
+            for note in notes
+        ]).strip()
+        if not combined_content:
+            return [category]
+        prompt = (
+            f"You are a search-query assistant. "
+            f"Given the notes below (category: {category}), "
+            f"extract 3 to 5 concise English keywords or short phrases that best "
+            f"represent the core topic for a YouTube educational search. "
+            f"Reply with ONLY a JSON array of strings, no explanation.\n\n"
+            f"Notes:\n{combined_content[:2000]}"  # ← زودي الحد شوية
+        )
         try:
+            loop = asyncio.get_event_loop()
+            # groq_client = Groq(api_key="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")
+            response = await loop.run_in_executor(
+                None,
+                lambda: self.groq_client.chat.completions.create(
+                    model="llama-3.3-70b-versatile",
+                    messages=[{"role": "user", "content": prompt}],
+                    max_tokens=120,
+                )
             )
+            raw = response.choices[0].message.content.strip()
+            import json, re
+            # strip accidental markdown fences
+            raw = re.sub(r"```json|```", "", raw).strip()
+            keywords = json.loads(raw)
+            if isinstance(keywords, list):
+                logger.info(f"🔑 Keywords for '{category}': {keywords}")
+                return [str(k) for k in keywords[:5]]
         except Exception as e:
+            logger.warning(f"⚠️ Claude keyword extraction failed for '{category}': {e}")
+        return [category]  # fallback
+    # ──────────────────────────────────────────────
+    # Step 3: build query & search YouTube
+    # ──────────────────────────────────────────────
+    async def _build_search_query(
+        self, category_keywords: Dict[str, List[str]]
+    ) -> str:
+        """
+        Merge keywords from each top category into one balanced search query.
+        Takes up to 2 keywords per category to keep the query focused.
+        """
+        parts = []
+        for keywords in category_keywords.values():
+            parts.extend(keywords[:2])
+        query = " OR ".join(parts[:6])  # YouTube search works best under ~60 chars
+        logger.info(f"🔍 Final search query: {query}")
+        return query
     async def get_youtube_recommendations(
         self, query: str, limit: int = 5
     ) -> List[Dict]:
+        """Search YouTube for educational videos matching the query."""
         if not query:
             return []
+        enhanced_query = f"{query} tutorial "
+        logger.info(f"🎬 Searching YouTube: {enhanced_query}")
         try:
             loop = asyncio.get_event_loop()
                 .list(
                     q=enhanced_query,
                     part="snippet",
+                    maxResults=limit * 3,
                     type="video",
                     relevanceLanguage="en",
                     videoEmbeddable="true",
+                    videoDuration="medium",
                 )
                 .execute(),
             )
                         "type": "youtube_video",
                     }
                 )
             random.shuffle(videos)
+            result = videos[:limit]
+            logger.info(f"✅ Returning {len(result)} recommendations")
+            return result
         except Exception as e:
             logger.error(f"❌ YouTube search failed: {e}")
+            return []
+    # ──────────────────────────────────────────────
+    # Main entry point
+    # ──────────────────────────────────────────────
+    async def get_recommendations_for_user(
+        self, db, user_id: str, limit: int = 5
+    ) -> List[Dict]:
+        logger.info(f"📚 Fetching notes for user: {user_id}")
+        # ── Fetch notes ──────────────────────────
+        try:
+            notes_docs = (
+                db.collection("notes")
+                .where("userId", "==", user_id)
+                .stream()
+            )
+            notes = [doc.to_dict() for doc in notes_docs]
+            logger.info(f"📝 Found {len(notes)} notes")
+        except Exception as e:
+            logger.error(f"❌ Firebase fetch failed: {e}")
+            notes = []
+        if not notes:
+            logger.info("⚠️ No notes — falling back to general recommendations")
+            return await self.get_youtube_recommendations("educational tutorials", limit)
+        # ── Step 1: top 3 categories ─────────────
+        top_categories = self._get_top_categories(notes, top_n=3)
+        if not top_categories:
+            logger.info("⚠️ No valid categories — falling back")
+            return await self.get_youtube_recommendations("educational tutorials", limit)
+        # ── Step 2: keywords via Claude ──────────
+        latest_notes = self._latest_notes_per_category(notes, top_categories, top_n=2)
+        valid_categories = [
+            cat for cat in top_categories
+            if cat in latest_notes and latest_notes[cat]
+        ]
+        keyword_tasks = [
+            self._extract_keywords_with_claude(latest_notes[cat], cat)
+            for cat in valid_categories
+        ]
+        keyword_results = await asyncio.gather(*keyword_tasks)
+        category_keywords: Dict[str, List[str]] = {
+            cat: kws
+            for cat, kws in zip(valid_categories, keyword_results)  # ✅ zip على نفس الليست
+        }
+        # ── Step 3: build query & recommend ──────
+        all_videos = []
+        for category, keywords in category_keywords.items():
+            query = " ".join(keywords[:3])
+            logger.info(f"🎯 Searching category: {category} | Query: {query}")
+            videos = await self.get_youtube_recommendations(query, limit=2)
+            for v in videos:
+                v["category"] = category
+            all_videos.extend(videos)
+        random.shuffle(all_videos)
+        return all_videos[:limit * 2]

src/services/__pycache__/categorizer.cpython-312.pyc DELETED Viewed

Binary file (2.53 kB)

src/summarization/note_generator.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import json
 import os
-from typing import Dict, Optional
 from groq import Groq
 from pydantic import ValidationError
@@ -13,7 +15,27 @@ logger = setup_logger(__name__)
 # ─────────────────────────────────────────────────────────────────────────────
-# PROMPT TEMPLATES
 # ─────────────────────────────────────────────────────────────────────────────
 _SUMMARY_SYSTEM = """
@@ -24,7 +46,7 @@ LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
 - Detect the primary language of the transcript.
 - Every content field (title, summary, segments, conclusion) MUST be written entirely in that SAME detected language.
 - Do NOT mix languages. Arabic transcript -> everything in Arabic.
-- Only the "detected_language" field itself is stated in English (e.g. "Arabic").
 TIMELINE RULES — STRICTLY ENFORCED:
 - Divide the transcript into chronological segments that follow its natural progression.
@@ -42,6 +64,12 @@ TOPICS RULE:
 - Topics should be specific and descriptive (e.g. "Python", "Machine Learning", "Neural Networks").
 - Do NOT use generic fixed categories.
 CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
 DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
 {
@@ -57,7 +85,8 @@ DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
         }
     ],
     "conclusion": "Final overall takeaway / closing conclusion",
-    "topics": ["Topic1", "Topic2", "Topic3"]
 }
 OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
@@ -76,6 +105,109 @@ Return ONLY the exact JSON structure requested.
 """.strip()
 # ─────────────────────────────────────────────────────────────────────────────
 # LANGUAGE LABELS (simplified)
 # ─────────────────────────────────────────────────────────────────────────────
@@ -105,23 +237,121 @@ def _labels(language: str) -> dict:
     return _LABELS.get(language, _LABELS["English"])
 # ─────────────────────────────────────────────────────────────────────────────
 # NOTE GENERATOR
 # ─────────────────────────────────────────────────────────────────────────────
 class NoteGenerator:
-    """Generates structured study notes using Groq (Llama-3.3-70b-versatile)."""
     def __init__(self):
         self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
         self.client = Groq(api_key=self.api_key) if self.api_key else None
-        self.model_id = "llama-3.3-70b-versatile"
-        logger.info(f"🚀 NoteGenerator v4.0 initialized — model: {self.model_id}")
-    def _chat(self, system: str, user: str, max_tokens: int = 4096) -> Optional[str]:
         try:
             response = self.client.chat.completions.create(
-                model=self.model_id,
                 max_tokens=max_tokens,
                 temperature=0.3,
                 response_format={"type": "json_object"},
@@ -132,9 +362,11 @@ class NoteGenerator:
             )
             return response.choices[0].message.content
         except Exception as e:
-            logger.error(f"❌ Groq API call failed: {e}")
             return None
     def _get_error_json(self, error_msg: str) -> Dict:
         return {
             "title": "Error in Generation",
@@ -143,31 +375,208 @@ class NoteGenerator:
             "segments": [],
             "conclusion": "",
             "topics": [],
         }
-    def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
-        """Generate structured JSON summary from transcript."""
-        if not self.client:
-            return self._get_error_json("Groq API Key missing.")
-        logger.info(f"📝 Summary generation started via {self.model_id}")
         user_prompt = _SUMMARY_USER.format(
             video_title=video_title,
-            transcript=transcript_text[:30000],
         )
         raw = self._chat(_SUMMARY_SYSTEM, user_prompt, max_tokens=4096)
         if raw is None:
-            return self._get_error_json("Groq API call failed.")
         try:
-            data = json.loads(raw)
             validated = SummarySchema(**data)
             return validated.model_dump()
         except (json.JSONDecodeError, ValidationError) as e:
-            logger.error(f"❌ Schema validation failed: {e}")
             return self._get_error_json(f"Validation Error: {str(e)}")
     def format_notes_to_markdown(self, json_notes: Dict) -> str:
         """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
         lang = json_notes.get("detected_language", "English")

 import json
 import os
+import re
+import time
+from typing import Dict, List, Optional
 from groq import Groq
 from pydantic import ValidationError
 # ─────────────────────────────────────────────────────────────────────────────
+# CONFIGURATION
+# ─────────────────────────────────────────────────────────────────────────────
+# Token threshold: below this, a single API call is used.
+_SINGLE_PASS_TOKEN_LIMIT = 8_000
+# Target chunk size for MAP phase (tokens).  Kept small so that
+# prompt + chunk + response stays well under the 12K TPM free-tier limit.
+_CHUNK_TARGET_TOKENS = 2_500
+# Model — unified for both MAP and REDUCE phases.
+# llama-3.3-70b-versatile has 12K TPM on the free tier (the highest).
+_MODEL_PRIMARY = "llama-3.3-70b-versatile"
+# Maximum retries when a rate-limit (413 / 429) is hit.
+_RATE_LIMIT_MAX_RETRIES = 3
+_RATE_LIMIT_SLEEP_SECONDS = 60
+# ─────────────────────────────────────────────────────────────────────────────
+# PROMPT TEMPLATES — SINGLE-PASS (unchanged)
 # ─────────────────────────────────────────────────────────────────────────────
 _SUMMARY_SYSTEM = """
 - Detect the primary language of the transcript.
 - Every content field (title, summary, segments, conclusion) MUST be written entirely in that SAME detected language.
 - Do NOT mix languages. Arabic transcript -> everything in Arabic.
+- Only the "detected_language" and "suggested_category" fields are stated in English.
 TIMELINE RULES — STRICTLY ENFORCED:
 - Divide the transcript into chronological segments that follow its natural progression.
 - Topics should be specific and descriptive (e.g. "Python", "Machine Learning", "Neural Networks").
 - Do NOT use generic fixed categories.
+CATEGORY RULE:
+- Provide a single, concise category label (1-2 words max) in English.
+- This should be the most accurate high-level category for the video content.
+- Examples: "Programming", "Finance", "History", "Psychology", "Mathematics", "Cooking".
+- The suggested_category MUST always be in English regardless of the transcript language.
 CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
 DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
 {
         }
     ],
     "conclusion": "Final overall takeaway / closing conclusion",
+    "topics": ["Topic1", "Topic2", "Topic3"],
+    "suggested_category": "Programming"
 }
 OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
 """.strip()
+# ─────────────────────────────────────────────────────────────────────────────
+# PROMPT TEMPLATES — MAP PHASE
+# ─────────────────────────────────────────────────────────────────────────────
+_MAP_SYSTEM = """
+You are an expert educational content analyst.
+You will receive ONE CHUNK of a longer video transcript.
+Extract the key information from this chunk ONLY.
+LANGUAGE RULE — CRITICAL:
+- Detect the primary language of the text.
+- Write ALL content fields in that SAME detected language.
+- Only "detected_language" is stated in English.
+Return a JSON object with this EXACT structure:
+{
+    "detected_language": "English (or Arabic, etc.)",
+    "chunk_summary": "Concise summary of this chunk (3-5 sentences)",
+    "key_points": [
+        {
+            "title": "Short title for this point",
+            "detail": "1-2 sentence explanation",
+            "insight": "Key takeaway"
+        }
+    ],
+    "topics": ["Topic1", "Topic2"]
+}
+RULES:
+- Extract 2-4 key points from this chunk.
+- Topics should be specific (e.g. "Python", "Neural Networks"), not generic.
+- OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
+""".strip()
+_MAP_USER = """
+Video Title: {video_title}
+Chunk {chunk_index} of {total_chunks}:
+{chunk_text}
+Extract the key information from this chunk. Return ONLY the JSON.
+""".strip()
+# ─────────────────────────────────────────────────────────────────────────────
+# PROMPT TEMPLATES — REDUCE PHASE
+# ─────────────────────────────────────────────────────────────────────────────
+_REDUCE_SYSTEM = """
+You are an expert educational content analyst and structured note-taking specialist.
+You will receive INTERMEDIATE SUMMARIES from multiple chunks of a single video transcript.
+Your job is to MERGE them into ONE final, cohesive, structured summary.
+LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
+- Use the detected language from the intermediate summaries.
+- Every content field MUST be in that SAME language.
+- Only "detected_language" and "suggested_category" are stated in English.
+TIMELINE RULES — STRICTLY ENFORCED:
+- Merge the chunk summaries into 3-7 chronological segments.
+- Each segment MUST cover a distinct phase or theme; do NOT repeat topics.
+- Segments must follow the natural progression of the video.
+- Each segment must include: title, summary, key_insight, why_it_matters.
+CATEGORY RULE:
+- Provide a single, concise category label (1-2 words max) in English.
+- This should be the most accurate high-level category for the video content.
+- Examples: "Programming", "Finance", "History", "Psychology", "Mathematics", "Cooking".
+- The suggested_category MUST always be in English regardless of the transcript language.
+CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
+{
+    "title": "Inferred video title in transcript language",
+    "detected_language": "English (or Arabic, etc.)",
+    "summary": "Concise overall summary (3-5 sentences)",
+    "segments": [
+        {
+            "title": "Segment title",
+            "summary": "What this section covers (2-3 sentences)",
+            "key_insight": "Most important point from this section",
+            "why_it_matters": "Why this is valuable (1-2 sentences)"
+        }
+    ],
+    "conclusion": "Final overall takeaway / closing conclusion",
+    "topics": ["Topic1", "Topic2", "Topic3"],
+    "suggested_category": "Programming"
+}
+OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
+""".strip()
+_REDUCE_USER = """
+Video Title: {video_title}
+The following are intermediate summaries extracted from {total_chunks} consecutive chunks
+of the video transcript. Merge them into ONE cohesive final summary.
+{merged_summaries}
+Merge into 3-7 chronological segments. Return ONLY the final JSON structure.
+""".strip()
 # ─────────────────────────────────────────────────────────────────────────────
 # LANGUAGE LABELS (simplified)
 # ─────────────────────────────────────────────────────────────────────────────
     return _LABELS.get(language, _LABELS["English"])
+# ─────────────────────────────────────────────────────────────────────────────
+# TOKEN UTILITIES
+# ─────────────────────────────────────────────────────────────────────────────
+def _estimate_tokens(text: str) -> int:
+    """
+    Lightweight token estimation using a word-count heuristic.
+    Production logs show that Groq's tokenizer produces ~2.5 tokens per
+    whitespace-delimited word for Arabic / mixed-script transcripts.
+    Using 2.5× as a conservative multiplier to avoid underestimation.
+    """
+    word_count = len(text.split())
+    return int(word_count * 2.5)
def _split_into_chunks(text: str, target_tokens: int = _CHUNK_TARGET_TOKENS) -> List[str]:
    """
    Break ``text`` into chunks of roughly ``target_tokens`` estimated tokens.

    The text is first cut into sentences: after ``.``, ``!`` or ``?``
    followed by whitespace, and at newlines. Sentences are packed greedily
    into chunks joined by single spaces. A sentence that alone exceeds the
    target is emitted as its own run of word-level chunks.
    """
    pieces = [p.strip() for p in re.split(r'(?<=[.!?])\s+|\n+', text)]
    pieces = [p for p in pieces if p]

    result: List[str] = []
    pending: List[str] = []
    pending_tokens = 0

    for piece in pieces:
        cost = _estimate_tokens(piece)

        if cost <= target_tokens:
            # Normal sentence: start a new chunk when this one would overflow.
            if pending and pending_tokens + cost > target_tokens:
                result.append(" ".join(pending))
                pending, pending_tokens = [], 0
            pending.append(piece)
            pending_tokens += cost
            continue

        # Oversized sentence: close the open chunk, then split by words.
        if pending:
            result.append(" ".join(pending))
            pending, pending_tokens = [], 0
        run: List[str] = []
        run_tokens = 0
        for token in piece.split():
            weight = _estimate_tokens(token)
            if run and run_tokens + weight > target_tokens:
                result.append(" ".join(run))
                run, run_tokens = [], 0
            run.append(token)
            run_tokens += weight
        if run:
            result.append(" ".join(run))

    # Emit whatever is still buffered.
    if pending:
        result.append(" ".join(pending))
    return result
 # ─────────────────────────────────────────────────────────────────────────────
 # NOTE GENERATOR
 # ─────────────────────────────────────────────────────────────────────────────
 class NoteGenerator:
+    """
+    Generates structured study notes using Groq.
+    Automatically selects between:
+    - **Single-pass**: for short transcripts (< 8K tokens)
+    - **Map-Reduce**: for long transcripts (≥ 8K tokens), splitting into
+      chunks, summarizing each individually, then merging in a REDUCE pass.
+    Uses a single model (llama-3.3-70b-versatile) for all phases and
+    includes adaptive rate-limit retry (60s backoff on 413/429).
+    """
     def __init__(self):
         self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
         self.client = Groq(api_key=self.api_key) if self.api_key else None
+        self.model = _MODEL_PRIMARY
+        self.chunk_delay = float(
+            os.environ.get("GROQ_CHUNK_DELAY_SECONDS", "3")
+        )
+        logger.info(
+            "🚀 NoteGenerator v5.1 initialized — model: %s, delay: %.1fs",
+            self.model, self.chunk_delay,
+        )
+    # ── Low-level API call ──────────────────────────────────────────────
+    def _chat(
+        self,
+        system: str,
+        user: str,
+        max_tokens: int = 4096,
+    ) -> Optional[str]:
+        """Send a chat completion request to Groq."""
         try:
             response = self.client.chat.completions.create(
+                model=self.model,
                 max_tokens=max_tokens,
                 temperature=0.3,
                 response_format={"type": "json_object"},
             )
             return response.choices[0].message.content
         except Exception as e:
+            logger.error("❌ Groq API call failed (model=%s): %s", self.model, e)
             return None
+    # ── Error fallback ──────────────────────────────────────────────────
     def _get_error_json(self, error_msg: str) -> Dict:
         return {
             "title": "Error in Generation",
             "segments": [],
             "conclusion": "",
             "topics": [],
+            "suggested_category": "",
         }
+    # ── Single-pass summarization (short transcripts) ───────────────────
+    def _single_pass(self, transcript_text: str, video_title: str) -> Dict:
+        """Process the entire transcript in one API call."""
+        logger.info("📝 Single-pass summarization via %s", self.model)
         user_prompt = _SUMMARY_USER.format(
             video_title=video_title,
+            transcript=transcript_text,
         )
         raw = self._chat(_SUMMARY_SYSTEM, user_prompt, max_tokens=4096)
         if raw is None:
+            return self._get_error_json("Groq API call failed (single-pass).")
+        return self._parse_and_validate(raw)
+    # ── Map-Reduce summarization (long transcripts) ─────────────────────
+    def _map_reduce(self, transcript_text: str, video_title: str) -> Dict:
+        """
+        Split transcript into chunks, summarize each (MAP), then merge (REDUCE).
+        """
+        chunks = _split_into_chunks(transcript_text)
+        total = len(chunks)
+        logger.info(
+            "🗺️  Map-Reduce activated: %d chunks (delay=%.1fs between calls)",
+            total, self.chunk_delay,
+        )
+        # ── MAP PHASE ───────────────────────────────────────────────────
+        intermediate_results: List[Dict] = []
+        for i, chunk in enumerate(chunks, start=1):
+            chunk_tokens = _estimate_tokens(chunk)
+            logger.info(
+                "  📦 MAP chunk %d/%d (~%d est. tokens)...", i, total, chunk_tokens,
+            )
+            user_prompt = _MAP_USER.format(
+                video_title=video_title,
+                chunk_index=i,
+                total_chunks=total,
+                chunk_text=chunk,
+            )
+            # Retry loop with adaptive backoff on rate-limit errors
+            raw = None
+            for attempt in range(1, _RATE_LIMIT_MAX_RETRIES + 1):
+                raw = self._chat(
+                    _MAP_SYSTEM, user_prompt,
+                    max_tokens=2048,
+                )
+                if raw is not None:
+                    break  # success
+                # _chat() returns None on any exception. Check if it was a
+                # rate-limit error (413 / 429) by inspecting the last
+                # exception.  We re-try with a 60s sleep.
+                logger.warning(
+                    "  ⚠️ MAP chunk %d/%d attempt %d/%d failed. "
+                    "Sleeping %ds for TPM window reset...",
+                    i, total, attempt, _RATE_LIMIT_MAX_RETRIES,
+                    _RATE_LIMIT_SLEEP_SECONDS,
+                )
+                time.sleep(_RATE_LIMIT_SLEEP_SECONDS)
+            if raw:
+                try:
+                    parsed = json.loads(raw)
+                    intermediate_results.append(parsed)
+                    logger.info("  ✅ MAP chunk %d/%d done.", i, total)
+                except json.JSONDecodeError as e:
+                    logger.warning(
+                        "  ⚠️ MAP chunk %d/%d returned invalid JSON: %s", i, total, e,
+                    )
+            else:
+                logger.error(
+                    "  ❌ MAP chunk %d/%d failed after %d retries. Skipping.",
+                    i, total, _RATE_LIMIT_MAX_RETRIES,
+                )
+            # Respect TPM limits — delay between consecutive API calls
+            if i < total and self.chunk_delay > 0:
+                logger.info("  ⏳ Sleeping %.1fs (TPM cooldown)...", self.chunk_delay)
+                time.sleep(self.chunk_delay)
+        if not intermediate_results:
+            return self._get_error_json(
+                "Map-Reduce failed: no chunks were successfully summarized."
+            )
+        # ── REDUCE PHASE ────────────────────────────────────────────────
+        logger.info("🔗 REDUCE phase: merging %d intermediate summaries...", len(intermediate_results))
+        # Build a readable merged text for the reduce prompt
+        merged_parts: List[str] = []
+        all_topics: List[str] = []
+        detected_lang = "English"
+        for idx, result in enumerate(intermediate_results, start=1):
+            detected_lang = result.get("detected_language", detected_lang)
+            chunk_summary = result.get("chunk_summary", "")
+            key_points = result.get("key_points", [])
+            topics = result.get("topics", [])
+            all_topics.extend(topics)
+            part = f"--- Chunk {idx} ---\n"
+            part += f"Summary: {chunk_summary}\n"
+            for kp in key_points:
+                if isinstance(kp, dict):
+                    part += f"- {kp.get('title', '')}: {kp.get('detail', '')} "
+                    part += f"(Insight: {kp.get('insight', '')})\n"
+            part += f"Topics: {', '.join(topics)}\n"
+            merged_parts.append(part)
+        merged_text = "\n".join(merged_parts)
+        # Check if the merged text itself is within single-pass limits
+        reduce_tokens = _estimate_tokens(merged_text)
+        logger.info("🔗 REDUCE input: ~%d tokens", reduce_tokens)
+        user_prompt = _REDUCE_USER.format(
+            video_title=video_title,
+            total_chunks=len(intermediate_results),
+            merged_summaries=merged_text,
+        )
+        # Sleep before REDUCE to ensure TPM cooldown from last MAP call
+        if self.chunk_delay > 0:
+            logger.info("  ⏳ Sleeping %.1fs before REDUCE call...", self.chunk_delay)
+            time.sleep(self.chunk_delay)
+        # REDUCE with retry on rate-limit
+        raw = None
+        for attempt in range(1, _RATE_LIMIT_MAX_RETRIES + 1):
+            raw = self._chat(_REDUCE_SYSTEM, user_prompt, max_tokens=4096)
+            if raw is not None:
+                break
+            logger.warning(
+                "  ⚠️ REDUCE attempt %d/%d failed. Sleeping %ds...",
+                attempt, _RATE_LIMIT_MAX_RETRIES, _RATE_LIMIT_SLEEP_SECONDS,
+            )
+            time.sleep(_RATE_LIMIT_SLEEP_SECONDS)
+        if raw is None:
+            return self._get_error_json("Groq API call failed (REDUCE phase after retries).")
+        return self._parse_and_validate(raw)
+    # ── JSON parsing + schema validation ────────────────────────────────
+    def _parse_and_validate(self, raw_json: str) -> Dict:
+        """Parse raw JSON string and validate against SummarySchema."""
         try:
+            data = json.loads(raw_json)
             validated = SummarySchema(**data)
             return validated.model_dump()
         except (json.JSONDecodeError, ValidationError) as e:
+            logger.error("❌ Schema validation failed: %s", e)
             return self._get_error_json(f"Validation Error: {str(e)}")
+    # ── Public API (unchanged signature) ────────────────────────────────
+    def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
+        """
+        Generate structured JSON summary from transcript.
+        Automatically selects single-pass or Map-Reduce based on estimated
+        token count. The return type is always a Dict matching SummarySchema.
+        """
+        if not self.client:
+            return self._get_error_json("Groq API Key missing.")
+        # Estimate total tokens for the full prompt
+        full_prompt = _SUMMARY_USER.format(
+            video_title=video_title,
+            transcript=transcript_text,
+        )
+        total_tokens = _estimate_tokens(_SUMMARY_SYSTEM + full_prompt)
+        logger.info(
+            "📊 Token estimate: ~%d tokens (threshold: %d)",
+            total_tokens, _SINGLE_PASS_TOKEN_LIMIT,
+        )
+        if total_tokens < _SINGLE_PASS_TOKEN_LIMIT:
+            return self._single_pass(transcript_text, video_title)
+        else:
+            logger.info(
+                "⚡ Transcript too large for single-pass (%d ≥ %d). "
+                "Activating Map-Reduce pipeline...",
+                total_tokens, _SINGLE_PASS_TOKEN_LIMIT,
+            )
+            return self._map_reduce(transcript_text, video_title)
+    # ── Markdown formatting (unchanged) ─────────────────────────────────
     def format_notes_to_markdown(self, json_notes: Dict) -> str:
         """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
         lang = json_notes.get("detected_language", "English")

src/summarization/schemas.py CHANGED Viewed

@@ -81,4 +81,13 @@ class SummarySchema(BaseModel):
             "Dynamically extracted topics discussed in the video."
             " Examples: ['Python', 'Machine Learning', 'Neural Networks']."
         ),
     )

             "Dynamically extracted topics discussed in the video."
             " Examples: ['Python', 'Machine Learning', 'Neural Networks']."
         ),
+    )
+    suggested_category: str = Field(
+        ...,
+        description=(
+            "A single, concise category label (1-2 words max) that best"
+            " describes the video content. Must always be in English."
+            " Examples: 'Programming', 'Finance', 'History', 'Psychology'."
+        ),
     )

src/transcription/downloader.py ADDED Viewed

	@@ -0,0 +1,265 @@

+import json
+import logging
+import os
+import re
+import tempfile
+import urllib.error
+import urllib.parse
+import urllib.request
+
+from groq import Groq
+from pydub import AudioSegment
+logger = logging.getLogger(__name__)
+# Groq Whisper free-tier file size limit (bytes)
+_WHISPER_MAX_BYTES = 24 * 1024 * 1024  # 24 MB (safe margin under 25 MB)
+_WHISPER_MODEL = "whisper-large-v3-turbo"
+# ─────────────────────────────────────────────────────────────────────────────
+# Custom Exceptions
+# ─────────────────────────────────────────────────────────────────────────────
+class NoTranscriptError(RuntimeError):
+    """Raised when a video has no subtitles / captions available."""
+    pass
+# ─────────────────────────────────────────────────────────────────────────────
+# YouTubeDownloader
+# ─────────────────────────────────────────────────────────────────────────────
+class YouTubeDownloader:
+    """Extracts YouTube transcripts via Supadata or Deep Scan (Groq Whisper)."""
+    def __init__(self):
+        self._supadata_key = os.environ.get("SUPADATA_API_KEY", "").strip()
+        self._groq_key = os.environ.get("GROQ_API_KEY", "").strip()
+    # ── Primary path: Supadata transcript ─────────────────────────────
+    def get_transcript(self, url: str) -> str:
+        """
+        Fetch the full transcript for a YouTube video via Supadata.
+        Raises
+        ------
+        NoTranscriptError
+            If the video has no subtitles (Supadata returns empty content).
+        RuntimeError
+            If the API key is missing, request fails, or response is invalid.
+        """
+        video_id = self._extract_video_id(url)
+        logger.info("🔍 Fetching transcript for video ID: %s", video_id)
+        if not self._supadata_key:
+            raise RuntimeError(
+                "SUPADATA_API_KEY is not set. "
+                "Cannot fetch transcript without a valid API key."
+            )
+        clean_url = f"https://www.youtube.com/watch?v={video_id}"
+        headers = {
+            "x-api-key": self._supadata_key,
+            "User-Agent": (
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/124.0.0.0 Safari/537.36"
+            ),
+        }
+        api_url = (
+            f"https://api.supadata.ai/v1/youtube/transcript"
+            f"?url={clean_url}&text=true"
+        )
+        try:
+            req = urllib.request.Request(api_url, headers=headers)
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                data = json.loads(resp.read())
+                text = data.get("content", "").strip()
+                if text:
+                    logger.info(
+                        "✅ Supadata transcript fetched (%d chars)", len(text)
+                    )
+                    return text
+                # Video exists but has no subtitles
+                raise NoTranscriptError(
+                    f"No subtitles found for video {video_id}. "
+                    "Deep scan required to extract audio."
+                )
+        except NoTranscriptError:
+            raise  # re-raise without wrapping
+        except urllib.error.HTTPError as e:
+            logger.error("❌ Supadata HTTP error %d: %s", e.code, e.reason)
+            raise RuntimeError(
+                f"Supadata API returned HTTP {e.code} ({e.reason}) "
+                f"for video {video_id}."
+            ) from e
+        except urllib.error.URLError as e:
+            logger.error("❌ Supadata connection error: %s", e.reason)
+            raise RuntimeError(
+                f"Could not reach Supadata API: {e.reason}"
+            ) from e
+        except json.JSONDecodeError as e:
+            logger.error("❌ Supadata returned invalid JSON: %s", e)
+            raise RuntimeError(
+                "Supadata API returned a non-JSON response."
+            ) from e
+    # ── Deep Scan path: pytubefix + Groq Whisper ──────────────────────
+    def deep_scan_transcript(self, url: str) -> str:
+        """
+        Download the video's audio and transcribe it via Groq Whisper.
+        Uses pytubefix to download audio, pydub to chunk large files,
+        and Groq Whisper API for speech-to-text.
+        Raises
+        ------
+        RuntimeError
+            If download or transcription fails.
+        """
+        video_id = self._extract_video_id(url)
+        logger.info("🎙️ Deep Scan started for video ID: %s", video_id)
+        if not self._groq_key:
+            raise RuntimeError(
+                "GROQ_API_KEY is not set. Cannot perform deep scan."
+            )
+        groq_client = Groq(api_key=self._groq_key)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Step 1: Download audio via pytubefix
+            audio_path = self._download_audio(url, tmpdir)
+            file_size = os.path.getsize(audio_path)
+            logger.info(
+                "📥 Audio downloaded: %s (%.1f MB)",
+                audio_path, file_size / (1024 * 1024),
+            )
+            # Step 2: Chunk if needed, then transcribe
+            if file_size <= _WHISPER_MAX_BYTES:
+                transcript = self._transcribe_file(groq_client, audio_path)
+            else:
+                transcript = self._transcribe_chunked(
+                    groq_client, audio_path, tmpdir
+                )
+            if not transcript.strip():
+                raise RuntimeError(
+                    f"Deep scan produced an empty transcript for {video_id}."
+                )
+            logger.info(
+                "✅ Deep Scan complete (%d chars)", len(transcript)
+            )
+            return transcript
+    def _download_audio(self, url: str, output_dir: str) -> str:
+        """Download audio-only stream via pytubefix."""
+        try:
+            from pytubefix import YouTube
+            clean_url = f"https://www.youtube.com/watch?v={self._extract_video_id(url)}"
+            yt = YouTube(clean_url)
+            stream = yt.streams.get_audio_only()
+            if stream is None:
+                raise RuntimeError("No audio stream available for this video.")
+            logger.info("⬇️ Downloading audio stream: %s", stream)
+            output_path = stream.download(output_path=output_dir)
+            return output_path
+        except Exception as e:
+            logger.error("❌ Audio download failed: %s", e)
+            raise RuntimeError(
+                f"Failed to download audio: {e}"
+            ) from e
+    def _transcribe_file(self, client: Groq, file_path: str) -> str:
+        """Transcribe a single audio file via Groq Whisper."""
+        logger.info("🎤 Transcribing file: %s", os.path.basename(file_path))
+        try:
+            with open(file_path, "rb") as f:
+                result = client.audio.transcriptions.create(
+                    file=(os.path.basename(file_path), f.read()),
+                    model=_WHISPER_MODEL,
+                    response_format="text",
+                    temperature=0.0,
+                )
+            return result if isinstance(result, str) else str(result)
+        except Exception as e:
+            logger.error("❌ Whisper transcription failed: %s", e)
+            raise RuntimeError(
+                f"Groq Whisper transcription failed: {e}"
+            ) from e
+    def _transcribe_chunked(
+        self, client: Groq, file_path: str, tmpdir: str
+    ) -> str:
+        """
+        Split a large audio file into chunks under 24 MB, transcribe each,
+        and concatenate the results.
+        """
+        logger.info("✂️ Audio file too large — splitting into chunks...")
+        # Load audio with pydub
+        audio = AudioSegment.from_file(file_path)
+        total_ms = len(audio)
+        file_size = os.path.getsize(file_path)
+        # Calculate chunk duration to stay under the size limit
+        # Ratio: (target bytes / total bytes) * total duration
+        ratio = _WHISPER_MAX_BYTES / file_size
+        chunk_duration_ms = int(total_ms * ratio * 0.9)  # 10% safety margin
+        chunk_duration_ms = max(chunk_duration_ms, 60_000)  # min 1 minute
+        chunks_text = []
+        chunk_index = 0
+        offset = 0
+        while offset < total_ms:
+            chunk_end = min(offset + chunk_duration_ms, total_ms)
+            chunk = audio[offset:chunk_end]
+            chunk_index += 1
+            chunk_path = os.path.join(tmpdir, f"chunk_{chunk_index}.mp3")
+            chunk.export(chunk_path, format="mp3", bitrate="64k")
+            chunk_size = os.path.getsize(chunk_path)
+            logger.info(
+                "  📦 Chunk %d: %d-%ds (%.1f MB)",
+                chunk_index,
+                offset // 1000,
+                chunk_end // 1000,
+                chunk_size / (1024 * 1024),
+            )
+            text = self._transcribe_file(client, chunk_path)
+            chunks_text.append(text)
+            offset = chunk_end
+        logger.info(
+            "✅ Transcribed %d chunks, total %d chars",
+            len(chunks_text),
+            sum(len(t) for t in chunks_text),
+        )
+        return " ".join(chunks_text)
+    # ── Helpers ───────────────────────────────────────────────────���───
+    def _extract_video_id(self, url: str) -> str:
+        """Extract the 11-character video ID from any YouTube URL format."""
+        match = re.search(
+            r"(?:v=|youtu\.be/|shorts/|embed/)([A-Za-z0-9_-]{11})", str(url)
+        )
+        return match.group(1) if match else "unknown"
+    def cleanup(self, path=None):
+        pass