Spaces:

ATInc1
/

AIdea-Server

Running

App Files Community

amanyelfiky committed about 19 hours ago

Commit

c00d17d

1 Parent(s): 3d4b8b5

Mt5

Browse files

Files changed (6) hide show

pyproject.toml +3 -1
requirements.txt +49 -0
src/summarization/README.md +13 -3
src/summarization/__pycache__/note_generator.cpython-312.pyc +0 -0
src/summarization/note_generator.py +23 -355
src/summarization/segmenter.py +33 -0

pyproject.toml CHANGED Viewed

@@ -11,13 +11,13 @@ dependencies = [
     "email-validator>=2.3.0",
     "fastapi==0.109.0",
     "google-api-python-client==2.115.0",
-    "groq>=0.9.0",
     "google-genai>=1.2.0",
     "google-generativeai==0.3.2",
     "greenlet==3.3.1",
     "httpx==0.26.0",
     "langchain==0.1.0",
     "langchain-google-genai==0.0.5",
     "openai-whisper==20250625",
     "passlib[bcrypt]==1.7.4",
     "pydantic-core==2.41.5",
@@ -27,9 +27,11 @@ dependencies = [
     "python-dotenv==1.0.0",
     "python-jose[cryptography]==3.3.0",
     "python-multipart==0.0.6",
     "sqlmodel==0.0.14",
     "torch>=2.10.0",
     "torchaudio>=2.10.0",
     "uvicorn[standard]==0.27.0",
 ]

     "email-validator>=2.3.0",
     "fastapi==0.109.0",
     "google-api-python-client==2.115.0",
     "google-genai>=1.2.0",
     "google-generativeai==0.3.2",
     "greenlet==3.3.1",
     "httpx==0.26.0",
     "langchain==0.1.0",
     "langchain-google-genai==0.0.5",
+    "langdetect>=1.0.9",
     "openai-whisper==20250625",
     "passlib[bcrypt]==1.7.4",
     "pydantic-core==2.41.5",
     "python-dotenv==1.0.0",
     "python-jose[cryptography]==3.3.0",
     "python-multipart==0.0.6",
+    "sentencepiece>=0.2.0",
     "sqlmodel==0.0.14",
     "torch>=2.10.0",
     "torchaudio>=2.10.0",
+    "transformers>=4.40.0",
     "uvicorn[standard]==0.27.0",
 ]

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 # --- YouTube Transcription Pipeline (The Waterfall Strategy) ---
 assemblyai>=0.30.0
 yt-dlp>=2025.05.22
@@ -48,3 +49,51 @@ pytubefix
 # --- ML & Recommendations ---
 # keybert
 # sentence-transformers

+<<<<<<< HEAD
 # --- YouTube Transcription Pipeline (The Waterfall Strategy) ---
 assemblyai>=0.30.0
 yt-dlp>=2025.05.22
 # --- ML & Recommendations ---
 # keybert
 # sentence-transformers
+=======
+# --- YouTube Transcription Pipeline (The Waterfall Strategy) ---
+assemblyai>=0.30.0
+yt-dlp>=2025.05.22
+bgutil-ytdlp-pot-provider==1.3.1
+youtube-transcript-api==0.6.2
+# --- AI, LLMs & Transcription Fallback ---
+openai-whisper==20250625
+torch
+torchaudio
+transformers>=4.40.0
+sentencepiece>=0.2.0
+langdetect>=1.0.9
+google-genai
+google-generativeai==0.3.2
+langchain==0.1.0
+langchain-google-genai==0.0.5
+# --- Backend Infrastructure (FastAPI) ---
+fastapi==0.109.0
+uvicorn[standard]==0.27.0
+pydantic==2.12.5
+pydantic[email]
+email-validator
+pydantic-settings==2.1.0
+python-multipart==0.0.6
+python-dotenv==1.0.0
+httpx==0.26.0
+aiofiles==23.2.1
+# --- Database, Auth & Security ---
+sqlmodel==0.0.14
+asyncpg==0.31.0
+greenlet==3.3.1
+passlib[bcrypt]==1.7.4
+python-jose[cryptography]==3.3.0
+bcrypt==4.1.2
+pydantic-core==2.41.5
+# --- Integration & Utilities ---
+google-api-python-client==2.115.0
+firebase-admin==6.5.0
+dnspython
+pydub==0.25.1
+ffmpeg-python
+>>>>>>> b5ab912 (Mt5)

src/summarization/README.md CHANGED Viewed

@@ -5,7 +5,7 @@ This module handles **text summarization and conversion to structured study note
 ## Functionality
 1. Receive transcribed text from videos.
-2. Use **Groq (Llama-3.3-70b-versatile)** to analyze text and generate structured JSON notes.
 3. Produce clean Markdown output with:
    - Source & Duration header
    - Overall Summary
@@ -21,7 +21,7 @@ This module handles **text summarization and conversion to structured study note
   - `SegmentSchema` — A timeline section (title, summary, key_insight, why_it_matters).
 ### 2. `note_generator.py`
-- **Purpose:** Generate notes using Groq AI with strict JSON enforcement.
 - **Main Class:** `NoteGenerator`
 - **Key Methods:**
   - `generateSummary(transcript, title)` — Generates structured JSON study notes.
@@ -32,7 +32,9 @@ This module handles **text summarization and conversion to structured study note
 - **Purpose:** Split long texts into smaller segments for preprocessing.
 - **Main Class:** `TranscriptSegmenter`
 - **Key Methods:**
   - `segment_by_time()` — Split by time intervals.
   - `clean_text()` — Remove filler words.
 ## JSON Output Structure
@@ -90,5 +92,13 @@ print(notes_md)
 ```
 ## Libraries Used
-- `groq` — Communicate with Groq API (Llama-3.3-70b-versatile).
 - `pydantic` — Data validation and schema enforcement.

 ## Functionality
 1. Receive transcribed text from videos.
+2. Use a **local mT5 model** (map-reduce pipeline) to analyze text and generate structured JSON notes.
 3. Produce clean Markdown output with:
    - Source & Duration header
    - Overall Summary
   - `SegmentSchema` — A timeline section (title, summary, key_insight, why_it_matters).
 ### 2. `note_generator.py`
+- **Purpose:** Generate notes using a local mT5 model with chunk-based map-reduce and schema validation.
 - **Main Class:** `NoteGenerator`
 - **Key Methods:**
   - `generateSummary(transcript, title)` — Generates structured JSON study notes.
 - **Purpose:** Split long texts into smaller segments for preprocessing.
 - **Main Class:** `TranscriptSegmenter`
 - **Key Methods:**
+  - `segment_text_by_words()` — Split text into fixed-size word chunks (used by the mT5 pipeline).
   - `segment_by_time()` — Split by time intervals.
+  - `segment_by_topic()` — Split by paragraph/topic boundaries.
   - `clean_text()` — Remove filler words.
 ## JSON Output Structure
 ```
 ## Libraries Used
+- `transformers` — Load and run the local mT5 model (HuggingFace).
+- `sentencepiece` — Tokenizer backend required by mT5.
+- `langdetect` — Automatic language detection for multilingual support.
+- `torch` — PyTorch runtime for model inference.
 - `pydantic` — Data validation and schema enforcement.
+## Environment Variables
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `MT5_MODEL_NAME` | `google/mt5-small` | HuggingFace model ID to load |

src/summarization/__pycache__/note_generator.cpython-312.pyc DELETED Viewed

Binary file (11.2 kB)

src/summarization/note_generator.py CHANGED Viewed

@@ -1,41 +1,22 @@
 import json
 import os
-import re
-import time
-from typing import Dict, List, Optional
-from groq import Groq
 from pydantic import ValidationError
 from ..utils.logger import setup_logger
 from .schemas import SummarySchema
 logger = setup_logger(__name__)
 # ─────────────────────────────────────────────────────────────────────────────
-# CONFIGURATION
-# ─────────────────────────────────────────────────────────────────────────────
-# Token threshold: below this, a single API call is used.
-_SINGLE_PASS_TOKEN_LIMIT = 8_000
-# Target chunk size for MAP phase (tokens).  Kept small so that
-# prompt + chunk + response stays well under the 12K TPM free-tier limit.
-_CHUNK_TARGET_TOKENS = 2_500
-# Model — unified for both MAP and REDUCE phases.
-# llama-3.3-70b-versatile has 12K TPM on the free tier (the highest).
-_MODEL_PRIMARY = "llama-3.3-70b-versatile"
-# Maximum retries when a rate-limit (413 / 429) is hit.
-_RATE_LIMIT_MAX_RETRIES = 3
-_RATE_LIMIT_SLEEP_SECONDS = 60
-# ─────────────────────────────────────────────────────────────────────────────
-# PROMPT TEMPLATES — SINGLE-PASS (unchanged)
 # ─────────────────────────────────────────────────────────────────────────────
 _SUMMARY_SYSTEM = """
@@ -46,7 +27,7 @@ LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
 - Detect the primary language of the transcript.
 - Every content field (title, summary, segments, conclusion) MUST be written entirely in that SAME detected language.
 - Do NOT mix languages. Arabic transcript -> everything in Arabic.
-- Only the "detected_language" and "suggested_category" fields are stated in English.
 TIMELINE RULES — STRICTLY ENFORCED:
 - Divide the transcript into chronological segments that follow its natural progression.
@@ -64,12 +45,6 @@ TOPICS RULE:
 - Topics should be specific and descriptive (e.g. "Python", "Machine Learning", "Neural Networks").
 - Do NOT use generic fixed categories.
-CATEGORY RULE:
-- Provide a single, concise category label (1-2 words max) in English.
-- This should be the most accurate high-level category for the video content.
-- Examples: "Programming", "Finance", "History", "Psychology", "Mathematics", "Cooking".
-- The suggested_category MUST always be in English regardless of the transcript language.
 CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
 DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
 {
@@ -85,8 +60,7 @@ DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
         }
     ],
     "conclusion": "Final overall takeaway / closing conclusion",
-    "topics": ["Topic1", "Topic2", "Topic3"],
-    "suggested_category": "Programming"
 }
 OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
@@ -105,109 +79,6 @@ Return ONLY the exact JSON structure requested.
 """.strip()
-# ─────────────────────────────────────────────────────────────────────────────
-# PROMPT TEMPLATES — MAP PHASE
-# ─────────────────────────────────────────────────────────────────────────────
-_MAP_SYSTEM = """
-You are an expert educational content analyst.
-You will receive ONE CHUNK of a longer video transcript.
-Extract the key information from this chunk ONLY.
-LANGUAGE RULE — CRITICAL:
-- Detect the primary language of the text.
-- Write ALL content fields in that SAME detected language.
-- Only "detected_language" is stated in English.
-Return a JSON object with this EXACT structure:
-{
-    "detected_language": "English (or Arabic, etc.)",
-    "chunk_summary": "Concise summary of this chunk (3-5 sentences)",
-    "key_points": [
-        {
-            "title": "Short title for this point",
-            "detail": "1-2 sentence explanation",
-            "insight": "Key takeaway"
-        }
-    ],
-    "topics": ["Topic1", "Topic2"]
-}
-RULES:
-- Extract 2-4 key points from this chunk.
-- Topics should be specific (e.g. "Python", "Neural Networks"), not generic.
-- OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
-""".strip()
-_MAP_USER = """
-Video Title: {video_title}
-Chunk {chunk_index} of {total_chunks}:
-{chunk_text}
-Extract the key information from this chunk. Return ONLY the JSON.
-""".strip()
-# ─────────────────────────────────────────────────────────────────────────────
-# PROMPT TEMPLATES — REDUCE PHASE
-# ─────────────────────────────────────────────────────────────────────────────
-_REDUCE_SYSTEM = """
-You are an expert educational content analyst and structured note-taking specialist.
-You will receive INTERMEDIATE SUMMARIES from multiple chunks of a single video transcript.
-Your job is to MERGE them into ONE final, cohesive, structured summary.
-LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
-- Use the detected language from the intermediate summaries.
-- Every content field MUST be in that SAME language.
-- Only "detected_language" and "suggested_category" are stated in English.
-TIMELINE RULES — STRICTLY ENFORCED:
-- Merge the chunk summaries into 3-7 chronological segments.
-- Each segment MUST cover a distinct phase or theme; do NOT repeat topics.
-- Segments must follow the natural progression of the video.
-- Each segment must include: title, summary, key_insight, why_it_matters.
-CATEGORY RULE:
-- Provide a single, concise category label (1-2 words max) in English.
-- This should be the most accurate high-level category for the video content.
-- Examples: "Programming", "Finance", "History", "Psychology", "Mathematics", "Cooking".
-- The suggested_category MUST always be in English regardless of the transcript language.
-CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
-{
-    "title": "Inferred video title in transcript language",
-    "detected_language": "English (or Arabic, etc.)",
-    "summary": "Concise overall summary (3-5 sentences)",
-    "segments": [
-        {
-            "title": "Segment title",
-            "summary": "What this section covers (2-3 sentences)",
-            "key_insight": "Most important point from this section",
-            "why_it_matters": "Why this is valuable (1-2 sentences)"
-        }
-    ],
-    "conclusion": "Final overall takeaway / closing conclusion",
-    "topics": ["Topic1", "Topic2", "Topic3"],
-    "suggested_category": "Programming"
-}
-OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
-""".strip()
-_REDUCE_USER = """
-Video Title: {video_title}
-The following are intermediate summaries extracted from {total_chunks} consecutive chunks
-of the video transcript. Merge them into ONE cohesive final summary.
-{merged_summaries}
-Merge into 3-7 chronological segments. Return ONLY the final JSON structure.
-""".strip()
 # ─────────────────────────────────────────────────────────────────────────────
 # LANGUAGE LABELS (simplified)
 # ─────────────────────────────────────────────────────────────────────────────
@@ -316,42 +187,18 @@ def _split_into_chunks(text: str, target_tokens: int = _CHUNK_TARGET_TOKENS) ->
 # ─────────────────────────────────────────────────────────────────────────────
 class NoteGenerator:
-    """
-    Generates structured study notes using Groq.
-    Automatically selects between:
-    - **Single-pass**: for short transcripts (< 8K tokens)
-    - **Map-Reduce**: for long transcripts (≥ 8K tokens), splitting into
-      chunks, summarizing each individually, then merging in a REDUCE pass.
-    Uses a single model (llama-3.3-70b-versatile) for all phases and
-    includes adaptive rate-limit retry (60s backoff on 413/429).
-    """
     def __init__(self):
         self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
         self.client = Groq(api_key=self.api_key) if self.api_key else None
-        self.model = _MODEL_PRIMARY
-        self.chunk_delay = float(
-            os.environ.get("GROQ_CHUNK_DELAY_SECONDS", "3")
-        )
-        logger.info(
-            "🚀 NoteGenerator v5.1 initialized — model: %s, delay: %.1fs",
-            self.model, self.chunk_delay,
-        )
-    # ── Low-level API call ──────────────────────────────────────────────
-    def _chat(
-        self,
-        system: str,
-        user: str,
-        max_tokens: int = 4096,
-    ) -> Optional[str]:
-        """Send a chat completion request to Groq."""
         try:
             response = self.client.chat.completions.create(
-                model=self.model,
                 max_tokens=max_tokens,
                 temperature=0.3,
                 response_format={"type": "json_object"},
@@ -362,11 +209,9 @@ class NoteGenerator:
             )
             return response.choices[0].message.content
         except Exception as e:
-            logger.error("❌ Groq API call failed (model=%s): %s", self.model, e)
             return None
-    # ── Error fallback ──────────────────────────────────────────────────
     def _get_error_json(self, error_msg: str) -> Dict:
         return {
             "title": "Error in Generation",
@@ -375,208 +220,31 @@ class NoteGenerator:
             "segments": [],
             "conclusion": "",
             "topics": [],
-            "suggested_category": "",
         }
-    # ── Single-pass summarization (short transcripts) ───────────────────
-    def _single_pass(self, transcript_text: str, video_title: str) -> Dict:
-        """Process the entire transcript in one API call."""
-        logger.info("📝 Single-pass summarization via %s", self.model)
         user_prompt = _SUMMARY_USER.format(
             video_title=video_title,
-            transcript=transcript_text,
         )
         raw = self._chat(_SUMMARY_SYSTEM, user_prompt, max_tokens=4096)
         if raw is None:
-            return self._get_error_json("Groq API call failed (single-pass).")
-        return self._parse_and_validate(raw)
-    # ── Map-Reduce summarization (long transcripts) ─────────────────────
-    def _map_reduce(self, transcript_text: str, video_title: str) -> Dict:
-        """
-        Split transcript into chunks, summarize each (MAP), then merge (REDUCE).
-        """
-        chunks = _split_into_chunks(transcript_text)
-        total = len(chunks)
-        logger.info(
-            "🗺️  Map-Reduce activated: %d chunks (delay=%.1fs between calls)",
-            total, self.chunk_delay,
-        )
-        # ── MAP PHASE ───────────────────────────────────────────────────
-        intermediate_results: List[Dict] = []
-        for i, chunk in enumerate(chunks, start=1):
-            chunk_tokens = _estimate_tokens(chunk)
-            logger.info(
-                "  📦 MAP chunk %d/%d (~%d est. tokens)...", i, total, chunk_tokens,
-            )
-            user_prompt = _MAP_USER.format(
-                video_title=video_title,
-                chunk_index=i,
-                total_chunks=total,
-                chunk_text=chunk,
-            )
-            # Retry loop with adaptive backoff on rate-limit errors
-            raw = None
-            for attempt in range(1, _RATE_LIMIT_MAX_RETRIES + 1):
-                raw = self._chat(
-                    _MAP_SYSTEM, user_prompt,
-                    max_tokens=2048,
-                )
-                if raw is not None:
-                    break  # success
-                # _chat() returns None on any exception. Check if it was a
-                # rate-limit error (413 / 429) by inspecting the last
-                # exception.  We re-try with a 60s sleep.
-                logger.warning(
-                    "  ⚠️ MAP chunk %d/%d attempt %d/%d failed. "
-                    "Sleeping %ds for TPM window reset...",
-                    i, total, attempt, _RATE_LIMIT_MAX_RETRIES,
-                    _RATE_LIMIT_SLEEP_SECONDS,
-                )
-                time.sleep(_RATE_LIMIT_SLEEP_SECONDS)
-            if raw:
-                try:
-                    parsed = json.loads(raw)
-                    intermediate_results.append(parsed)
-                    logger.info("  ✅ MAP chunk %d/%d done.", i, total)
-                except json.JSONDecodeError as e:
-                    logger.warning(
-                        "  ⚠️ MAP chunk %d/%d returned invalid JSON: %s", i, total, e,
-                    )
-            else:
-                logger.error(
-                    "  ❌ MAP chunk %d/%d failed after %d retries. Skipping.",
-                    i, total, _RATE_LIMIT_MAX_RETRIES,
-                )
-            # Respect TPM limits — delay between consecutive API calls
-            if i < total and self.chunk_delay > 0:
-                logger.info("  ⏳ Sleeping %.1fs (TPM cooldown)...", self.chunk_delay)
-                time.sleep(self.chunk_delay)
-        if not intermediate_results:
-            return self._get_error_json(
-                "Map-Reduce failed: no chunks were successfully summarized."
-            )
-        # ── REDUCE PHASE ────────────────────────────────────────────────
-        logger.info("🔗 REDUCE phase: merging %d intermediate summaries...", len(intermediate_results))
-        # Build a readable merged text for the reduce prompt
-        merged_parts: List[str] = []
-        all_topics: List[str] = []
-        detected_lang = "English"
-        for idx, result in enumerate(intermediate_results, start=1):
-            detected_lang = result.get("detected_language", detected_lang)
-            chunk_summary = result.get("chunk_summary", "")
-            key_points = result.get("key_points", [])
-            topics = result.get("topics", [])
-            all_topics.extend(topics)
-            part = f"--- Chunk {idx} ---\n"
-            part += f"Summary: {chunk_summary}\n"
-            for kp in key_points:
-                if isinstance(kp, dict):
-                    part += f"- {kp.get('title', '')}: {kp.get('detail', '')} "
-                    part += f"(Insight: {kp.get('insight', '')})\n"
-            part += f"Topics: {', '.join(topics)}\n"
-            merged_parts.append(part)
-        merged_text = "\n".join(merged_parts)
-        # Check if the merged text itself is within single-pass limits
-        reduce_tokens = _estimate_tokens(merged_text)
-        logger.info("🔗 REDUCE input: ~%d tokens", reduce_tokens)
-        user_prompt = _REDUCE_USER.format(
-            video_title=video_title,
-            total_chunks=len(intermediate_results),
-            merged_summaries=merged_text,
-        )
-        # Sleep before REDUCE to ensure TPM cooldown from last MAP call
-        if self.chunk_delay > 0:
-            logger.info("  ⏳ Sleeping %.1fs before REDUCE call...", self.chunk_delay)
-            time.sleep(self.chunk_delay)
-        # REDUCE with retry on rate-limit
-        raw = None
-        for attempt in range(1, _RATE_LIMIT_MAX_RETRIES + 1):
-            raw = self._chat(_REDUCE_SYSTEM, user_prompt, max_tokens=4096)
-            if raw is not None:
-                break
-            logger.warning(
-                "  ⚠️ REDUCE attempt %d/%d failed. Sleeping %ds...",
-                attempt, _RATE_LIMIT_MAX_RETRIES, _RATE_LIMIT_SLEEP_SECONDS,
-            )
-            time.sleep(_RATE_LIMIT_SLEEP_SECONDS)
-        if raw is None:
-            return self._get_error_json("Groq API call failed (REDUCE phase after retries).")
-        return self._parse_and_validate(raw)
-    # ── JSON parsing + schema validation ────────────────────────────────
-    def _parse_and_validate(self, raw_json: str) -> Dict:
-        """Parse raw JSON string and validate against SummarySchema."""
         try:
-            data = json.loads(raw_json)
             validated = SummarySchema(**data)
             return validated.model_dump()
         except (json.JSONDecodeError, ValidationError) as e:
-            logger.error("❌ Schema validation failed: %s", e)
             return self._get_error_json(f"Validation Error: {str(e)}")
-    # ── Public API (unchanged signature) ────────────────────────────────
-    def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
-        """
-        Generate structured JSON summary from transcript.
-        Automatically selects single-pass or Map-Reduce based on estimated
-        token count. The return type is always a Dict matching SummarySchema.
-        """
-        if not self.client:
-            return self._get_error_json("Groq API Key missing.")
-        # Estimate total tokens for the full prompt
-        full_prompt = _SUMMARY_USER.format(
-            video_title=video_title,
-            transcript=transcript_text,
-        )
-        total_tokens = _estimate_tokens(_SUMMARY_SYSTEM + full_prompt)
-        logger.info(
-            "📊 Token estimate: ~%d tokens (threshold: %d)",
-            total_tokens, _SINGLE_PASS_TOKEN_LIMIT,
-        )
-        if total_tokens < _SINGLE_PASS_TOKEN_LIMIT:
-            return self._single_pass(transcript_text, video_title)
-        else:
-            logger.info(
-                "⚡ Transcript too large for single-pass (%d ≥ %d). "
-                "Activating Map-Reduce pipeline...",
-                total_tokens, _SINGLE_PASS_TOKEN_LIMIT,
-            )
-            return self._map_reduce(transcript_text, video_title)
-    # ── Markdown formatting (unchanged) ─────────────────────────────────
     def format_notes_to_markdown(self, json_notes: Dict) -> str:
         """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
         lang = json_notes.get("detected_language", "English")

 import json
 import os
+from typing import Dict, Optional
+import torch
+from langdetect import detect, LangDetectException
 from pydantic import ValidationError
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from ..utils.logger import setup_logger
 from .schemas import SummarySchema
+from .segmenter import TranscriptSegmenter
 logger = setup_logger(__name__)
 # ─────────────────────────────────────────────────────────────────────────────
+# PROMPT TEMPLATES
 # ─────────────────────────────────────────────────────────────────────────────
 _SUMMARY_SYSTEM = """
 - Detect the primary language of the transcript.
 - Every content field (title, summary, segments, conclusion) MUST be written entirely in that SAME detected language.
 - Do NOT mix languages. Arabic transcript -> everything in Arabic.
+- Only the "detected_language" field itself is stated in English (e.g. "Arabic").
 TIMELINE RULES — STRICTLY ENFORCED:
 - Divide the transcript into chronological segments that follow its natural progression.
 - Topics should be specific and descriptive (e.g. "Python", "Machine Learning", "Neural Networks").
 - Do NOT use generic fixed categories.
 CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
 DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
 {
         }
     ],
     "conclusion": "Final overall takeaway / closing conclusion",
+    "topics": ["Topic1", "Topic2", "Topic3"]
 }
 OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
 """.strip()
 # ─────────────────────────────────────────────────────────────────────────────
 # LANGUAGE LABELS (simplified)
 # ─────────────────────────────────────────────────────────────────────────────
 # ─────────────────────────────────────────────────────────────────────────────
 class NoteGenerator:
+    """Generates structured study notes using Groq (Llama-3.3-70b-versatile)."""
     def __init__(self):
         self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
         self.client = Groq(api_key=self.api_key) if self.api_key else None
+        self.model_id = "llama-3.3-70b-versatile"
+        logger.info(f"🚀 NoteGenerator v4.0 initialized — model: {self.model_id}")
+    def _chat(self, system: str, user: str, max_tokens: int = 4096) -> Optional[str]:
         try:
             response = self.client.chat.completions.create(
+                model=self.model_id,
                 max_tokens=max_tokens,
                 temperature=0.3,
                 response_format={"type": "json_object"},
             )
             return response.choices[0].message.content
         except Exception as e:
+            logger.error(f"❌ Groq API call failed: {e}")
             return None
     def _get_error_json(self, error_msg: str) -> Dict:
         return {
             "title": "Error in Generation",
             "segments": [],
             "conclusion": "",
             "topics": [],
         }
+    def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
+        """Generate structured JSON summary from transcript."""
+        if not self.client:
+            return self._get_error_json("Groq API Key missing.")
+        logger.info(f"📝 Summary generation started via {self.model_id}")
         user_prompt = _SUMMARY_USER.format(
             video_title=video_title,
+            transcript=transcript_text[:30000],
         )
         raw = self._chat(_SUMMARY_SYSTEM, user_prompt, max_tokens=4096)
         if raw is None:
+            return self._get_error_json("Groq API call failed.")
         try:
+            data = json.loads(raw)
             validated = SummarySchema(**data)
             return validated.model_dump()
         except (json.JSONDecodeError, ValidationError) as e:
+            logger.error(f"❌ Schema validation failed: {e}")
             return self._get_error_json(f"Validation Error: {str(e)}")
     def format_notes_to_markdown(self, json_notes: Dict) -> str:
         """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
         lang = json_notes.get("detected_language", "English")

src/summarization/segmenter.py CHANGED Viewed

@@ -149,6 +149,39 @@ class TranscriptSegmenter:
         return segments
     def segment_transcript(
         self,
         transcript_data: Dict,

         return segments
+    def segment_text_by_words(
+        self,
+        text: str,
+        chunk_size: int = 350
+    ) -> List[str]:
+        """
+        Split plain text into fixed-size word chunks.
+        This provides deterministic chunking suitable for models with
+        strict token limits (e.g. mT5's 512-token input window).
+        Args:
+            text: Full transcript text
+            chunk_size: Maximum words per chunk (default: 350)
+        Returns:
+            List of text chunks, each up to chunk_size words
+        """
+        words = text.split()
+        if not words:
+            return []
+        chunks = []
+        for i in range(0, len(words), chunk_size):
+            chunk = " ".join(words[i:i + chunk_size])
+            chunks.append(chunk)
+        logger.info(
+            f"Split {len(words)} words into {len(chunks)} chunks "
+            f"(max {chunk_size} words each)"
+        )
+        return chunks
     def segment_transcript(
         self,
         transcript_data: Dict,