amanyelfiky committed
Commit 3c66ec8 · Parent: 97312dd

Refactor summarization module and update schemas

src/api/main.py CHANGED

@@ -54,7 +54,7 @@ class TaskStatusResponse(BaseModel):
      user_id: str | None = None
      created_at: datetime | None = None
      notes: str | None = None
-     category: str | None = "Uncategorized"
+     topics: list | None = []
      keyPoints: list | None = []


@@ -221,18 +221,9 @@ async def process_video_and_save(
          tasks[task_id]["status"] = TaskStatus.GENERATING_NOTES
          note_gen = NoteGenerator()
          summary_json = note_gen.generateSummary(transcript_text, video_info["title"])
-
-         # 3. Extract Action Items
-         action_items_json = note_gen.extractActionItems(transcript_text, video_info["title"])

-         # Combine results
-         combined_json = {
-             **summary_json,
-             "action_items": action_items_json.get("action_items", [])
-         }
-
          final_notes = note_gen.format_final_notes(
-             note_gen.format_notes_to_markdown(combined_json),
+             note_gen.format_notes_to_markdown(summary_json),
              video_info["title"],
              youtube_url,
              video_info["duration"],
@@ -245,7 +236,7 @@ async def process_video_and_save(
              "video_url": youtube_url,
              "video_title": video_info["title"],
              "summary_content": final_notes,
-             "category": combined_json.get("category", "Uncategorized"),
+             "topics": summary_json.get("topics", []),
              "created_at": datetime.utcnow()
          }
          db.collection("notes").add(note_data)
@@ -253,17 +244,15 @@
              logger.warning("Firestore not initialized, note not saved to DB but generated in memory.")

          tasks[task_id]["notes"] = final_notes
-         tasks[task_id]["category"] = combined_json.get("category", "Uncategorized")
-
-         # safely extract strings from KeyConcept objects for Flutter frontend
-         key_concepts = combined_json.get("key_concepts", [])
+         tasks[task_id]["topics"] = summary_json.get("topics", [])
+
+         # Extract key insights from segments for Flutter frontend
+         segments = summary_json.get("segments", [])
          key_points_list = []
-         for kc in key_concepts:
-             if isinstance(kc, dict):
-                 key_points_list.append(f"{kc.get('term', '')}: {kc.get('definition', '')}")
-             else:
-                 key_points_list.append(str(kc))
-
+         for seg in segments:
+             if isinstance(seg, dict) and seg.get("key_insight"):
+                 key_points_list.append(seg["key_insight"])
+
          tasks[task_id]["keyPoints"] = key_points_list
          tasks[task_id]["status"] = TaskStatus.COMPLETED
      except Exception as e:
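
Note on the new `keyPoints` path: with `extractActionItems` and the combined dict gone, the Flutter-facing list is a plain filter over `segments`. A minimal runnable sketch, using a hypothetical `summary_json` payload shaped like `SummarySchema` output:

```python
# Hypothetical payload; only the fields the keyPoints path touches are filled in.
summary_json = {
    "detected_language": "English",
    "topics": ["Python", "Decorators"],
    "segments": [
        {"title": "Intro", "key_insight": "Functions are first-class objects."},
        {"title": "Syntax", "key_insight": ""},  # empty insight -> skipped
    ],
}

key_points_list = []
for seg in summary_json.get("segments", []):
    # Non-dict segments and segments without a key_insight are skipped,
    # so the frontend list never contains empty strings.
    if isinstance(seg, dict) and seg.get("key_insight"):
        key_points_list.append(seg["key_insight"])

print(key_points_list)  # ['Functions are first-class objects.']
```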
src/api/notes_routes.py CHANGED

@@ -282,19 +282,13 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
          tasks[task_id]["status"] = "generating_notes"
          note_gen = NoteGenerator()
          summary_json = note_gen.generateSummary(transcript_text, video_title)
-         action_items = note_gen.extractActionItems(transcript_text, video_title)
-
-         combined_notes = {
-             **summary_json,
-             "action_items": action_items.get("action_items", [])
-         }

          # Extract the real video duration instead of the hard-coded zero
          tasks[task_id]["message"] = "Fetching video metadata..."
          video_duration = get_youtube_duration(youtube_url)

          final_markdown = note_gen.format_final_notes(
-             note_gen.format_notes_to_markdown(combined_notes),
+             note_gen.format_notes_to_markdown(summary_json),
              video_title,
              youtube_url,
              video_duration,
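
Both task runners now reduce to the same three calls on `NoteGenerator`. A self-contained sketch of that flow; the placeholder inputs stand in for the transcription step and the `get_youtube_duration` fetch:

```python
from src.summarization.note_generator import NoteGenerator

# Placeholder inputs; in the routes these come from the transcription
# step and the YouTube metadata fetch (get_youtube_duration).
transcript_text = "Here is the complete video transcript..."
video_title = "Introduction to Python"
youtube_url = "https://www.youtube.com/watch?v=example"
video_duration = 754  # seconds

note_gen = NoteGenerator()  # requires GROQ_API_KEY in the environment
summary_json = note_gen.generateSummary(transcript_text, video_title)
markdown_body = note_gen.format_notes_to_markdown(summary_json)
final_markdown = note_gen.format_final_notes(
    markdown_body,
    video_title,
    youtube_url,
    video_duration,  # rendered as the Duration header line
)
print(final_markdown)
```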
src/summarization/README.md CHANGED

@@ -1,67 +1,94 @@
  # Summarization Module 📝

  ## Responsibility
- This module handles **text summarization and conversion to study notes**.
+ This module handles **text summarization and conversion to structured study notes**.

  ## Functionality
  1. Receive transcribed text from videos.
- 2. Use **Google Gemini** to analyze text and convert it to organized notes.
- 3. Create a Markdown file containing:
-    - General summary
-    - Key concepts
-    - Timeline
-    - Action items
+ 2. Use **Groq (Llama-3.3-70b-versatile)** to analyze text and generate structured JSON notes.
+ 3. Produce clean Markdown output with:
+    - Source & Duration header
+    - Overall Summary
+    - Chronological Timeline (3-7 segments with Key Insight + Why It Matters)
+    - Conclusion

  ## Files

- ### 1. `note_generator.py`
- - **Purpose:** Generate notes using Gemini AI.
+ ### 1. `schemas.py`
+ - **Purpose:** Single source of truth for all Pydantic data models.
+ - **Key Classes:**
+   - `SummarySchema` — Full structured output (title, detected_language, summary, segments, conclusion, topics).
+   - `SegmentSchema` — A timeline section (title, summary, key_insight, why_it_matters).
+
+ ### 2. `note_generator.py`
+ - **Purpose:** Generate notes using Groq AI with strict JSON enforcement.
  - **Main Class:** `NoteGenerator`
  - **Key Methods:**
-   - `generate_notes_json(transcript, title)` - Generates structured JSON.
-   - `format_notes_to_markdown(json_notes)` - Converts JSON to Markdown.
-
- ### 2. `schemas.py`
- - **Purpose:** Define data structure (Schema) for notes.
- - **Main Class:** `StudyNoteSchema`
- - **Fields:**
-   - `summary` - General summary.
-   - `key_concepts` - List of concepts and definitions.
-   - `timestamps` - Timeline of topics.
-   - `action_items` - Suggested tasks or exercises.
+   - `generateSummary(transcript, title)` — Generates structured JSON study notes.
+   - `format_notes_to_markdown(json_notes)` — Converts JSON to clean Markdown.
+   - `format_final_notes(notes, title, url, duration)` — Wraps Markdown with Source/Duration header.

  ### 3. `segmenter.py`
- - **Purpose:** Split long texts into smaller segments.
+ - **Purpose:** Split long texts into smaller segments for preprocessing.
  - **Main Class:** `TranscriptSegmenter`
  - **Key Methods:**
-   - `segment_by_time()` - Split by time (e.g., every 5 minutes).
-   - `clean_text()` - Remove filler words (um, uh, like).
+   - `segment_by_time()` — Split by time intervals.
+   - `clean_text()` — Remove filler words.
+
+ ## JSON Output Structure
+ ```json
+ {
+   "title": "...",
+   "detected_language": "English",
+   "summary": "Overall summary (3-5 sentences)",
+   "segments": [
+     {
+       "title": "Segment title",
+       "summary": "What this section covers",
+       "key_insight": "Most important point",
+       "why_it_matters": "Why this is valuable"
+     }
+   ],
+   "conclusion": "Final takeaway",
+   "topics": ["Topic1", "Topic2"]
+ }
+ ```

- ## Proposed Enhancements
- - [ ] Add support for diagrams and illustrations.
- - [ ] Improve prompts for more detailed summaries.
- - [ ] Add translation feature to Arabic.
+ > **Note:** `topics` is hidden metadata — not rendered in markdown, used by downstream modules only.
+
+ ## Markdown Output Order
+ 1. **Source** — video URL
+ 2. **Duration** — video length
+ 3. **Overall Summary** — one concise summary
+ 4. **Timeline** — chronological segments (3-7), each with Key Insight + Why It Matters
+ 5. **Conclusion** — final takeaway
+
+ ## Labels (Localized)
+ | Key | English | Arabic |
+ |-----|---------|--------|
+ | source | Source | المصدر |
+ | duration | Duration | المدة |
+ | summary | Overall Summary | الملخص العام |
+ | timeline | Timeline | التسلسل الزمني |
+ | insight | Key Insight | أهم نقطة |
+ | why | Why It Matters | لماذا يهم؟ |
+ | conclusion | Conclusion | الخلاصة |

  ## Testing
  ```python
- from src.ai_modules.summarization.note_generator import NoteGenerator
+ from src.summarization.note_generator import NoteGenerator

  generator = NoteGenerator()
  transcript = "Here is the complete video transcript..."
  title = "Introduction to Python"

  # Generate notes
- notes_json = generator.generate_notes_json(transcript, title)
- notes_md = generator.format_notes_to_markdown(notes_json)
+ summary_json = generator.generateSummary(transcript, title)
+ notes_md = generator.format_notes_to_markdown(summary_json)

  print(notes_md)
  ```

  ## Libraries Used
- - `google-genai` - Communicate with Google Gemini.
- - `pydantic` - Data validation.
-
- ## Important Notes
- - Currently using `gemini-flash-latest` model.
- - Summary quality can be improved by modifying the `SYSTEM_PROMPT`.
- - The Schema ensures the output is always in valid JSON format.
+ - `groq` — Communicate with Groq API (Llama-3.3-70b-versatile).
+ - `pydantic` — Data validation and schema enforcement.
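
The JSON structure documented above is exactly what `SummarySchema` validates. A quick self-check with a hand-written payload (values are illustrative):

```python
from src.summarization.schemas import SummarySchema

payload = {
    "title": "Introduction to Python",
    "detected_language": "English",
    "summary": "A high-level overview of the course and why Python is worth learning.",
    "segments": [
        {
            "title": "Setup",
            "summary": "Installing Python and creating a project.",
            "key_insight": "Use a virtual environment from day one.",
            "why_it_matters": "It keeps project dependencies isolated.",
        },
    ] * 3,  # the schema requires 3-7 segments
    "conclusion": "Practice a little every day.",
    "topics": ["Python"],
}

validated = SummarySchema(**payload)  # raises pydantic.ValidationError on mismatch
print(validated.topics)  # ['Python'] (hidden metadata, never rendered in Markdown)
```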
src/summarization/note_generator.py CHANGED

@@ -1,101 +1,63 @@
  import json
  import os
- from typing import Dict, List, Literal, Optional
+ from typing import Dict, Optional

  from groq import Groq
- from pydantic import BaseModel, Field, ValidationError
+ from pydantic import ValidationError

  from ..utils.logger import setup_logger
+ from .schemas import SummarySchema


  logger = setup_logger(__name__)

- # ─────────────────────────────────────────────────────────────────────────────
- # PYDANTIC SCHEMAS
- # ─────────────────────────────────────────────────────────────────────────────
-
- class KeyConceptSchema(BaseModel):
-     term: str = Field(description="Concept or term name")
-     definition: str = Field(description="Clear, detailed explanation (2-4 sentences)")
-     importance: str = Field(description="Why this concept matters in context (1-2 sentences)")
-
- class SegmentSchema(BaseModel):
-     title: str = Field(description="Short, descriptive title for this chronological segment")
-     focus_topic: str = Field(description="1-2 sentence explanation of the central theme or argument covered in this segment")
-     key_points: List[str] = Field(description="3-6 concise bullet-point takeaways extracted from this segment")
-
- class ActionItemEntrySchema(BaseModel):
-     action: str = Field(description="A specific, actionable takeaway")
-     rationale: str = Field(description="Why the viewer should do this (1 sentence)")
-
- class SummarySchema(BaseModel):
-     title: str = Field(description="Inferred video title in the transcript language")
-     detected_language: str = Field(description="Detected language of the transcript, stated in English (e.g. 'Arabic', 'English')")
-     hook: str = Field(description="A compelling 2-3 sentence teaser in the transcript language")
-     quick_summary: str = Field(description="A concise, high-level paragraph (3-5 sentences) explaining what the entire video is about, its main thesis, and its value to the viewer.")
-     segments: List[SegmentSchema] = Field(description="3-5 chronological chapters of the video (up to 7 for exceptionally long or dense videos). Must follow the natural progression of the transcript.")
-     key_concepts: List[KeyConceptSchema] = Field(description="3-8 key concepts/terms with definition AND importance")
-     keywords: List[str] = Field(description="5-10 relevant topic tags")
-     category: Literal[
-         "Technology & AI",
-         "Business & Finance",
-         "Education & Science",
-         "Productivity & Self-Growth",
-         "News & Politics",
-         "Entertainment & Lifestyle",
-         "Health & Sports"
-     ] = Field(description="You MUST categorize the content into EXACTLY one of the 7 provided categories. Do not use any other label.")
-     closing_thought: str = Field(description="A motivating or thought-provoking closing sentence in the transcript language")
-
- class ActionItemsSchema(BaseModel):
-     action_items: List[ActionItemEntrySchema] = Field(description="3-6 specific actionable takeaways, each with action + rationale")
-

  # ─────────────────────────────────────────────────────────────────────────────
- # PROMPT TEMPLATES (STRICT JSON ENFORCEMENT)
+ # PROMPT TEMPLATES
  # ─────────────────────────────────────────────────────────────────────────────

  _SUMMARY_SYSTEM = """
- You are an expert educational content analyst and note-taking specialist.
- Transform raw video transcripts into structured, deeply insightful JSON notes.
+ You are an expert educational content analyst and structured note-taking specialist.
+ Transform raw video transcripts into clean, structured chronological JSON summaries.

  LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
  - Detect the primary language of the transcript.
- - Every single field MUST be written entirely in that SAME detected language.
+ - Every content field (title, summary, segments, conclusion) MUST be written entirely in that SAME detected language.
  - Do NOT mix languages. Arabic transcript -> everything in Arabic.
- - Only the `detected_language` field itself is stated in English (e.g. "Arabic").
+ - Only the "detected_language" field itself is stated in English (e.g. "Arabic").

- SEGMENT RULES — STRICTLY ENFORCED:
+ TIMELINE RULES — STRICTLY ENFORCED:
  - Divide the transcript into chronological segments that follow its natural progression.
- - Standard videos: produce a MINIMUM of 3 and a MAXIMUM of 5 segments.
- - Exceptionally long or content-dense videos ONLY: you may scale up to an absolute maximum of 7 segments.
- - Each segment MUST cover a distinct phase or theme; do NOT repeat the same topic across multiple segments.
+ - Produce a MINIMUM of 3 and a MAXIMUM of 7 segments.
+ - Each segment MUST cover a distinct phase or theme; do NOT repeat the same topic.
  - Segments must be ordered chronologically as they appear in the transcript.
-
- CRITICAL: YOU MUST RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
+ - Each segment must include:
+   * title: a short descriptive title
+   * summary: concise summary of that section (2-3 sentences)
+   * key_insight: the single most important takeaway from that section
+   * why_it_matters: brief explanation of value/importance (1-2 sentences)
+
+ TOPICS RULE:
+ - Extract the actual topics discussed in the video dynamically.
+ - Topics should be specific and descriptive (e.g. "Python", "Machine Learning", "Neural Networks").
+ - Do NOT use generic fixed categories.
+
+ CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
  DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
  {
-   "title": "Inferred video title",
+   "title": "Inferred video title in transcript language",
    "detected_language": "English (or Arabic, etc.)",
-   "hook": "A compelling 2-3 sentence teaser in the transcript language",
-   "quick_summary": "Concise high-level paragraph (3-5 sentences) explaining what the entire video is about, its main thesis, and its value to the viewer — in the transcript language",
+   "summary": "Concise overall summary (3-5 sentences)",
    "segments": [
      {
-       "title": "Chronological segment title",
-       "focus_topic": "1-2 sentence description of the central theme covered in this segment",
-       "key_points": ["point 1", "point 2", "point 3"]
-     }
-   ],
-   "key_concepts": [
-     {
-       "term": "Concept name",
-       "definition": "Clear explanation (2-4 sentences)",
-       "importance": "Why it matters (1-2 sentences)"
+       "title": "Segment title",
+       "summary": "What this section covers (2-3 sentences)",
+       "key_insight": "Most important point from this section",
+       "why_it_matters": "Why this is valuable (1-2 sentences)"
      }
    ],
-   "keywords": ["tag1", "tag2"],
-   "category": "<EXACTLY one of: Technology & AI | Business & Finance | Education & Science | Productivity & Self-Growth | News & Politics | Entertainment & Lifestyle | Health & Sports>",
-   "closing_thought": "A motivating or thought-provoking closing sentence in the transcript language"
+   "conclusion": "Final overall takeaway / closing conclusion",
+   "topics": ["Topic1", "Topic2", "Topic3"]
  }

  OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
@@ -108,69 +70,34 @@ TRANSCRIPT:
  {transcript}

  Analyze thoroughly. Detect the language.
- Divide the content into 3-5 chronological segments (max 7 for very long/dense videos).
+ Divide the content into 3-7 chronological segments.
+ For each segment provide: title, summary, key_insight, why_it_matters.
  Return ONLY the exact JSON structure requested.
  """.strip()

- _ACTIONS_SYSTEM = """
- You are an expert at extracting actionable insights from educational content.
-
- LANGUAGE RULE — CRITICAL:
- - Detect the primary language of the transcript and output entirely in that language.
-
- CRITICAL: Return ONLY a valid JSON object EXACTLY matching this structure:
- {
-   "action_items": [
-     {
-       "action": "A specific, actionable takeaway",
-       "rationale": "Why the viewer should do this"
-     }
-   ]
- }
- """.strip()
-
- _ACTIONS_USER = """
- TRANSCRIPT:
- {transcript}
-
- Extract actionable takeaways. Return JSON only matching the requested structure.
- """.strip()
-

  # ─────────────────────────────────────────────────────────────────────────────
- # LANGUAGE LABEL MAPS (For UI localization)
+ # LANGUAGE LABELS (simplified)
  # ─────────────────────────────────────────────────────────────────────────────

  _LABELS = {
      "Arabic": {
-         "hook": "لمحة سريعة",
-         "summary": "الملخص التنفيذي",
-         "core_topics": "المحاور الرئيسية",
-         "concepts": "المفاهيم الأساسية",
-         "timeline": "خط الزمن",
-         "actions": "خطوات عملية",
-         "closing": "فكرة ختامية",
-         "source": "المصدر",
-         "duration": "المدة",
-         "why": "لماذا؟",
-         "insight": "💎 نقطة مضيئة",
-         "importance": "الأهمية",
-         "tags": "الوسوم",
+         "source": "المصدر",
+         "duration": "المدة",
+         "summary": "الملخص العام",
+         "timeline": "التسلسل الزمني",
+         "insight": "أهم نقطة",
+         "why": "لماذا يهم؟",
+         "conclusion": "الخلاصة",
      },
      "English": {
-         "hook": "Quick Teaser",
-         "summary": "Executive Summary",
-         "core_topics": "Core Topics",
-         "concepts": "Key Concepts",
-         "timeline": "Timeline",
-         "actions": "Action Items",
-         "closing": "Closing Thought",
-         "source": "Source",
-         "duration": "Duration",
-         "why": "Why?",
-         "insight": "💎 Key Insight",
-         "importance": "Importance",
-         "tags": "Tags",
+         "source": "Source",
+         "duration": "Duration",
+         "summary": "Overall Summary",
+         "timeline": "Timeline",
+         "insight": "Key Insight",
+         "why": "Why It Matters",
+         "conclusion": "Conclusion",
      },
  }

@@ -183,20 +110,20 @@ def _labels(language: str) -> dict:
  # ─────────────────────────────────────────────────────────────────────────────
  # ─────────────────────────────────────────────────────────────────────────────

  class NoteGenerator:
-     """Generates premium structured study notes using Groq (Llama-3.3-70b-versatile)."""
+     """Generates structured study notes using Groq (Llama-3.3-70b-versatile)."""

      def __init__(self):
          self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
          self.client = Groq(api_key=self.api_key) if self.api_key else None
          self.model_id = "llama-3.3-70b-versatile"
-         logger.info(f"🚀 NoteGenerator v3.1 initialized — model: {self.model_id}")
+         logger.info(f"🚀 NoteGenerator v4.0 initialized — model: {self.model_id}")

      def _chat(self, system: str, user: str, max_tokens: int = 4096) -> Optional[str]:
          try:
              response = self.client.chat.completions.create(
                  model=self.model_id,
                  max_tokens=max_tokens,
-                 temperature=0.3,  # lower creativity to keep the output on-template
+                 temperature=0.3,
                  response_format={"type": "json_object"},
                  messages=[
                      {"role": "system", "content": system},
@@ -211,17 +138,15 @@ class NoteGenerator:
      def _get_error_json(self, error_msg: str) -> Dict:
          return {
              "title": "Error in Generation",
-             "quick_summary": f"Could not generate notes: {error_msg}",
-             "key_concepts": [],
-             "segments": [],
-             "keywords": [],
              "detected_language": "English",
-             "hook": "",
-             "category": "Education & Science",
-             "closing_thought": "",
+             "summary": f"Could not generate notes: {error_msg}",
+             "segments": [],
+             "conclusion": "",
+             "topics": [],
          }

      def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
+         """Generate structured JSON summary from transcript."""
          if not self.client:
              return self._get_error_json("Groq API Key missing.")

@@ -243,26 +168,8 @@ class NoteGenerator:
              logger.error(f"❌ Schema validation failed: {e}")
              return self._get_error_json(f"Validation Error: {str(e)}")

-     def extractActionItems(self, transcript_text: str, video_title: str) -> Dict:
-         if not self.client:
-             return {"action_items": []}
-
-         logger.info(f"✅ Action items extraction started via {self.model_id}")
-         user_prompt = _ACTIONS_USER.format(transcript=transcript_text[:20000])
-
-         raw = self._chat(_ACTIONS_SYSTEM, user_prompt, max_tokens=1024)
-         if raw is None:
-             return {"action_items": []}
-
-         try:
-             data = json.loads(raw)
-             validated = ActionItemsSchema(**data)
-             return validated.model_dump()
-         except (json.JSONDecodeError, ValidationError) as e:
-             logger.error(f"❌ Action items validation failed: {e}")
-             return {"action_items": []}
-
      def format_notes_to_markdown(self, json_notes: Dict) -> str:
+         """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
          lang = json_notes.get("detected_language", "English")
          L = _labels(lang)
          lines: list[str] = []
@@ -278,98 +185,43 @@ class NoteGenerator:
          lines.append("---")
          lines.append("")

-         # ── HOOK ──
-         hook = json_notes.get("hook", "")
-         if hook:
-             add(f"## ✨ {L['hook']}")
-             blank()
-             add(f"*{hook}*")
-             divider()
-
-         # ── SUMMARY (legacy fallback) ──
-         legacy_summary = json_notes.get("summary", "")
-         if legacy_summary and not json_notes.get("quick_summary"):
+         # ── OVERALL SUMMARY ──
+         summary = json_notes.get("summary", "")
+         if summary:
              add(f"## 📋 {L['summary']}")
              blank()
-             add(legacy_summary)
+             add(summary)
              divider()

-         # ── QUICK SUMMARY ──
-         quick_summary = json_notes.get("quick_summary", "")
-         if quick_summary:
-             add(f"## 📋 {L['summary']}")
-             blank()
-             add(quick_summary)
-             divider()
-
-         # ── SEGMENTS / CHAPTERS ──
+         # ── TIMELINE ──
          segments = json_notes.get("segments", [])
          if segments:
-             add(f"## 🧠 {L['core_topics']}")
+             add(f"## 🕐 {L['timeline']}")
              blank()
              for i, seg in enumerate(segments, start=1):
                  s_title = seg.get("title", "") if isinstance(seg, dict) else seg.title
-                 s_focus = seg.get("focus_topic", "") if isinstance(seg, dict) else seg.focus_topic
-                 s_points = seg.get("key_points", []) if isinstance(seg, dict) else seg.key_points
+                 s_summary = seg.get("summary", "") if isinstance(seg, dict) else seg.summary
+                 s_insight = seg.get("key_insight", "") if isinstance(seg, dict) else seg.key_insight
+                 s_why = seg.get("why_it_matters", "") if isinstance(seg, dict) else seg.why_it_matters

                  add(f"### {i}. {s_title}")
                  blank()
-                 add(f"*{s_focus}*")
-                 blank()
-                 for point in s_points:
-                     add(f"- {point}")
+                 add(s_summary)
                  blank()
+                 if s_insight:
+                     add(f"> **💎 {L['insight']}:** {s_insight}")
+                     blank()
+                 if s_why:
+                     add(f"> **{L['why']}** {s_why}")
+                     blank()
              divider()

-         # ── KEY CONCEPTS ──
-         key_concepts = json_notes.get("key_concepts", [])
-         if key_concepts:
-             add(f"## 💡 {L['concepts']}")
-             blank()
-             for concept in key_concepts:
-                 term = concept.get("term", "") if isinstance(concept, dict) else concept.term
-                 definition = concept.get("definition", "") if isinstance(concept, dict) else concept.definition
-                 importance = concept.get("importance", "") if isinstance(concept, dict) else concept.importance
-
-                 add(f"**{term}**")
-                 blank()
-                 add(definition)
-                 blank()
-                 if importance:
-                     add(f"> *{L['importance']}: {importance}*")
-                 blank()
-             divider()
-
-         # ── ACTION ITEMS ──
-         action_items = json_notes.get("action_items", [])
-         if action_items:
-             add(f"## 🎯 {L['actions']}")
-             blank()
-             for idx, item in enumerate(action_items, start=1):
-                 action = item.get("action", "") if isinstance(item, dict) else item.action
-                 rationale = item.get("rationale", "") if isinstance(item, dict) else item.rationale
-
-                 add(f"**{idx}. {action}**")
-                 blank()
-                 if rationale:
-                     add(f"> *{L['why']} {rationale}*")
-                 blank()
-             divider()
-
-         # ── KEYWORDS ──
-         keywords = json_notes.get("keywords", [])
-         if keywords:
-             add(f"## 🏷️ {L['tags']}")
+         # ── CONCLUSION ──
+         conclusion = json_notes.get("conclusion", "")
+         if conclusion:
+             add(f"## 🔖 {L['conclusion']}")
              blank()
-             add(" ".join([f"`{kw}`" for kw in keywords]))
-             divider()
-
-         # ── CLOSING THOUGHT ──
-         closing = json_notes.get("closing_thought", "")
-         if closing:
-             add(f"## 🔖 {L['closing']}")
-             blank()
-             add(f"> {closing}")
+             add(f"> {conclusion}")
              blank()

          return "\n".join(lines)
@@ -382,10 +234,11 @@ class NoteGenerator:
          duration: int,
      ) -> str:
          """
-         Wrap the formatted Markdown body with the video header.
-         Fixes the 00:00 duration bug.
+         Wrap the formatted Markdown body with Source + Duration header.
          """
-         # If the duration is zero or missing
+         lang_hint = "English"  # default for header
+         L = _labels(lang_hint)
+
          if duration and duration > 0:
              hours = int(duration // 3600)
              minutes = int((duration % 3600) // 60)
@@ -400,8 +253,8 @@ class NoteGenerator:
          header = (
              f"# {video_title}\n\n"
              f"---\n\n"
-             f"> **Source:** {video_url} \n"
-             f"> **Duration:** {duration_str}\n\n"
+             f"> **{L['source']}:** {video_url} \n"
+             f"> **{L['duration']}:** {duration_str}\n\n"
              f"---\n\n"
          )
          return header + notes
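
The hunks above keep the body of `generateSummary` mostly out of view; from the `❌ Schema validation failed` context line and the removed `extractActionItems` body, its parse-and-validate step presumably follows the same json.loads -> validate -> model_dump pattern. A sketch under that assumption, with `parse_and_validate` as a hypothetical stand-in, not the verbatim method:

```python
import json

from pydantic import ValidationError

from src.summarization.schemas import SummarySchema


def parse_and_validate(raw: str) -> dict:
    """Hypothetical sketch of the validation step implied by the hunk
    context; mirrors the removed extractActionItems pattern."""
    try:
        data = json.loads(raw)
        validated = SummarySchema(**data)
        return validated.model_dump()
    except (json.JSONDecodeError, ValidationError) as exc:
        # generateSummary falls back to _get_error_json here: same keys
        # as SummarySchema, but with empty content fields.
        return {
            "title": "Error in Generation",
            "detected_language": "English",
            "summary": f"Could not generate notes: {exc}",
            "segments": [],
            "conclusion": "",
            "topics": [],
        }
```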
src/summarization/schemas.py CHANGED

@@ -1,64 +1,58 @@
- from typing import Annotated, List, Literal
+ from typing import Annotated, List
  from pydantic import BaseModel, Field


  # ---------------------------------------------------------------------------
- # Segment / Chapter schema
+ # Timeline Segment schema
  # ---------------------------------------------------------------------------

  class SegmentSchema(BaseModel):
-     """Represents one chronological chapter/batch of the video."""
+     """Represents one chronological section of the video timeline."""

      title: str = Field(
          ...,
-         description="A short, descriptive title for this chronological segment of the video.",
+         description="A short, descriptive title for this chronological segment.",
      )
-     focus_topic: str = Field(
+     summary: str = Field(
          ...,
-         description=(
-             "A 1–2 sentence explanation of the central theme or argument"
-             " covered in this segment."
-         ),
+         description="A concise summary of what is covered in this segment (2-3 sentences).",
      )
-     key_points: List[str] = Field(
+     key_insight: str = Field(
          ...,
-         min_length=2,
-         description="3–6 concise bullet-point takeaways extracted from this segment.",
+         description="The single most important point or takeaway from this segment.",
+     )
+     why_it_matters: str = Field(
+         ...,
+         description="Brief explanation of the value or importance of this segment (1-2 sentences).",
      )


  # ---------------------------------------------------------------------------
- # Primary response schema
+ # Primary Summary response schema
  # ---------------------------------------------------------------------------

  class SummarySchema(BaseModel):
      """Top-level structured output returned by the LLM summarization call."""

-     title: str = Field(..., description="Inferred title of the video.")
+     title: str = Field(
+         ...,
+         description="Inferred title of the video in the transcript language.",
+     )

-     quick_summary: str = Field(
+     detected_language: str = Field(
          ...,
          description=(
-             "A concise, high-level paragraph (3–5 sentences) that explains"
-             " what the entire video is about, its main thesis, and its value"
-             " to the viewer."
+             "Detected language of the transcript, stated in English"
+             " (e.g. 'Arabic', 'English')."
          ),
      )

-     # STRICTLY ENFORCED — do NOT alter the allowed values.
-     category: Literal[
-         "Technology & AI",
-         "Business & Finance",
-         "Education & Science",
-         "Productivity & Self-Growth",
-         "News & Politics",
-         "Entertainment & Lifestyle",
-         "Health & Sports",
-     ] = Field(
+     summary: str = Field(
          ...,
          description=(
-             "You MUST categorize the content into EXACTLY one of the 7 provided"
-             " categories. Do not use any other label."
+             "A concise, high-level paragraph (3-5 sentences) that explains"
+             " what the entire video is about, its main thesis, and its value"
+             " to the viewer."
          ),
      )

@@ -68,25 +62,23 @@ class SummarySchema(BaseModel):
              min_length=3,
              max_length=7,
              description=(
-                 "Chronological chapters of the video. Standard videos MUST have"
-                 " 3–5 segments; exceptionally long or dense videos may use up to 7."
-                 " Segments must follow the natural progression of the transcript."
+                 "Chronological timeline sections of the video. Minimum 3,"
+                 " maximum 7. Must follow the natural progression of the transcript."
              ),
          ),
      ]

-     keywords: List[str] = Field(
+     conclusion: str = Field(
          ...,
-         description="5–10 relevant topic tags for categorization and recommendations.",
+         description="A final overall takeaway or closing conclusion in the transcript language.",
      )

-
- # ---------------------------------------------------------------------------
- # Action-items schema (unchanged)
- # ---------------------------------------------------------------------------
-
- class ActionItemsSchema(BaseModel):
-     action_items: List[str] = Field(
+     # Hidden metadata — not rendered in markdown, used by downstream modules.
+     topics: List[str] = Field(
          ...,
-         description="Actionable takeaways or tasks derived from the content.",
+         min_length=1,
+         description=(
+             "Dynamically extracted topics discussed in the video."
+             " Examples: ['Python', 'Machine Learning', 'Neural Networks']."
+         ),
      )
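
One consequence of the `Annotated` segments field worth spelling out: the 3-7 range is enforced at validation time, not merely documented. A short sketch:

```python
from pydantic import ValidationError

from src.summarization.schemas import SegmentSchema, SummarySchema

seg = SegmentSchema(
    title="Setup",
    summary="Installing Python and creating a project.",
    key_insight="Use a virtual environment from day one.",
    why_it_matters="It keeps project dependencies isolated.",
)

base = dict(
    title="Introduction to Python",
    detected_language="English",
    summary="A short overview of the course.",
    conclusion="Practice a little every day.",
    topics=["Python"],
)

SummarySchema(**base, segments=[seg] * 3)  # accepted: 3 is the minimum

try:
    SummarySchema(**base, segments=[seg] * 2)  # rejected: min_length=3
except ValidationError as exc:
    # Under pydantic v2 the length violation reports as 'too_short'.
    print(exc.errors()[0]["type"])
```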