amanyelfiky committed
Commit 3c66ec8 · Parent: 97312dd

Refactor summarization module and update schemas

src/api/main.py CHANGED

@@ -54,7 +54,7 @@ class TaskStatusResponse(BaseModel):
      user_id: str | None = None
      created_at: datetime | None = None
      notes: str | None = None
-     category: str | None = "Uncategorized"
+     topics: list | None = []
      keyPoints: list | None = []


@@ -221,18 +221,9 @@ async def process_video_and_save(
          tasks[task_id]["status"] = TaskStatus.GENERATING_NOTES
          note_gen = NoteGenerator()
          summary_json = note_gen.generateSummary(transcript_text, video_info["title"])
-
-         # 3. Extract Action Items
-         action_items_json = note_gen.extractActionItems(transcript_text, video_info["title"])

-         # Combine results
-         combined_json = {
-             **summary_json,
-             "action_items": action_items_json.get("action_items", [])
-         }
-
          final_notes = note_gen.format_final_notes(
-             note_gen.format_notes_to_markdown(combined_json),
+             note_gen.format_notes_to_markdown(summary_json),
              video_info["title"],
              youtube_url,
              video_info["duration"],
@@ -245,7 +236,7 @@ async def process_video_and_save(
              "video_url": youtube_url,
              "video_title": video_info["title"],
              "summary_content": final_notes,
-             "category": combined_json.get("category", "Uncategorized"),
+             "topics": summary_json.get("topics", []),
              "created_at": datetime.utcnow()
          }
          db.collection("notes").add(note_data)
@@ -253,17 +244,15 @@
              logger.warning("Firestore not initialized, note not saved to DB but generated in memory.")

          tasks[task_id]["notes"] = final_notes
-         tasks[task_id]["category"] = combined_json.get("category", "Uncategorized")
-
-         # safely extract strings from KeyConcept objects for Flutter frontend
-         key_concepts = combined_json.get("key_concepts", [])
+         tasks[task_id]["topics"] = summary_json.get("topics", [])
+
+         # Extract key insights from segments for Flutter frontend
+         segments = summary_json.get("segments", [])
          key_points_list = []
-         for kc in key_concepts:
-             if isinstance(kc, dict):
-                 key_points_list.append(f"{kc.get('term', '')}: {kc.get('definition', '')}")
-             else:
-                 key_points_list.append(str(kc))
-
+         for seg in segments:
+             if isinstance(seg, dict) and seg.get("key_insight"):
+                 key_points_list.append(seg["key_insight"])
+
          tasks[task_id]["keyPoints"] = key_points_list
          tasks[task_id]["status"] = TaskStatus.COMPLETED
      except Exception as e:
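
Note on the new `keyPoints` path: with `extractActionItems` and the combined dict gone, the Flutter-facing list is a plain filter over `segments`. A minimal runnable sketch, using a hypothetical `summary_json` payload shaped like `SummarySchema` output:

```python
# Hypothetical payload; only the fields the keyPoints path touches are filled in.
summary_json = {
    "detected_language": "English",
    "topics": ["Python", "Decorators"],
    "segments": [
        {"title": "Intro", "key_insight": "Functions are first-class objects."},
        {"title": "Syntax", "key_insight": ""},  # empty insight -> skipped
    ],
}

key_points_list = []
for seg in summary_json.get("segments", []):
    # Non-dict segments and segments without a key_insight are skipped,
    # so the frontend list never contains empty strings.
    if isinstance(seg, dict) and seg.get("key_insight"):
        key_points_list.append(seg["key_insight"])

print(key_points_list)  # ['Functions are first-class objects.']
```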
src/api/notes_routes.py CHANGED

@@ -282,19 +282,13 @@ async def process_video_task(task_id: str, youtube_url: str, language: str, user
          tasks[task_id]["status"] = "generating_notes"
          note_gen = NoteGenerator()
          summary_json = note_gen.generateSummary(transcript_text, video_title)
-         action_items = note_gen.extractActionItems(transcript_text, video_title)
-
-         combined_notes = {
-             **summary_json,
-             "action_items": action_items.get("action_items", [])
-         }

          # Extract the real video duration instead of the hard-coded zero
          tasks[task_id]["message"] = "Fetching video metadata..."
          video_duration = get_youtube_duration(youtube_url)

          final_markdown = note_gen.format_final_notes(
-             note_gen.format_notes_to_markdown(combined_notes),
+             note_gen.format_notes_to_markdown(summary_json),
              video_title,
              youtube_url,
              video_duration,
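
Both task runners now reduce to the same three calls on `NoteGenerator`. A self-contained sketch of that flow; the placeholder inputs stand in for the transcription step and the `get_youtube_duration` fetch:

```python
from src.summarization.note_generator import NoteGenerator

# Placeholder inputs; in the routes these come from the transcription
# step and the YouTube metadata fetch (get_youtube_duration).
transcript_text = "Here is the complete video transcript..."
video_title = "Introduction to Python"
youtube_url = "https://www.youtube.com/watch?v=example"
video_duration = 754  # seconds

note_gen = NoteGenerator()  # requires GROQ_API_KEY in the environment
summary_json = note_gen.generateSummary(transcript_text, video_title)
markdown_body = note_gen.format_notes_to_markdown(summary_json)
final_markdown = note_gen.format_final_notes(
    markdown_body,
    video_title,
    youtube_url,
    video_duration,  # rendered as the Duration header line
)
print(final_markdown)
```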
src/summarization/README.md CHANGED

@@ -1,67 +1,94 @@
  # Summarization Module 📝

  ## Responsibility
- This module handles **text summarization and conversion to study notes**.
+ This module handles **text summarization and conversion to structured study notes**.

  ## Functionality
  1. Receive transcribed text from videos.
- 2. Use **Google Gemini** to analyze text and convert it to organized notes.
- 3. Create a Markdown file containing:
-    - General summary
-    - Key concepts
-    - Timeline
-    - Action items
+ 2. Use **Groq (Llama-3.3-70b-versatile)** to analyze text and generate structured JSON notes.
+ 3. Produce clean Markdown output with:
+    - Source & Duration header
+    - Overall Summary
+    - Chronological Timeline (3-7 segments with Key Insight + Why It Matters)
+    - Conclusion

  ## Files

- ### 1. `note_generator.py`
- - **Purpose:** Generate notes using Gemini AI.
+ ### 1. `schemas.py`
+ - **Purpose:** Single source of truth for all Pydantic data models.
+ - **Key Classes:**
+   - `SummarySchema` — Full structured output (title, detected_language, summary, segments, conclusion, topics).
+   - `SegmentSchema` — A timeline section (title, summary, key_insight, why_it_matters).
+
+ ### 2. `note_generator.py`
+ - **Purpose:** Generate notes using Groq AI with strict JSON enforcement.
  - **Main Class:** `NoteGenerator`
  - **Key Methods:**
-   - `generate_notes_json(transcript, title)` - Generates structured JSON.
-   - `format_notes_to_markdown(json_notes)` - Converts JSON to Markdown.
-
- ### 2. `schemas.py`
- - **Purpose:** Define data structure (Schema) for notes.
- - **Main Class:** `StudyNoteSchema`
- - **Fields:**
-   - `summary` - General summary.
-   - `key_concepts` - List of concepts and definitions.
-   - `timestamps` - Timeline of topics.
-   - `action_items` - Suggested tasks or exercises.
+   - `generateSummary(transcript, title)` — Generates structured JSON study notes.
+   - `format_notes_to_markdown(json_notes)` — Converts JSON to clean Markdown.
+   - `format_final_notes(notes, title, url, duration)` — Wraps Markdown with Source/Duration header.

  ### 3. `segmenter.py`
- - **Purpose:** Split long texts into smaller segments.
+ - **Purpose:** Split long texts into smaller segments for preprocessing.
  - **Main Class:** `TranscriptSegmenter`
  - **Key Methods:**
-   - `segment_by_time()` - Split by time (e.g., every 5 minutes).
-   - `clean_text()` - Remove filler words (um, uh, like).
+   - `segment_by_time()` — Split by time intervals.
+   - `clean_text()` — Remove filler words.
+
+ ## JSON Output Structure
+ ```json
+ {
+   "title": "...",
+   "detected_language": "English",
+   "summary": "Overall summary (3-5 sentences)",
+   "segments": [
+     {
+       "title": "Segment title",
+       "summary": "What this section covers",
+       "key_insight": "Most important point",
+       "why_it_matters": "Why this is valuable"
+     }
+   ],
+   "conclusion": "Final takeaway",
+   "topics": ["Topic1", "Topic2"]
+ }
+ ```

- ## Proposed Enhancements
- - [ ] Add support for diagrams and illustrations.
- - [ ] Improve prompts for more detailed summaries.
- - [ ] Add translation feature to Arabic.
+ > **Note:** `topics` is hidden metadata — not rendered in markdown, used by downstream modules only.
+
+ ## Markdown Output Order
+ 1. **Source** — video URL
+ 2. **Duration** — video length
+ 3. **Overall Summary** — one concise summary
+ 4. **Timeline** — chronological segments (3-7), each with Key Insight + Why It Matters
+ 5. **Conclusion** — final takeaway
+
+ ## Labels (Localized)
+ | Key | English | Arabic |
+ |-----|---------|--------|
+ | source | Source | المصدر |
+ | duration | Duration | المدة |
+ | summary | Overall Summary | الملخص العام |
+ | timeline | Timeline | التسلسل الزمني |
+ | insight | Key Insight | أهم نقطة |
+ | why | Why It Matters | لماذا يهم؟ |
+ | conclusion | Conclusion | الخلاصة |

  ## Testing
  ```python
- from src.ai_modules.summarization.note_generator import NoteGenerator
+ from src.summarization.note_generator import NoteGenerator

  generator = NoteGenerator()
  transcript = "Here is the complete video transcript..."
  title = "Introduction to Python"

  # Generate notes
- notes_json = generator.generate_notes_json(transcript, title)
- notes_md = generator.format_notes_to_markdown(notes_json)
+ summary_json = generator.generateSummary(transcript, title)
+ notes_md = generator.format_notes_to_markdown(summary_json)

  print(notes_md)
  ```

  ## Libraries Used
- - `google-genai` - Communicate with Google Gemini.
- - `pydantic` - Data validation.
-
- ## Important Notes
- - Currently using `gemini-flash-latest` model.
- - Summary quality can be improved by modifying the `SYSTEM_PROMPT`.
- - The Schema ensures the output is always in valid JSON format.
+ - `groq` — Communicate with Groq API (Llama-3.3-70b-versatile).
+ - `pydantic` — Data validation and schema enforcement.
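
The JSON structure documented above is exactly what `SummarySchema` validates. A quick self-check with a hand-written payload (values are illustrative):

```python
from src.summarization.schemas import SummarySchema

payload = {
    "title": "Introduction to Python",
    "detected_language": "English",
    "summary": "A high-level overview of the course and why Python is worth learning.",
    "segments": [
        {
            "title": "Setup",
            "summary": "Installing Python and creating a project.",
            "key_insight": "Use a virtual environment from day one.",
            "why_it_matters": "It keeps project dependencies isolated.",
        },
    ] * 3,  # the schema requires 3-7 segments
    "conclusion": "Practice a little every day.",
    "topics": ["Python"],
}

validated = SummarySchema(**payload)  # raises pydantic.ValidationError on mismatch
print(validated.topics)  # ['Python'] (hidden metadata, never rendered in Markdown)
```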
src/summarization/note_generator.py CHANGED

@@ -1,101 +1,63 @@
  import json
  import os
- from typing import Dict, List, Literal, Optional
+ from typing import Dict, Optional

  from groq import Groq
- from pydantic import BaseModel, Field, ValidationError
+ from pydantic import ValidationError

  from ..utils.logger import setup_logger
+ from .schemas import SummarySchema


  logger = setup_logger(__name__)

- # ─────────────────────────────────────────────────────────────────────────────
- # PYDANTIC SCHEMAS
- # ─────────────────────────────────────────────────────────────────────────────
-
- class KeyConceptSchema(BaseModel):
-     term: str = Field(description="Concept or term name")
-     definition: str = Field(description="Clear, detailed explanation (2-4 sentences)")
-     importance: str = Field(description="Why this concept matters in context (1-2 sentences)")
-
- class SegmentSchema(BaseModel):
-     title: str = Field(description="Short, descriptive title for this chronological segment")
-     focus_topic: str = Field(description="1-2 sentence explanation of the central theme or argument covered in this segment")
-     key_points: List[str] = Field(description="3-6 concise bullet-point takeaways extracted from this segment")
-
- class ActionItemEntrySchema(BaseModel):
-     action: str = Field(description="A specific, actionable takeaway")
-     rationale: str = Field(description="Why the viewer should do this (1 sentence)")
-
- class SummarySchema(BaseModel):
-     title: str = Field(description="Inferred video title in the transcript language")
-     detected_language: str = Field(description="Detected language of the transcript, stated in English (e.g. 'Arabic', 'English')")
-     hook: str = Field(description="A compelling 2-3 sentence teaser in the transcript language")
-     quick_summary: str = Field(description="A concise, high-level paragraph (3-5 sentences) explaining what the entire video is about, its main thesis, and its value to the viewer.")
-     segments: List[SegmentSchema] = Field(description="3-5 chronological chapters of the video (up to 7 for exceptionally long or dense videos). Must follow the natural progression of the transcript.")
-     key_concepts: List[KeyConceptSchema] = Field(description="3-8 key concepts/terms with definition AND importance")
-     keywords: List[str] = Field(description="5-10 relevant topic tags")
-     category: Literal[
-         "Technology & AI",
-         "Business & Finance",
-         "Education & Science",
-         "Productivity & Self-Growth",
-         "News & Politics",
-         "Entertainment & Lifestyle",
-         "Health & Sports"
-     ] = Field(description="You MUST categorize the content into EXACTLY one of the 7 provided categories. Do not use any other label.")
-     closing_thought: str = Field(description="A motivating or thought-provoking closing sentence in the transcript language")
-
- class ActionItemsSchema(BaseModel):
-     action_items: List[ActionItemEntrySchema] = Field(description="3-6 specific actionable takeaways, each with action + rationale")
-

  # ─────────────────────────────────────────────────────────────────────────────
- # PROMPT TEMPLATES (STRICT JSON ENFORCEMENT)
+ # PROMPT TEMPLATES
  # ─────────────────────────────────────────────────────────────────────────────

  _SUMMARY_SYSTEM = """
- You are an expert educational content analyst and note-taking specialist.
- Transform raw video transcripts into structured, deeply insightful JSON notes.
+ You are an expert educational content analyst and structured note-taking specialist.
+ Transform raw video transcripts into clean, structured chronological JSON summaries.

  LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
  - Detect the primary language of the transcript.
- - Every single field MUST be written entirely in that SAME detected language.
+ - Every content field (title, summary, segments, conclusion) MUST be written entirely in that SAME detected language.
  - Do NOT mix languages. Arabic transcript -> everything in Arabic.
- - Only the `detected_language` field itself is stated in English (e.g. "Arabic").
+ - Only the "detected_language" field itself is stated in English (e.g. "Arabic").

- SEGMENT RULES — STRICTLY ENFORCED:
+ TIMELINE RULES — STRICTLY ENFORCED:
  - Divide the transcript into chronological segments that follow its natural progression.
- - Standard videos: produce a MINIMUM of 3 and a MAXIMUM of 5 segments.
- - Exceptionally long or content-dense videos ONLY: you may scale up to an absolute maximum of 7 segments.
- - Each segment MUST cover a distinct phase or theme; do NOT repeat the same topic across multiple segments.
+ - Produce a MINIMUM of 3 and a MAXIMUM of 7 segments.
+ - Each segment MUST cover a distinct phase or theme; do NOT repeat the same topic.
  - Segments must be ordered chronologically as they appear in the transcript.
-
- CRITICAL: YOU MUST RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
+ - Each segment must include:
+   * title: a short descriptive title
+   * summary: concise summary of that section (2-3 sentences)
+   * key_insight: the single most important takeaway from that section
+   * why_it_matters: brief explanation of value/importance (1-2 sentences)
+
+ TOPICS RULE:
+ - Extract the actual topics discussed in the video dynamically.
+ - Topics should be specific and descriptive (e.g. "Python", "Machine Learning", "Neural Networks").
+ - Do NOT use generic fixed categories.
+
+ CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
  DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
  {
-   "title": "Inferred video title",
+   "title": "Inferred video title in transcript language",
    "detected_language": "English (or Arabic, etc.)",
-   "hook": "A compelling 2-3 sentence teaser in the transcript language",
-   "quick_summary": "Concise high-level paragraph (3-5 sentences) explaining what the entire video is about, its main thesis, and its value to the viewer — in the transcript language",
+   "summary": "Concise overall summary (3-5 sentences)",
    "segments": [
      {
-       "title": "Chronological segment title",
-       "focus_topic": "1-2 sentence description of the central theme covered in this segment",
-       "key_points": ["point 1", "point 2", "point 3"]
-     }
-   ],
-   "key_concepts": [
-     {
-       "term": "Concept name",
-       "definition": "Clear explanation (2-4 sentences)",
-       "importance": "Why it matters (1-2 sentences)"
+       "title": "Segment title",
+       "summary": "What this section covers (2-3 sentences)",
+       "key_insight": "Most important point from this section",
+       "why_it_matters": "Why this is valuable (1-2 sentences)"
      }
    ],
-   "keywords": ["tag1", "tag2"],
-   "category": "<EXACTLY one of: Technology & AI | Business & Finance | Education & Science | Productivity & Self-Growth | News & Politics | Entertainment & Lifestyle | Health & Sports>",
-   "closing_thought": "A motivating or thought-provoking closing sentence in the transcript language"
+   "conclusion": "Final overall takeaway / closing conclusion",
+   "topics": ["Topic1", "Topic2", "Topic3"]
  }

  OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
@@ -108,69 +70,34 @@ TRANSCRIPT:
  {transcript}

  Analyze thoroughly. Detect the language.
- Divide the content into 3-5 chronological segments (max 7 for very long/dense videos).
+ Divide the content into 3-7 chronological segments.
+ For each segment provide: title, summary, key_insight, why_it_matters.
  Return ONLY the exact JSON structure requested.
  """.strip()

- _ACTIONS_SYSTEM = """
- You are an expert at extracting actionable insights from educational content.
-
- LANGUAGE RULE — CRITICAL:
- - Detect the primary language of the transcript and output entirely in that language.
-
- CRITICAL: Return ONLY a valid JSON object EXACTLY matching this structure:
- {
-   "action_items": [
-     {
-       "action": "A specific, actionable takeaway",
-       "rationale": "Why the viewer should do this"
-     }
-   ]
- }
- """.strip()
-
- _ACTIONS_USER = """
- TRANSCRIPT:
- {transcript}
-
- Extract actionable takeaways. Return JSON only matching the requested structure.
- """.strip()
-

  # ─────────────────────────────────────────────────────────────────────────────
- # LANGUAGE LABEL MAPS (For UI localization)
+ # LANGUAGE LABELS (simplified)
  # ─────────────────────────────────────────────────────────────────────────────

  _LABELS = {
      "Arabic": {
-         "hook": "لمحة سريعة",
-         "summary": "الملخص التنفيذي",
-         "core_topics": "المحاور الرئيسية",
-         "concepts": "المفاهيم الأساسية",
-         "timeline": "خط الزمن",
-         "actions": "خطوات عملية",
-         "closing": "فكرة ختامية",
-         "source": "المصدر",
-         "duration": "المدة",
-         "why": "لماذا؟",
-         "insight": "💎 نقطة مضيئة",
-         "importance": "الأهمية",
-         "tags": "الوسوم",
+         "source": "المصدر",
+         "duration": "المدة",
+         "summary": "الملخص العام",
+         "timeline": "التسلسل الزمني",
+         "insight": "أهم نقطة",
+         "why": "لماذا يهم؟",
+         "conclusion": "الخلاصة",
      },
      "English": {
-         "hook": "Quick Teaser",
-         "summary": "Executive Summary",
-         "core_topics": "Core Topics",
-         "concepts": "Key Concepts",
-         "timeline": "Timeline",
-         "actions": "Action Items",
-         "closing": "Closing Thought",
-         "source": "Source",
-         "duration": "Duration",
-         "why": "Why?",
-         "insight": "💎 Key Insight",
-         "importance": "Importance",
-         "tags": "Tags",
+         "source": "Source",
+         "duration": "Duration",
+         "summary": "Overall Summary",
+         "timeline": "Timeline",
+         "insight": "Key Insight",
+         "why": "Why It Matters",
+         "conclusion": "Conclusion",
      },
  }

@@ -183,20 +110,20 @@ def _labels(language: str) -> dict:
  # ─────────────────────────────────────────────────────────────────────────────
  # ─────────────────────────────────────────────────────────────────────────────

  class NoteGenerator:
-     """Generates premium structured study notes using Groq (Llama-3.3-70b-versatile)."""
+     """Generates structured study notes using Groq (Llama-3.3-70b-versatile)."""

      def __init__(self):
          self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
          self.client = Groq(api_key=self.api_key) if self.api_key else None
          self.model_id = "llama-3.3-70b-versatile"
-         logger.info(f"🚀 NoteGenerator v3.1 initialized — model: {self.model_id}")
+         logger.info(f"🚀 NoteGenerator v4.0 initialized — model: {self.model_id}")

      def _chat(self, system: str, user: str, max_tokens: int = 4096) -> Optional[str]:
          try:
              response = self.client.chat.completions.create(
                  model=self.model_id,
                  max_tokens=max_tokens,
-                 temperature=0.3,  # lower creativity to keep the output on-template
+                 temperature=0.3,
                  response_format={"type": "json_object"},
                  messages=[
                      {"role": "system", "content": system},
@@ -211,17 +138,15 @@ class NoteGenerator:
      def _get_error_json(self, error_msg: str) -> Dict:
          return {
              "title": "Error in Generation",
-             "quick_summary": f"Could not generate notes: {error_msg}",
-             "key_concepts": [],
-             "segments": [],
-             "keywords": [],
              "detected_language": "English",
-             "hook": "",
-             "category": "Education & Science",
-             "closing_thought": "",
+             "summary": f"Could not generate notes: {error_msg}",
+             "segments": [],
+             "conclusion": "",
+             "topics": [],
          }

      def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
+         """Generate structured JSON summary from transcript."""
          if not self.client:
              return self._get_error_json("Groq API Key missing.")

@@ -243,26 +168,8 @@ class NoteGenerator:
              logger.error(f"❌ Schema validation failed: {e}")
              return self._get_error_json(f"Validation Error: {str(e)}")

-     def extractActionItems(self, transcript_text: str, video_title: str) -> Dict:
-         if not self.client:
-             return {"action_items": []}
-
-         logger.info(f"✅ Action items extraction started via {self.model_id}")
-         user_prompt = _ACTIONS_USER.format(transcript=transcript_text[:20000])
-
-         raw = self._chat(_ACTIONS_SYSTEM, user_prompt, max_tokens=1024)
-         if raw is None:
-             return {"action_items": []}
-
-         try:
-             data = json.loads(raw)
-             validated = ActionItemsSchema(**data)
-             return validated.model_dump()
-         except (json.JSONDecodeError, ValidationError) as e:
-             logger.error(f"❌ Action items validation failed: {e}")
-             return {"action_items": []}
-
      def format_notes_to_markdown(self, json_notes: Dict) -> str:
+         """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
          lang = json_notes.get("detected_language", "English")
          L = _labels(lang)
          lines: list[str] = []
@@ -278,98 +185,43 @@ class NoteGenerator:
          lines.append("---")
          lines.append("")

-         # ── HOOK ──
-         hook = json_notes.get("hook", "")
-         if hook:
-             add(f"## ✨ {L['hook']}")
-             blank()
-             add(f"*{hook}*")
-             divider()
-
-         # ── SUMMARY (legacy fallback) ──
-         legacy_summary = json_notes.get("summary", "")
-         if legacy_summary and not json_notes.get("quick_summary"):
+         # ── OVERALL SUMMARY ──
+         summary = json_notes.get("summary", "")
+         if summary:
              add(f"## 📋 {L['summary']}")
              blank()
-             add(legacy_summary)
+             add(summary)
              divider()

-         # ── QUICK SUMMARY ──
-         quick_summary = json_notes.get("quick_summary", "")
-         if quick_summary:
-             add(f"## 📋 {L['summary']}")
-             blank()
-             add(quick_summary)
-             divider()
-
-         # ── SEGMENTS / CHAPTERS ──
+         # ── TIMELINE ──
          segments = json_notes.get("segments", [])
          if segments:
-             add(f"## 🧠 {L['core_topics']}")
+             add(f"## 🕐 {L['timeline']}")
              blank()
              for i, seg in enumerate(segments, start=1):
                  s_title = seg.get("title", "") if isinstance(seg, dict) else seg.title
-                 s_focus = seg.get("focus_topic", "") if isinstance(seg, dict) else seg.focus_topic
-                 s_points = seg.get("key_points", []) if isinstance(seg, dict) else seg.key_points
+                 s_summary = seg.get("summary", "") if isinstance(seg, dict) else seg.summary
+                 s_insight = seg.get("key_insight", "") if isinstance(seg, dict) else seg.key_insight
+                 s_why = seg.get("why_it_matters", "") if isinstance(seg, dict) else seg.why_it_matters

                  add(f"### {i}. {s_title}")
                  blank()
-                 add(f"*{s_focus}*")
-                 blank()
-                 for point in s_points:
-                     add(f"- {point}")
+                 add(s_summary)
                  blank()
+                 if s_insight:
+                     add(f"> **💎 {L['insight']}:** {s_insight}")
+                     blank()
+                 if s_why:
+                     add(f"> **{L['why']}** {s_why}")
+                     blank()
              divider()

-         # ── KEY CONCEPTS ──
-         key_concepts = json_notes.get("key_concepts", [])
-         if key_concepts:
-             add(f"## 💡 {L['concepts']}")
-             blank()
-             for concept in key_concepts:
-                 term = concept.get("term", "") if isinstance(concept, dict) else concept.term
-                 definition = concept.get("definition", "") if isinstance(concept, dict) else concept.definition
-                 importance = concept.get("importance", "") if isinstance(concept, dict) else concept.importance
-
-                 add(f"**{term}**")
-                 blank()
-                 add(definition)
-                 blank()
-                 if importance:
-                     add(f"> *{L['importance']}: {importance}*")
-                 blank()
-             divider()
-
-         # ── ACTION ITEMS ──
-         action_items = json_notes.get("action_items", [])
-         if action_items:
-             add(f"## 🎯 {L['actions']}")
-             blank()
-             for idx, item in enumerate(action_items, start=1):
-                 action = item.get("action", "") if isinstance(item, dict) else item.action
-                 rationale = item.get("rationale", "") if isinstance(item, dict) else item.rationale
-
-                 add(f"**{idx}. {action}**")
-                 blank()
-                 if rationale:
-                     add(f"> *{L['why']} {rationale}*")
-                 blank()
-             divider()
-
-         # ── KEYWORDS ──
-         keywords = json_notes.get("keywords", [])
-         if keywords:
-             add(f"## 🏷️ {L['tags']}")
+         # ── CONCLUSION ──
+         conclusion = json_notes.get("conclusion", "")
+         if conclusion:
+             add(f"## 🔖 {L['conclusion']}")
              blank()
-             add(" ".join([f"`{kw}`" for kw in keywords]))
-             divider()
-
-         # ── CLOSING THOUGHT ──
-         closing = json_notes.get("closing_thought", "")
-         if closing:
-             add(f"## 🔖 {L['closing']}")
-             blank()
-             add(f"> {closing}")
+             add(f"> {conclusion}")
              blank()

          return "\n".join(lines)
@@ -382,10 +234,11 @@ class NoteGenerator:
          duration: int,
      ) -> str:
          """
-         Wrap the formatted Markdown body with the video header.
-         Fixes the 00:00 duration bug.
+         Wrap the formatted Markdown body with Source + Duration header.
          """
-         # If the duration is zero or missing
+         lang_hint = "English"  # default for header
+         L = _labels(lang_hint)
+
          if duration and duration > 0:
              hours = int(duration // 3600)
              minutes = int((duration % 3600) // 60)
@@ -400,8 +253,8 @@ class NoteGenerator:
          header = (
              f"# {video_title}\n\n"
              f"---\n\n"
-             f"> **Source:** {video_url} \n"
-             f"> **Duration:** {duration_str}\n\n"
+             f"> **{L['source']}:** {video_url} \n"
+             f"> **{L['duration']}:** {duration_str}\n\n"
              f"---\n\n"
          )
          return header + notes
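
The hunks above keep the body of `generateSummary` mostly out of view; from the `❌ Schema validation failed` context line and the removed `extractActionItems` body, its parse-and-validate step presumably follows the same json.loads -> validate -> model_dump pattern. A sketch under that assumption, with `parse_and_validate` as a hypothetical stand-in, not the verbatim method:

```python
import json

from pydantic import ValidationError

from src.summarization.schemas import SummarySchema


def parse_and_validate(raw: str) -> dict:
    """Hypothetical sketch of the validation step implied by the hunk
    context; mirrors the removed extractActionItems pattern."""
    try:
        data = json.loads(raw)
        validated = SummarySchema(**data)
        return validated.model_dump()
    except (json.JSONDecodeError, ValidationError) as exc:
        # generateSummary falls back to _get_error_json here: same keys
        # as SummarySchema, but with empty content fields.
        return {
            "title": "Error in Generation",
            "detected_language": "English",
            "summary": f"Could not generate notes: {exc}",
            "segments": [],
            "conclusion": "",
            "topics": [],
        }
```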
src/summarization/schemas.py CHANGED

@@ -1,64 +1,58 @@
- from typing import Annotated, List, Literal
+ from typing import Annotated, List
  from pydantic import BaseModel, Field


  # ---------------------------------------------------------------------------
- # Segment / Chapter schema
+ # Timeline Segment schema
  # ---------------------------------------------------------------------------

  class SegmentSchema(BaseModel):
-     """Represents one chronological chapter/batch of the video."""
+     """Represents one chronological section of the video timeline."""

      title: str = Field(
          ...,
-         description="A short, descriptive title for this chronological segment of the video.",
+         description="A short, descriptive title for this chronological segment.",
      )
-     focus_topic: str = Field(
+     summary: str = Field(
          ...,
-         description=(
-             "A 1–2 sentence explanation of the central theme or argument"
-             " covered in this segment."
-         ),
+         description="A concise summary of what is covered in this segment (2-3 sentences).",
      )
-     key_points: List[str] = Field(
+     key_insight: str = Field(
          ...,
-         min_length=2,
-         description="3–6 concise bullet-point takeaways extracted from this segment.",
+         description="The single most important point or takeaway from this segment.",
+     )
+     why_it_matters: str = Field(
+         ...,
+         description="Brief explanation of the value or importance of this segment (1-2 sentences).",
      )


  # ---------------------------------------------------------------------------
- # Primary response schema
+ # Primary Summary response schema
  # ---------------------------------------------------------------------------

  class SummarySchema(BaseModel):
      """Top-level structured output returned by the LLM summarization call."""

-     title: str = Field(..., description="Inferred title of the video.")
+     title: str = Field(
+         ...,
+         description="Inferred title of the video in the transcript language.",
+     )

-     quick_summary: str = Field(
+     detected_language: str = Field(
          ...,
          description=(
-             "A concise, high-level paragraph (3–5 sentences) that explains"
-             " what the entire video is about, its main thesis, and its value"
-             " to the viewer."
+             "Detected language of the transcript, stated in English"
+             " (e.g. 'Arabic', 'English')."
          ),
      )

-     # STRICTLY ENFORCED — do NOT alter the allowed values.
-     category: Literal[
-         "Technology & AI",
-         "Business & Finance",
-         "Education & Science",
-         "Productivity & Self-Growth",
-         "News & Politics",
-         "Entertainment & Lifestyle",
-         "Health & Sports",
-     ] = Field(
+     summary: str = Field(
          ...,
          description=(
-             "You MUST categorize the content into EXACTLY one of the 7 provided"
-             " categories. Do not use any other label."
+             "A concise, high-level paragraph (3-5 sentences) that explains"
+             " what the entire video is about, its main thesis, and its value"
+             " to the viewer."
          ),
      )

@@ -68,25 +62,23 @@ class SummarySchema(BaseModel):
              min_length=3,
              max_length=7,
              description=(
-                 "Chronological chapters of the video. Standard videos MUST have"
-                 " 3–5 segments; exceptionally long or dense videos may use up to 7."
-                 " Segments must follow the natural progression of the transcript."
+                 "Chronological timeline sections of the video. Minimum 3,"
+                 " maximum 7. Must follow the natural progression of the transcript."
              ),
          ),
      ]

-     keywords: List[str] = Field(
+     conclusion: str = Field(
          ...,
-         description="5–10 relevant topic tags for categorization and recommendations.",
+         description="A final overall takeaway or closing conclusion in the transcript language.",
      )

-
- # ---------------------------------------------------------------------------
- # Action-items schema (unchanged)
- # ---------------------------------------------------------------------------
-
- class ActionItemsSchema(BaseModel):
-     action_items: List[str] = Field(
+     # Hidden metadata — not rendered in markdown, used by downstream modules.
+     topics: List[str] = Field(
          ...,
-         description="Actionable takeaways or tasks derived from the content.",
+         min_length=1,
+         description=(
+             "Dynamically extracted topics discussed in the video."
+             " Examples: ['Python', 'Machine Learning', 'Neural Networks']."
+         ),
      )
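
One consequence of the `Annotated` segments field worth spelling out: the 3-7 range is enforced at validation time, not merely documented. A short sketch:

```python
from pydantic import ValidationError

from src.summarization.schemas import SegmentSchema, SummarySchema

seg = SegmentSchema(
    title="Setup",
    summary="Installing Python and creating a project.",
    key_insight="Use a virtual environment from day one.",
    why_it_matters="It keeps project dependencies isolated.",
)

base = dict(
    title="Introduction to Python",
    detected_language="English",
    summary="A short overview of the course.",
    conclusion="Practice a little every day.",
    topics=["Python"],
)

SummarySchema(**base, segments=[seg] * 3)  # accepted: 3 is the minimum

try:
    SummarySchema(**base, segments=[seg] * 2)  # rejected: min_length=3
except ValidationError as exc:
    # Under pydantic v2 the length violation reports as 'too_short'.
    print(exc.errors()[0]["type"])
```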