ALI7ADEL committed on
Commit
a5568e2
·
verified ·
1 Parent(s): 4eadcae

Update src/summarization/note_generator.py

Browse files
Files changed (1) hide show
  1. src/summarization/note_generator.py +110 -225
src/summarization/note_generator.py CHANGED
@@ -1,27 +1,5 @@
1
- """
2
- AIdea - Premium Note Generator v3.0
3
- =====================================
4
- Drop-in upgrade for src/summarization/note_generator.py
5
-
6
- Preserved from original:
7
- - NoteGenerator class with same __init__ signature (reads GROQ_API_KEY from env)
8
- - generateSummary(transcript_text, video_title) -> Dict
9
- - extractActionItems(transcript_text, video_title) -> Dict
10
- - format_notes_to_markdown(json_notes) -> str
11
- - format_final_notes(notes, video_title, video_url, duration) -> str
12
- - _get_error_json(error_msg) -> Dict
13
- - setup_logger / settings imports preserved
14
-
15
- Upgraded:
16
- - New premium Pydantic schemas (SummarySchema, ActionItemsSchema updated in-place)
17
- - Auto language detection — all output strictly in transcript language
18
- - Core Topics / Batches replacing single flat summary
19
- - Rich Markdown: emojis, blockquotes, bold/italic, proper spacing
20
- - Flutter flutter_markdown optimized structure
21
- - Separated system/user prompt for better LLM instruction-following
22
- """
23
-
24
  import json
 
25
  import os
26
  from typing import Dict, List, Optional
27
 
@@ -29,11 +7,11 @@ from groq import Groq
29
  from pydantic import BaseModel, Field, ValidationError
30
 
31
  from src.utils.logger import setup_logger
32
- from src.utils.config import settings
 
33
 
34
  logger = setup_logger(__name__)
35
 
36
-
37
  # ─────────────────────────────────────────────────────────────────────────────
38
  # PYDANTIC SCHEMAS
39
  # ─────────────────────────────────────────────────────────────────────────────
@@ -43,75 +21,38 @@ class KeyConceptSchema(BaseModel):
43
  definition: str = Field(description="Clear, detailed explanation (2-4 sentences)")
44
  importance: str = Field(description="Why this concept matters in context (1-2 sentences)")
45
 
46
-
47
  class CoreTopicSchema(BaseModel):
48
- """One 'batch' / major topic section of the video."""
49
  title: str = Field(description="Short descriptive title for this topic batch")
50
  overview: str = Field(description="Rich paragraph overview of this topic (3-6 sentences, NOT bullet list)")
51
  key_points: List[str] = Field(description="3-6 detailed bullet points under this topic")
52
- insight: Optional[str] = Field(
53
- default=None,
54
- description="A notable insight, tip, or important warning for this topic"
55
- )
56
-
57
 
58
  class TimestampSchema(BaseModel):
59
  timestamp: str = Field(description="Timestamp in MM:SS format")
60
  topic: str = Field(description="Short title for this segment")
61
  summary: str = Field(description="1-2 sentence description of what happens at this point")
62
 
63
-
64
  class ActionItemEntrySchema(BaseModel):
65
  action: str = Field(description="A specific, actionable takeaway")
66
  rationale: str = Field(description="Why the viewer should do this (1 sentence)")
67
 
68
-
69
  class SummarySchema(BaseModel):
70
- """
71
- Main schema returned by generateSummary().
72
- Backward-compatible field names kept (title, summary, key_concepts,
73
- timestamps, keywords) plus new premium fields.
74
- """
75
- # ── Existing fields (kept for backward compat) ───────────────────────────
76
  title: str = Field(description="Inferred video title in the transcript language")
77
- summary: str = Field(
78
- description=(
79
- "Rich executive summary in the transcript language. "
80
- "4-6 sentences covering the main thesis, context, and value."
81
- )
82
- )
83
- key_concepts: List[KeyConceptSchema] = Field(
84
- description="3-8 key concepts/terms with definition AND importance"
85
- )
86
- timestamps: List[TimestampSchema] = Field(
87
- description="5-10 timeline entries covering the video's major segments"
88
- )
89
  keywords: List[str] = Field(description="5-10 relevant topic tags")
90
-
91
- # ── New premium fields ───────────────────────────────────────────────────
92
- detected_language: str = Field(
93
- description="Detected language of the transcript, stated in English (e.g. 'Arabic', 'English')"
94
- )
95
- hook: str = Field(
96
- description="A compelling 2-3 sentence teaser in the transcript language"
97
- )
98
- core_topics: List[CoreTopicSchema] = Field(
99
- description="2-5 major topic batches that organize the video content"
100
- )
101
- closing_thought: str = Field(
102
- description="A motivating or thought-provoking closing sentence in the transcript language"
103
- )
104
-
105
 
106
  class ActionItemsSchema(BaseModel):
107
- """Schema returned by extractActionItems()."""
108
- action_items: List[ActionItemEntrySchema] = Field(
109
- description="3-6 specific actionable takeaways, each with action + rationale"
110
- )
111
 
112
 
113
  # ─────────────────────────────────────────────────────────────────────────────
114
- # PROMPT TEMPLATES
115
  # ─────────────────────────────────────────────────────────────────────────────
116
 
117
  _SUMMARY_SYSTEM = """
@@ -120,24 +61,44 @@ Transform raw video transcripts into structured, deeply insightful JSON notes.
120
 
121
  LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
122
  - Detect the primary language of the transcript.
123
- - Every single field in your JSON — titles, overviews, bullet points, insights,
124
- closing thoughts — MUST be written entirely in that SAME detected language.
125
- - Do NOT mix languages. Arabic transcript → everything in Arabic. English → everything in English.
126
  - Only the `detected_language` field itself is stated in English (e.g. "Arabic").
127
 
128
- CONTENT RULES:
129
- 1. `summary`: Write a rich flowing paragraph (4-6 sentences). Synthesize the thesis — do NOT list.
130
- 2. `core_topics`: Divide into 2-5 logical batches. Each needs:
131
- - A descriptive `title`
132
- - A rich `overview` paragraph (3-6 sentences, NOT bullets)
133
- - 3-6 detailed `key_points` bullet strings
134
- - Optional `insight`: a notable tip or important warning
135
- 3. `key_concepts`: For each term include `definition` AND `importance`.
136
- 4. `timestamps`: 5-10 meaningful entries with descriptive `topic` and `summary`.
137
- 5. Be SPECIFIC — reference actual names, numbers, examples from the transcript.
138
- 6. Be INSIGHTFUL — add synthesis, not just repetition.
139
-
140
- OUTPUT: Return ONLY a valid JSON object. No markdown fences, no preamble, no extra text.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  """.strip()
142
 
143
  _SUMMARY_USER = """
@@ -146,36 +107,36 @@ Video Title: {video_title}
146
  TRANSCRIPT:
147
  {transcript}
148
 
149
- Analyze thoroughly. Detect the language. Return the structured JSON notes — all content in the transcript's language.
150
  """.strip()
151
 
152
  _ACTIONS_SYSTEM = """
153
  You are an expert at extracting actionable insights from educational content.
154
 
155
  LANGUAGE RULE — CRITICAL:
156
- - Detect the primary language of the transcript.
157
- - Every `action` and `rationale` MUST be in that SAME detected language.
158
- - Do NOT mix languages.
159
-
160
- Return ONLY a valid JSON object:
161
- { "action_items": [ { "action": "...", "rationale": "..." } ] }
162
-
163
- Rules:
164
- - 3-6 specific, concrete action items (not vague)
165
- - Each `rationale` explains WHY the viewer should do this (1 sentence)
166
- - Be SPECIFIC — reference content from the transcript
167
  """.strip()
168
 
169
  _ACTIONS_USER = """
170
  TRANSCRIPT:
171
  {transcript}
172
 
173
- Extract actionable takeaways. Return JSON only.
174
  """.strip()
175
 
176
 
177
  # ─────────────────────────────────────────────────────────────────────────────
178
- # LANGUAGE LABEL MAPS
179
  # ─────────────────────────────────────────────────────────────────────────────
180
 
181
  _LABELS = {
@@ -212,7 +173,6 @@ _LABELS = {
212
  }
213
 
214
  def _labels(language: str) -> dict:
215
- """Return section labels for the detected language, defaulting to English."""
216
  return _LABELS.get(language, _LABELS["English"])
217
 
218
 
@@ -221,31 +181,20 @@ def _labels(language: str) -> dict:
221
  # ─────────────────────────────────────────────────────────────────────────────
222
 
223
  class NoteGenerator:
224
- """
225
- Generates premium structured study notes using Groq (Llama-3.3-70b-versatile).
226
-
227
- Public API (unchanged from v1):
228
- generateSummary(transcript_text, video_title) -> Dict
229
- extractActionItems(transcript_text, video_title) -> Dict
230
- format_notes_to_markdown(json_notes) -> str
231
- format_final_notes(notes, video_title, video_url, duration) -> str
232
- """
233
 
234
  def __init__(self):
235
  self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
236
  self.client = Groq(api_key=self.api_key) if self.api_key else None
237
  self.model_id = "llama-3.3-70b-versatile"
238
- logger.info(f"πŸš€ NoteGenerator v3.0 initialized β€” model: {self.model_id}")
239
-
240
- # ── Private helpers ───────────────────────────────────────────────────────
241
 
242
  def _chat(self, system: str, user: str, max_tokens: int = 4096) -> Optional[str]:
243
- """Make a Groq API call and return raw content string, or None on failure."""
244
  try:
245
  response = self.client.chat.completions.create(
246
  model=self.model_id,
247
  max_tokens=max_tokens,
248
- temperature=0.4,
249
  response_format={"type": "json_object"},
250
  messages=[
251
  {"role": "system", "content": system},
@@ -270,18 +219,11 @@ class NoteGenerator:
270
  "closing_thought": "",
271
  }
272
 
273
- # ── Core generation methods ───────────────────────────────────────────────
274
-
275
  def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
276
- """
277
- Generate premium structured notes from transcript.
278
- Returns a dict matching SummarySchema (all content in transcript language).
279
- """
280
  if not self.client:
281
  return self._get_error_json("Groq API Key missing.")
282
 
283
- logger.info(f"πŸ“ Summary generation started β€” model: {self.model_id}")
284
-
285
  user_prompt = _SUMMARY_USER.format(
286
  video_title=video_title,
287
  transcript=transcript_text[:30000],
@@ -294,26 +236,16 @@ class NoteGenerator:
294
  try:
295
  data = json.loads(raw)
296
  validated = SummarySchema(**data)
297
- logger.info(
298
- f"βœ… Summary done β€” lang: {validated.detected_language}, "
299
- f"topics: {len(validated.core_topics)}, "
300
- f"concepts: {len(validated.key_concepts)}"
301
- )
302
  return validated.model_dump()
303
  except (json.JSONDecodeError, ValidationError) as e:
304
  logger.error(f"❌ Schema validation failed: {e}")
305
- return self._get_error_json(str(e))
306
 
307
  def extractActionItems(self, transcript_text: str, video_title: str) -> Dict:
308
- """
309
- Extract rich action items (action + rationale) from transcript.
310
- Returns a dict: { "action_items": [ {"action": ..., "rationale": ...} ] }
311
- """
312
  if not self.client:
313
  return {"action_items": []}
314
 
315
- logger.info(f"βœ… Action items extraction started β€” model: {self.model_id}")
316
-
317
  user_prompt = _ACTIONS_USER.format(transcript=transcript_text[:20000])
318
 
319
  raw = self._chat(_ACTIONS_SYSTEM, user_prompt, max_tokens=1024)
@@ -323,28 +255,12 @@ class NoteGenerator:
323
  try:
324
  data = json.loads(raw)
325
  validated = ActionItemsSchema(**data)
326
- logger.info(f"βœ… Action items done β€” count: {len(validated.action_items)}")
327
  return validated.model_dump()
328
  except (json.JSONDecodeError, ValidationError) as e:
329
  logger.error(f"❌ Action items validation failed: {e}")
330
  return {"action_items": []}
331
 
332
- # ── Formatting methods ────────────────────────────────────────────────────
333
-
334
  def format_notes_to_markdown(self, json_notes: Dict) -> str:
335
- """
336
- Convert the merged Dict from generateSummary() + extractActionItems()
337
- into premium Flutter-optimized Markdown.
338
-
339
- Flutter MarkdownStyleSheet targets:
340
- # β†’ h1Style (hero title, primary color)
341
- ## β†’ h2Style (section headers, accent color, bold)
342
- ### β†’ h3Style (topic batch titles, secondary color)
343
- > β†’ blockquoteDecoration (light card background, left border)
344
- ** β†’ bold emphasis
345
- * β†’ italic / lighter gray text
346
- ` ` β†’ codeStyle (monospace font for timestamps)
347
- """
348
  lang = json_notes.get("detected_language", "English")
349
  L = _labels(lang)
350
  lines: list[str] = []
@@ -360,7 +276,7 @@ class NoteGenerator:
360
  lines.append("---")
361
  lines.append("")
362
 
363
- # ── HOOK / TEASER ─────────────────────────────────────────────────────
364
  hook = json_notes.get("hook", "")
365
  if hook:
366
  add(f"## ✨ {L['hook']}")
@@ -368,7 +284,7 @@ class NoteGenerator:
368
  add(f"*{hook}*")
369
  divider()
370
 
371
- # ── EXECUTIVE SUMMARY ─────────────────────────────────────────────────
372
  summary = json_notes.get("summary", "")
373
  if summary:
374
  add(f"## πŸ“‹ {L['summary']}")
@@ -376,23 +292,16 @@ class NoteGenerator:
376
  add(summary)
377
  divider()
378
 
379
- # ── CORE TOPICS / BATCHES ─────────────────────────────────────────────
380
  core_topics = json_notes.get("core_topics", [])
381
  if core_topics:
382
  add(f"## 🧠 {L['core_topics']}")
383
  blank()
384
  for i, topic in enumerate(core_topics, start=1):
385
- # Support both dict (model_dump output) and Pydantic objects
386
- if isinstance(topic, dict):
387
- t_title = topic.get("title", "")
388
- t_overview = topic.get("overview", "")
389
- t_points = topic.get("key_points", [])
390
- t_insight = topic.get("insight")
391
- else:
392
- t_title = topic.title
393
- t_overview = topic.overview
394
- t_points = topic.key_points
395
- t_insight = topic.insight
396
 
397
  add(f"### {i}. {t_title}")
398
  blank()
@@ -402,27 +311,19 @@ class NoteGenerator:
402
  add(f"- {point}")
403
  blank()
404
  if t_insight:
405
- add(f"> {L['insight']}")
406
- add(f"> {t_insight}")
407
  blank()
 
408
 
409
- add("---")
410
- blank()
411
-
412
- # ── KEY CONCEPTS ──────────────────────────────────────────────────────
413
  key_concepts = json_notes.get("key_concepts", [])
414
  if key_concepts:
415
  add(f"## πŸ’‘ {L['concepts']}")
416
  blank()
417
  for concept in key_concepts:
418
- if isinstance(concept, dict):
419
- term = concept.get("term", "")
420
- definition = concept.get("definition", "")
421
- importance = concept.get("importance", "")
422
- else:
423
- term = concept.term
424
- definition = concept.definition
425
- importance = concept.importance
426
 
427
  add(f"**{term}**")
428
  blank()
@@ -431,60 +332,40 @@ class NoteGenerator:
431
  if importance:
432
  add(f"> *{L['importance']}: {importance}*")
433
  blank()
 
434
 
435
- add("---")
436
- blank()
437
-
438
- # ── TIMELINE ──────────────────────────────────────────────────────────
439
  timestamps = json_notes.get("timestamps", [])
440
  if timestamps:
441
  add(f"## ⏱️ {L['timeline']}")
442
  blank()
443
  for entry in timestamps:
444
- if isinstance(entry, dict):
445
- ts = entry.get("timestamp", "")
446
- topic = entry.get("topic", "")
447
- ts_summary = entry.get("summary", "")
448
- else:
449
- ts = entry.timestamp
450
- topic = entry.topic
451
- ts_summary = entry.summary
452
-
453
- # Two-space trailing = line break in Markdown (inline subtitle)
454
- add(f"- **`{ts}`** β€” **{topic}** ")
455
  add(f" {ts_summary}")
 
456
  divider()
457
 
458
- # ── ACTION ITEMS ──────────────────────────────────────────────────────
459
  action_items = json_notes.get("action_items", [])
460
  if action_items:
461
  add(f"## 🎯 {L['actions']}")
462
  blank()
463
  for idx, item in enumerate(action_items, start=1):
464
- if isinstance(item, str):
465
- # Backward compat: old plain-string format
466
- add(f"**{idx}.** {item}")
467
- blank()
468
- elif isinstance(item, dict):
469
- action = item.get("action", "")
470
- rationale = item.get("rationale", "")
471
- add(f"**{idx}. {action}**")
472
- blank()
473
- if rationale:
474
- add(f"> *{L['why']} {rationale}*")
475
- blank()
476
- else:
477
- # Pydantic object
478
- add(f"**{idx}. {item.action}**")
479
- blank()
480
- if item.rationale:
481
- add(f"> *{L['why']} {item.rationale}*")
482
- blank()
483
-
484
- add("---")
485
- blank()
486
 
487
- # ── KEYWORDS / TAGS ───────────────────────────────────────────────────
488
  keywords = json_notes.get("keywords", [])
489
  if keywords:
490
  add(f"## 🏷️ {L['tags']}")
@@ -492,7 +373,7 @@ class NoteGenerator:
492
  add(" ".join([f"`{kw}`" for kw in keywords]))
493
  divider()
494
 
495
- # ── CLOSING THOUGHT ───────────────────────────────────────────────────
496
  closing = json_notes.get("closing_thought", "")
497
  if closing:
498
  add(f"## πŸ”– {L['closing']}")
@@ -511,11 +392,15 @@ class NoteGenerator:
511
  ) -> str:
512
  """
513
  Wrap the formatted Markdown body with the video header.
514
- `duration` is in seconds (int), matching original signature.
515
  """
516
- minutes = int(duration // 60)
517
- secs = int(duration % 60)
518
- duration_str = f"{minutes:02d}:{secs:02d}"
 
 
 
 
519
 
520
  header = (
521
  f"# {video_title}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
+ import logging
3
  import os
4
  from typing import Dict, List, Optional
5
 
 
7
  from pydantic import BaseModel, Field, ValidationError
8
 
9
  from src.utils.logger import setup_logger
10
+ # Make sure the settings import path is correct for your project
11
+ from src.utils.config import settings
12
 
13
  logger = setup_logger(__name__)
14
 
 
15
  # ─────────────────────────────────────────────────────────────────────────────
16
  # PYDANTIC SCHEMAS
17
  # ─────────────────────────────────────────────────────────────────────────────
 
21
  definition: str = Field(description="Clear, detailed explanation (2-4 sentences)")
22
  importance: str = Field(description="Why this concept matters in context (1-2 sentences)")
23
 
 
24
  class CoreTopicSchema(BaseModel):
 
25
  title: str = Field(description="Short descriptive title for this topic batch")
26
  overview: str = Field(description="Rich paragraph overview of this topic (3-6 sentences, NOT bullet list)")
27
  key_points: List[str] = Field(description="3-6 detailed bullet points under this topic")
28
+ insight: Optional[str] = Field(default=None, description="A notable insight, tip, or important warning for this topic")
 
 
 
 
29
 
30
  class TimestampSchema(BaseModel):
31
  timestamp: str = Field(description="Timestamp in MM:SS format")
32
  topic: str = Field(description="Short title for this segment")
33
  summary: str = Field(description="1-2 sentence description of what happens at this point")
34
 
 
35
  class ActionItemEntrySchema(BaseModel):
36
  action: str = Field(description="A specific, actionable takeaway")
37
  rationale: str = Field(description="Why the viewer should do this (1 sentence)")
38
 
 
39
  class SummarySchema(BaseModel):
 
 
 
 
 
 
40
  title: str = Field(description="Inferred video title in the transcript language")
41
+ detected_language: str = Field(description="Detected language of the transcript, stated in English (e.g. 'Arabic', 'English')")
42
+ hook: str = Field(description="A compelling 2-3 sentence teaser in the transcript language")
43
+ summary: str = Field(description="Rich executive summary. 4-6 sentences covering the main thesis, context, and value.")
44
+ core_topics: List[CoreTopicSchema] = Field(description="2-5 major topic batches that organize the video content")
45
+ key_concepts: List[KeyConceptSchema] = Field(description="3-8 key concepts/terms with definition AND importance")
46
+ timestamps: List[TimestampSchema] = Field(description="5-10 timeline entries covering the video's major segments")
 
 
 
 
 
 
47
  keywords: List[str] = Field(description="5-10 relevant topic tags")
48
+ closing_thought: str = Field(description="A motivating or thought-provoking closing sentence in the transcript language")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  class ActionItemsSchema(BaseModel):
51
+ action_items: List[ActionItemEntrySchema] = Field(description="3-6 specific actionable takeaways, each with action + rationale")
 
 
 
52
 
53
 
54
  # ─────────────────────────────────────────────────────────────────────────────
55
+ # PROMPT TEMPLATES (STRICT JSON ENFORCEMENT)
56
  # ─────────────────────────────────────────────────────────────────────────────
57
 
58
  _SUMMARY_SYSTEM = """
 
61
 
62
  LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
63
  - Detect the primary language of the transcript.
64
+ - Every single field MUST be written entirely in that SAME detected language.
65
+ - Do NOT mix languages. Arabic transcript -> everything in Arabic.
 
66
  - Only the `detected_language` field itself is stated in English (e.g. "Arabic").
67
 
68
+ CRITICAL: YOU MUST RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
69
+ DO NOT CHANGE, OMIT, OR RENAME ANY KEYS (e.g., never use "time" instead of "timestamp").
70
+ {
71
+ "title": "Inferred video title",
72
+ "detected_language": "English (or Arabic, etc.)",
73
+ "hook": "A compelling 2-3 sentence teaser",
74
+ "summary": "Rich flowing paragraph (4-6 sentences)",
75
+ "core_topics": [
76
+ {
77
+ "title": "Topic title",
78
+ "overview": "Rich paragraph overview (NOT bullet points)",
79
+ "key_points": ["point 1", "point 2"],
80
+ "insight": "Notable tip or warning"
81
+ }
82
+ ],
83
+ "key_concepts": [
84
+ {
85
+ "term": "Concept name",
86
+ "definition": "Clear explanation",
87
+ "importance": "Why it matters"
88
+ }
89
+ ],
90
+ "timestamps": [
91
+ {
92
+ "timestamp": "MM:SS",
93
+ "topic": "Segment title",
94
+ "summary": "What happens here"
95
+ }
96
+ ],
97
+ "keywords": ["tag1", "tag2"],
98
+ "closing_thought": "Motivating closing sentence"
99
+ }
100
+
101
+ OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
102
  """.strip()
103
 
104
  _SUMMARY_USER = """
 
107
  TRANSCRIPT:
108
  {transcript}
109
 
110
+ Analyze thoroughly. Detect the language. Return ONLY the exact JSON structure requested.
111
  """.strip()
112
 
113
  _ACTIONS_SYSTEM = """
114
  You are an expert at extracting actionable insights from educational content.
115
 
116
  LANGUAGE RULE — CRITICAL:
117
+ - Detect the primary language of the transcript and output entirely in that language.
118
+
119
+ CRITICAL: Return ONLY a valid JSON object EXACTLY matching this structure:
120
+ {
121
+ "action_items": [
122
+ {
123
+ "action": "A specific, actionable takeaway",
124
+ "rationale": "Why the viewer should do this"
125
+ }
126
+ ]
127
+ }
128
  """.strip()
129
 
130
  _ACTIONS_USER = """
131
  TRANSCRIPT:
132
  {transcript}
133
 
134
+ Extract actionable takeaways. Return JSON only matching the requested structure.
135
  """.strip()
136
 
137
 
138
  # ─────────────────────────────────────────────────────────────────────────────
139
+ # LANGUAGE LABEL MAPS (For UI localization)
140
  # ─────────────────────────────────────────────────────────────────────────────
141
 
142
  _LABELS = {
 
173
  }
174
 
175
  def _labels(language: str) -> dict:
 
176
  return _LABELS.get(language, _LABELS["English"])
177
 
178
 
 
181
  # ─────────────────────────────────────────────────────────────────────────────
182
 
183
  class NoteGenerator:
184
+ """Generates premium structured study notes using Groq (Llama-3.3-70b-versatile)."""
 
 
 
 
 
 
 
 
185
 
186
  def __init__(self):
187
  self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
188
  self.client = Groq(api_key=self.api_key) if self.api_key else None
189
  self.model_id = "llama-3.3-70b-versatile"
190
+ logger.info(f"πŸš€ NoteGenerator v3.1 initialized β€” model: {self.model_id}")
 
 
191
 
192
  def _chat(self, system: str, user: str, max_tokens: int = 4096) -> Optional[str]:
 
193
  try:
194
  response = self.client.chat.completions.create(
195
  model=self.model_id,
196
  max_tokens=max_tokens,
197
+ temperature=0.3, # Lower creativity so the model sticks to the template
198
  response_format={"type": "json_object"},
199
  messages=[
200
  {"role": "system", "content": system},
 
219
  "closing_thought": "",
220
  }
221
 
 
 
222
  def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
 
 
 
 
223
  if not self.client:
224
  return self._get_error_json("Groq API Key missing.")
225
 
226
+ logger.info(f"πŸ“ Summary generation started via {self.model_id}")
 
227
  user_prompt = _SUMMARY_USER.format(
228
  video_title=video_title,
229
  transcript=transcript_text[:30000],
 
236
  try:
237
  data = json.loads(raw)
238
  validated = SummarySchema(**data)
 
 
 
 
 
239
  return validated.model_dump()
240
  except (json.JSONDecodeError, ValidationError) as e:
241
  logger.error(f"❌ Schema validation failed: {e}")
242
+ return self._get_error_json(f"Validation Error: {str(e)}")
243
 
244
  def extractActionItems(self, transcript_text: str, video_title: str) -> Dict:
 
 
 
 
245
  if not self.client:
246
  return {"action_items": []}
247
 
248
+ logger.info(f"βœ… Action items extraction started via {self.model_id}")
 
249
  user_prompt = _ACTIONS_USER.format(transcript=transcript_text[:20000])
250
 
251
  raw = self._chat(_ACTIONS_SYSTEM, user_prompt, max_tokens=1024)
 
255
  try:
256
  data = json.loads(raw)
257
  validated = ActionItemsSchema(**data)
 
258
  return validated.model_dump()
259
  except (json.JSONDecodeError, ValidationError) as e:
260
  logger.error(f"❌ Action items validation failed: {e}")
261
  return {"action_items": []}
262
 
 
 
263
  def format_notes_to_markdown(self, json_notes: Dict) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  lang = json_notes.get("detected_language", "English")
265
  L = _labels(lang)
266
  lines: list[str] = []
 
276
  lines.append("---")
277
  lines.append("")
278
 
279
+ # ── HOOK ──
280
  hook = json_notes.get("hook", "")
281
  if hook:
282
  add(f"## ✨ {L['hook']}")
 
284
  add(f"*{hook}*")
285
  divider()
286
 
287
+ # ── SUMMARY ──
288
  summary = json_notes.get("summary", "")
289
  if summary:
290
  add(f"## πŸ“‹ {L['summary']}")
 
292
  add(summary)
293
  divider()
294
 
295
+ # ── CORE TOPICS ──
296
  core_topics = json_notes.get("core_topics", [])
297
  if core_topics:
298
  add(f"## 🧠 {L['core_topics']}")
299
  blank()
300
  for i, topic in enumerate(core_topics, start=1):
301
+ t_title = topic.get("title", "") if isinstance(topic, dict) else topic.title
302
+ t_overview = topic.get("overview", "") if isinstance(topic, dict) else topic.overview
303
+ t_points = topic.get("key_points", []) if isinstance(topic, dict) else topic.key_points
304
+ t_insight = topic.get("insight") if isinstance(topic, dict) else topic.insight
 
 
 
 
 
 
 
305
 
306
  add(f"### {i}. {t_title}")
307
  blank()
 
311
  add(f"- {point}")
312
  blank()
313
  if t_insight:
314
+ add(f"> **{L['insight']}:** {t_insight}")
 
315
  blank()
316
+ divider()
317
 
318
+ # ── KEY CONCEPTS ──
 
 
 
319
  key_concepts = json_notes.get("key_concepts", [])
320
  if key_concepts:
321
  add(f"## πŸ’‘ {L['concepts']}")
322
  blank()
323
  for concept in key_concepts:
324
+ term = concept.get("term", "") if isinstance(concept, dict) else concept.term
325
+ definition = concept.get("definition", "") if isinstance(concept, dict) else concept.definition
326
+ importance = concept.get("importance", "") if isinstance(concept, dict) else concept.importance
 
 
 
 
 
327
 
328
  add(f"**{term}**")
329
  blank()
 
332
  if importance:
333
  add(f"> *{L['importance']}: {importance}*")
334
  blank()
335
+ divider()
336
 
337
+ # ── TIMELINE ──
 
 
 
338
  timestamps = json_notes.get("timestamps", [])
339
  if timestamps:
340
  add(f"## ⏱️ {L['timeline']}")
341
  blank()
342
  for entry in timestamps:
343
+ ts = entry.get("timestamp", "") if isinstance(entry, dict) else entry.timestamp
344
+ topic = entry.get("topic", "") if isinstance(entry, dict) else entry.topic
345
+ ts_summary = entry.get("summary", "") if isinstance(entry, dict) else entry.summary
346
+
347
+ add(f"- **`{ts}`** β€” **{topic}** ")
 
 
 
 
 
 
348
  add(f" {ts_summary}")
349
+ blank()
350
  divider()
351
 
352
+ # ── ACTION ITEMS ──
353
  action_items = json_notes.get("action_items", [])
354
  if action_items:
355
  add(f"## 🎯 {L['actions']}")
356
  blank()
357
  for idx, item in enumerate(action_items, start=1):
358
+ action = item.get("action", "") if isinstance(item, dict) else item.action
359
+ rationale = item.get("rationale", "") if isinstance(item, dict) else item.rationale
360
+
361
+ add(f"**{idx}. {action}**")
362
+ blank()
363
+ if rationale:
364
+ add(f"> *{L['why']} {rationale}*")
365
+ blank()
366
+ divider()
 
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
+ # ── KEYWORDS ──
369
  keywords = json_notes.get("keywords", [])
370
  if keywords:
371
  add(f"## 🏷️ {L['tags']}")
 
373
  add(" ".join([f"`{kw}`" for kw in keywords]))
374
  divider()
375
 
376
+ # ── CLOSING THOUGHT ──
377
  closing = json_notes.get("closing_thought", "")
378
  if closing:
379
  add(f"## πŸ”– {L['closing']}")
 
392
  ) -> str:
393
  """
394
  Wrap the formatted Markdown body with the video header.
395
+ Fixes the 00:00 duration bug.
396
  """
397
+ # If the duration is zero or missing
398
+ if duration and duration > 0:
399
+ minutes = int(duration // 60)
400
+ secs = int(duration % 60)
401
+ duration_str = f"{minutes:02d}:{secs:02d}"
402
+ else:
403
+ duration_str = "N/A (Auto-generated)"
404
 
405
  header = (
406
  f"# {video_title}\n\n"