amanyelfiky committed on
Commit
c00d17d
·
1 Parent(s): 3d4b8b5
pyproject.toml CHANGED
@@ -11,13 +11,13 @@ dependencies = [
11
  "email-validator>=2.3.0",
12
  "fastapi==0.109.0",
13
  "google-api-python-client==2.115.0",
14
- "groq>=0.9.0",
15
  "google-genai>=1.2.0",
16
  "google-generativeai==0.3.2",
17
  "greenlet==3.3.1",
18
  "httpx==0.26.0",
19
  "langchain==0.1.0",
20
  "langchain-google-genai==0.0.5",
 
21
  "openai-whisper==20250625",
22
  "passlib[bcrypt]==1.7.4",
23
  "pydantic-core==2.41.5",
@@ -27,9 +27,11 @@ dependencies = [
27
  "python-dotenv==1.0.0",
28
  "python-jose[cryptography]==3.3.0",
29
  "python-multipart==0.0.6",
 
30
  "sqlmodel==0.0.14",
31
  "torch>=2.10.0",
32
  "torchaudio>=2.10.0",
 
33
  "uvicorn[standard]==0.27.0",
34
  ]
35
 
 
11
  "email-validator>=2.3.0",
12
  "fastapi==0.109.0",
13
  "google-api-python-client==2.115.0",
 
14
  "google-genai>=1.2.0",
15
  "google-generativeai==0.3.2",
16
  "greenlet==3.3.1",
17
  "httpx==0.26.0",
18
  "langchain==0.1.0",
19
  "langchain-google-genai==0.0.5",
20
+ "langdetect>=1.0.9",
21
  "openai-whisper==20250625",
22
  "passlib[bcrypt]==1.7.4",
23
  "pydantic-core==2.41.5",
 
27
  "python-dotenv==1.0.0",
28
  "python-jose[cryptography]==3.3.0",
29
  "python-multipart==0.0.6",
30
+ "sentencepiece>=0.2.0",
31
  "sqlmodel==0.0.14",
32
  "torch>=2.10.0",
33
  "torchaudio>=2.10.0",
34
+ "transformers>=4.40.0",
35
  "uvicorn[standard]==0.27.0",
36
  ]
37
 
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  # --- YouTube Transcription Pipeline (The Waterfall Strategy) ---
2
  assemblyai>=0.30.0
3
  yt-dlp>=2025.05.22
@@ -48,3 +49,51 @@ pytubefix
48
  # --- ML & Recommendations ---
49
  # keybert
50
  # sentence-transformers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- Kept from HEAD (merge-conflict marker "<<<<<<< HEAD" removed; reconcile duplicates with the merged section below) ---
2
  # --- YouTube Transcription Pipeline (The Waterfall Strategy) ---
3
  assemblyai>=0.30.0
4
  yt-dlp>=2025.05.22
 
49
  # --- ML & Recommendations ---
50
  # keybert
51
  # sentence-transformers
52
+ # --- Merged in from b5ab912 (Mt5) (merge-conflict marker "=======" removed) ---
53
+ # --- YouTube Transcription Pipeline (The Waterfall Strategy) ---
54
+ assemblyai>=0.30.0
55
+ yt-dlp>=2025.05.22
56
+ bgutil-ytdlp-pot-provider==1.3.1
57
+ youtube-transcript-api==0.6.2
58
+
59
+ # --- AI, LLMs & Transcription Fallback ---
60
+ openai-whisper==20250625
61
+ torch
62
+ torchaudio
63
+ transformers>=4.40.0
64
+ sentencepiece>=0.2.0
65
+ langdetect>=1.0.9
66
+ google-genai
67
+ google-generativeai==0.3.2
68
+ langchain==0.1.0
69
+ langchain-google-genai==0.0.5
70
+
71
+ # --- Backend Infrastructure (FastAPI) ---
72
+ fastapi==0.109.0
73
+ uvicorn[standard]==0.27.0
74
+ pydantic==2.12.5
75
+ pydantic[email]
76
+ email-validator
77
+ pydantic-settings==2.1.0
78
+ python-multipart==0.0.6
79
+ python-dotenv==1.0.0
80
+ httpx==0.26.0
81
+ aiofiles==23.2.1
82
+
83
+ # --- Database, Auth & Security ---
84
+ sqlmodel==0.0.14
85
+ asyncpg==0.31.0
86
+ greenlet==3.3.1
87
+ passlib[bcrypt]==1.7.4
88
+ python-jose[cryptography]==3.3.0
89
+ bcrypt==4.1.2
90
+ pydantic-core==2.41.5
91
+
92
+ # --- Integration & Utilities ---
93
+ google-api-python-client==2.115.0
94
+ firebase-admin==6.5.0
95
+ dnspython
96
+ pydub==0.25.1
97
+ ffmpeg-python
98
+
99
+ # --- End of section merged from b5ab912 (Mt5) (merge-conflict marker ">>>>>>> b5ab912 (Mt5)" removed) ---
src/summarization/README.md CHANGED
@@ -5,7 +5,7 @@ This module handles **text summarization and conversion to structured study note
5
 
6
  ## Functionality
7
  1. Receive transcribed text from videos.
8
- 2. Use **Groq (Llama-3.3-70b-versatile)** to analyze text and generate structured JSON notes.
9
  3. Produce clean Markdown output with:
10
  - Source & Duration header
11
  - Overall Summary
@@ -21,7 +21,7 @@ This module handles **text summarization and conversion to structured study note
21
  - `SegmentSchema` — A timeline section (title, summary, key_insight, why_it_matters).
22
 
23
  ### 2. `note_generator.py`
24
- - **Purpose:** Generate notes using Groq AI with strict JSON enforcement.
25
  - **Main Class:** `NoteGenerator`
26
  - **Key Methods:**
27
  - `generateSummary(transcript, title)` — Generates structured JSON study notes.
@@ -32,7 +32,9 @@ This module handles **text summarization and conversion to structured study note
32
  - **Purpose:** Split long texts into smaller segments for preprocessing.
33
  - **Main Class:** `TranscriptSegmenter`
34
  - **Key Methods:**
 
35
  - `segment_by_time()` — Split by time intervals.
 
36
  - `clean_text()` — Remove filler words.
37
 
38
  ## JSON Output Structure
@@ -90,5 +92,13 @@ print(notes_md)
90
  ```
91
 
92
  ## Libraries Used
93
- - `groq` — Communicate with Groq API (Llama-3.3-70b-versatile).
 
 
 
94
  - `pydantic` — Data validation and schema enforcement.
 
 
 
 
 
 
5
 
6
  ## Functionality
7
  1. Receive transcribed text from videos.
8
+ 2. Use a **local mT5 model** (map-reduce pipeline) to analyze text and generate structured JSON notes.
9
  3. Produce clean Markdown output with:
10
  - Source & Duration header
11
  - Overall Summary
 
21
  - `SegmentSchema` — A timeline section (title, summary, key_insight, why_it_matters).
22
 
23
  ### 2. `note_generator.py`
24
+ - **Purpose:** Generate notes using a local mT5 model with chunk-based map-reduce and schema validation.
25
  - **Main Class:** `NoteGenerator`
26
  - **Key Methods:**
27
  - `generateSummary(transcript, title)` — Generates structured JSON study notes.
 
32
  - **Purpose:** Split long texts into smaller segments for preprocessing.
33
  - **Main Class:** `TranscriptSegmenter`
34
  - **Key Methods:**
35
+ - `segment_text_by_words()` — Split text into fixed-size word chunks (used by the mT5 pipeline).
36
  - `segment_by_time()` — Split by time intervals.
37
+ - `segment_by_topic()` — Split by paragraph/topic boundaries.
38
  - `clean_text()` — Remove filler words.
39
 
40
  ## JSON Output Structure
 
92
  ```
93
 
94
  ## Libraries Used
95
+ - `transformers` — Load and run the local mT5 model (HuggingFace).
96
+ - `sentencepiece` — Tokenizer backend required by mT5.
97
+ - `langdetect` — Automatic language detection for multilingual support.
98
+ - `torch` — PyTorch runtime for model inference.
99
  - `pydantic` — Data validation and schema enforcement.
100
+
101
+ ## Environment Variables
102
+ | Variable | Default | Description |
103
+ |----------|---------|-------------|
104
+ | `MT5_MODEL_NAME` | `google/mt5-small` | HuggingFace model ID to load |
src/summarization/__pycache__/note_generator.cpython-312.pyc DELETED
Binary file (11.2 kB)
 
src/summarization/note_generator.py CHANGED
@@ -1,41 +1,22 @@
1
  import json
2
  import os
3
- import re
4
- import time
5
- from typing import Dict, List, Optional
6
 
7
- from groq import Groq
 
8
  from pydantic import ValidationError
 
9
 
10
  from ..utils.logger import setup_logger
11
  from .schemas import SummarySchema
 
12
 
13
 
14
  logger = setup_logger(__name__)
15
 
16
 
17
  # ─────────────────────────────────────────────────────────────────────────────
18
- # CONFIGURATION
19
- # ─────────────────────────────────────────────────────────────────────────────
20
-
21
- # Token threshold: below this, a single API call is used.
22
- _SINGLE_PASS_TOKEN_LIMIT = 8_000
23
-
24
- # Target chunk size for MAP phase (tokens). Kept small so that
25
- # prompt + chunk + response stays well under the 12K TPM free-tier limit.
26
- _CHUNK_TARGET_TOKENS = 2_500
27
-
28
- # Model — unified for both MAP and REDUCE phases.
29
- # llama-3.3-70b-versatile has 12K TPM on the free tier (the highest).
30
- _MODEL_PRIMARY = "llama-3.3-70b-versatile"
31
-
32
- # Maximum retries when a rate-limit (413 / 429) is hit.
33
- _RATE_LIMIT_MAX_RETRIES = 3
34
- _RATE_LIMIT_SLEEP_SECONDS = 60
35
-
36
-
37
- # ─────────────────────────────────────────────────────────────────────────────
38
- # PROMPT TEMPLATES — SINGLE-PASS (unchanged)
39
  # ─────────────────────────────────────────────────────────────────────────────
40
 
41
  _SUMMARY_SYSTEM = """
@@ -46,7 +27,7 @@ LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
46
  - Detect the primary language of the transcript.
47
  - Every content field (title, summary, segments, conclusion) MUST be written entirely in that SAME detected language.
48
  - Do NOT mix languages. Arabic transcript -> everything in Arabic.
49
- - Only the "detected_language" and "suggested_category" fields are stated in English.
50
 
51
  TIMELINE RULES — STRICTLY ENFORCED:
52
  - Divide the transcript into chronological segments that follow its natural progression.
@@ -64,12 +45,6 @@ TOPICS RULE:
64
  - Topics should be specific and descriptive (e.g. "Python", "Machine Learning", "Neural Networks").
65
  - Do NOT use generic fixed categories.
66
 
67
- CATEGORY RULE:
68
- - Provide a single, concise category label (1-2 words max) in English.
69
- - This should be the most accurate high-level category for the video content.
70
- - Examples: "Programming", "Finance", "History", "Psychology", "Mathematics", "Cooking".
71
- - The suggested_category MUST always be in English regardless of the transcript language.
72
-
73
  CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
74
  DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
75
  {
@@ -85,8 +60,7 @@ DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
85
  }
86
  ],
87
  "conclusion": "Final overall takeaway / closing conclusion",
88
- "topics": ["Topic1", "Topic2", "Topic3"],
89
- "suggested_category": "Programming"
90
  }
91
 
92
  OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
@@ -105,109 +79,6 @@ Return ONLY the exact JSON structure requested.
105
  """.strip()
106
 
107
 
108
- # ─────────────────────────────────────────────────────────────────────────────
109
- # PROMPT TEMPLATES — MAP PHASE
110
- # ─────────────────────────────────────────────────────────────────────────────
111
-
112
- _MAP_SYSTEM = """
113
- You are an expert educational content analyst.
114
- You will receive ONE CHUNK of a longer video transcript.
115
- Extract the key information from this chunk ONLY.
116
-
117
- LANGUAGE RULE — CRITICAL:
118
- - Detect the primary language of the text.
119
- - Write ALL content fields in that SAME detected language.
120
- - Only "detected_language" is stated in English.
121
-
122
- Return a JSON object with this EXACT structure:
123
- {
124
- "detected_language": "English (or Arabic, etc.)",
125
- "chunk_summary": "Concise summary of this chunk (3-5 sentences)",
126
- "key_points": [
127
- {
128
- "title": "Short title for this point",
129
- "detail": "1-2 sentence explanation",
130
- "insight": "Key takeaway"
131
- }
132
- ],
133
- "topics": ["Topic1", "Topic2"]
134
- }
135
-
136
- RULES:
137
- - Extract 2-4 key points from this chunk.
138
- - Topics should be specific (e.g. "Python", "Neural Networks"), not generic.
139
- - OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
140
- """.strip()
141
-
142
- _MAP_USER = """
143
- Video Title: {video_title}
144
- Chunk {chunk_index} of {total_chunks}:
145
-
146
- {chunk_text}
147
-
148
- Extract the key information from this chunk. Return ONLY the JSON.
149
- """.strip()
150
-
151
-
152
- # ─────────────────────────────────────────────────────────────────────────────
153
- # PROMPT TEMPLATES — REDUCE PHASE
154
- # ─────────────────────────────────────────────────────────────────────────────
155
-
156
- _REDUCE_SYSTEM = """
157
- You are an expert educational content analyst and structured note-taking specialist.
158
- You will receive INTERMEDIATE SUMMARIES from multiple chunks of a single video transcript.
159
- Your job is to MERGE them into ONE final, cohesive, structured summary.
160
-
161
- LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
162
- - Use the detected language from the intermediate summaries.
163
- - Every content field MUST be in that SAME language.
164
- - Only "detected_language" and "suggested_category" are stated in English.
165
-
166
- TIMELINE RULES — STRICTLY ENFORCED:
167
- - Merge the chunk summaries into 3-7 chronological segments.
168
- - Each segment MUST cover a distinct phase or theme; do NOT repeat topics.
169
- - Segments must follow the natural progression of the video.
170
- - Each segment must include: title, summary, key_insight, why_it_matters.
171
-
172
- CATEGORY RULE:
173
- - Provide a single, concise category label (1-2 words max) in English.
174
- - This should be the most accurate high-level category for the video content.
175
- - Examples: "Programming", "Finance", "History", "Psychology", "Mathematics", "Cooking".
176
- - The suggested_category MUST always be in English regardless of the transcript language.
177
-
178
- CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
179
- {
180
- "title": "Inferred video title in transcript language",
181
- "detected_language": "English (or Arabic, etc.)",
182
- "summary": "Concise overall summary (3-5 sentences)",
183
- "segments": [
184
- {
185
- "title": "Segment title",
186
- "summary": "What this section covers (2-3 sentences)",
187
- "key_insight": "Most important point from this section",
188
- "why_it_matters": "Why this is valuable (1-2 sentences)"
189
- }
190
- ],
191
- "conclusion": "Final overall takeaway / closing conclusion",
192
- "topics": ["Topic1", "Topic2", "Topic3"],
193
- "suggested_category": "Programming"
194
- }
195
-
196
- OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
197
- """.strip()
198
-
199
- _REDUCE_USER = """
200
- Video Title: {video_title}
201
-
202
- The following are intermediate summaries extracted from {total_chunks} consecutive chunks
203
- of the video transcript. Merge them into ONE cohesive final summary.
204
-
205
- {merged_summaries}
206
-
207
- Merge into 3-7 chronological segments. Return ONLY the final JSON structure.
208
- """.strip()
209
-
210
-
211
  # ─────────────────────────────────────────────────────────────────────────────
212
  # LANGUAGE LABELS (simplified)
213
  # ─────────────────────────────────────────────────────────────────────────────
@@ -316,42 +187,18 @@ def _split_into_chunks(text: str, target_tokens: int = _CHUNK_TARGET_TOKENS) ->
316
  # ─────────────────────────────────────────────────────────────────────────────
317
 
318
  class NoteGenerator:
319
- """
320
- Generates structured study notes using Groq.
321
-
322
- Automatically selects between:
323
- - **Single-pass**: for short transcripts (< 8K tokens)
324
- - **Map-Reduce**: for long transcripts (≥ 8K tokens), splitting into
325
- chunks, summarizing each individually, then merging in a REDUCE pass.
326
-
327
- Uses a single model (llama-3.3-70b-versatile) for all phases and
328
- includes adaptive rate-limit retry (60s backoff on 413/429).
329
- """
330
 
331
  def __init__(self):
332
  self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
333
  self.client = Groq(api_key=self.api_key) if self.api_key else None
334
- self.model = _MODEL_PRIMARY
335
- self.chunk_delay = float(
336
- os.environ.get("GROQ_CHUNK_DELAY_SECONDS", "3")
337
- )
338
- logger.info(
339
- "🚀 NoteGenerator v5.1 initialized — model: %s, delay: %.1fs",
340
- self.model, self.chunk_delay,
341
- )
342
 
343
- # ── Low-level API call ──────────────────────────────────────────────
344
-
345
- def _chat(
346
- self,
347
- system: str,
348
- user: str,
349
- max_tokens: int = 4096,
350
- ) -> Optional[str]:
351
- """Send a chat completion request to Groq."""
352
  try:
353
  response = self.client.chat.completions.create(
354
- model=self.model,
355
  max_tokens=max_tokens,
356
  temperature=0.3,
357
  response_format={"type": "json_object"},
@@ -362,11 +209,9 @@ class NoteGenerator:
362
  )
363
  return response.choices[0].message.content
364
  except Exception as e:
365
- logger.error("❌ Groq API call failed (model=%s): %s", self.model, e)
366
  return None
367
 
368
- # ── Error fallback ──────────────────────────────────────────────────
369
-
370
  def _get_error_json(self, error_msg: str) -> Dict:
371
  return {
372
  "title": "Error in Generation",
@@ -375,208 +220,31 @@ class NoteGenerator:
375
  "segments": [],
376
  "conclusion": "",
377
  "topics": [],
378
- "suggested_category": "",
379
  }
380
 
381
- # ── Single-pass summarization (short transcripts) ───────────────────
382
-
383
- def _single_pass(self, transcript_text: str, video_title: str) -> Dict:
384
- """Process the entire transcript in one API call."""
385
- logger.info("📝 Single-pass summarization via %s", self.model)
386
 
 
387
  user_prompt = _SUMMARY_USER.format(
388
  video_title=video_title,
389
- transcript=transcript_text,
390
  )
391
 
392
  raw = self._chat(_SUMMARY_SYSTEM, user_prompt, max_tokens=4096)
393
  if raw is None:
394
- return self._get_error_json("Groq API call failed (single-pass).")
395
-
396
- return self._parse_and_validate(raw)
397
-
398
- # ── Map-Reduce summarization (long transcripts) ─────────────────────
399
-
400
- def _map_reduce(self, transcript_text: str, video_title: str) -> Dict:
401
- """
402
- Split transcript into chunks, summarize each (MAP), then merge (REDUCE).
403
- """
404
- chunks = _split_into_chunks(transcript_text)
405
- total = len(chunks)
406
- logger.info(
407
- "🗺️ Map-Reduce activated: %d chunks (delay=%.1fs between calls)",
408
- total, self.chunk_delay,
409
- )
410
-
411
- # ── MAP PHASE ───────────────────────────────────────────────────
412
- intermediate_results: List[Dict] = []
413
-
414
- for i, chunk in enumerate(chunks, start=1):
415
- chunk_tokens = _estimate_tokens(chunk)
416
- logger.info(
417
- " 📦 MAP chunk %d/%d (~%d est. tokens)...", i, total, chunk_tokens,
418
- )
419
-
420
- user_prompt = _MAP_USER.format(
421
- video_title=video_title,
422
- chunk_index=i,
423
- total_chunks=total,
424
- chunk_text=chunk,
425
- )
426
 
427
- # Retry loop with adaptive backoff on rate-limit errors
428
- raw = None
429
- for attempt in range(1, _RATE_LIMIT_MAX_RETRIES + 1):
430
- raw = self._chat(
431
- _MAP_SYSTEM, user_prompt,
432
- max_tokens=2048,
433
- )
434
-
435
- if raw is not None:
436
- break # success
437
-
438
- # _chat() returns None on any exception. Check if it was a
439
- # rate-limit error (413 / 429) by inspecting the last
440
- # exception. We re-try with a 60s sleep.
441
- logger.warning(
442
- " ⚠️ MAP chunk %d/%d attempt %d/%d failed. "
443
- "Sleeping %ds for TPM window reset...",
444
- i, total, attempt, _RATE_LIMIT_MAX_RETRIES,
445
- _RATE_LIMIT_SLEEP_SECONDS,
446
- )
447
- time.sleep(_RATE_LIMIT_SLEEP_SECONDS)
448
-
449
- if raw:
450
- try:
451
- parsed = json.loads(raw)
452
- intermediate_results.append(parsed)
453
- logger.info(" ✅ MAP chunk %d/%d done.", i, total)
454
- except json.JSONDecodeError as e:
455
- logger.warning(
456
- " ⚠️ MAP chunk %d/%d returned invalid JSON: %s", i, total, e,
457
- )
458
- else:
459
- logger.error(
460
- " ❌ MAP chunk %d/%d failed after %d retries. Skipping.",
461
- i, total, _RATE_LIMIT_MAX_RETRIES,
462
- )
463
-
464
- # Respect TPM limits — delay between consecutive API calls
465
- if i < total and self.chunk_delay > 0:
466
- logger.info(" ⏳ Sleeping %.1fs (TPM cooldown)...", self.chunk_delay)
467
- time.sleep(self.chunk_delay)
468
-
469
- if not intermediate_results:
470
- return self._get_error_json(
471
- "Map-Reduce failed: no chunks were successfully summarized."
472
- )
473
-
474
- # ── REDUCE PHASE ────────────────────────────────────────────────
475
- logger.info("🔗 REDUCE phase: merging %d intermediate summaries...", len(intermediate_results))
476
-
477
- # Build a readable merged text for the reduce prompt
478
- merged_parts: List[str] = []
479
- all_topics: List[str] = []
480
- detected_lang = "English"
481
-
482
- for idx, result in enumerate(intermediate_results, start=1):
483
- detected_lang = result.get("detected_language", detected_lang)
484
- chunk_summary = result.get("chunk_summary", "")
485
- key_points = result.get("key_points", [])
486
- topics = result.get("topics", [])
487
- all_topics.extend(topics)
488
-
489
- part = f"--- Chunk {idx} ---\n"
490
- part += f"Summary: {chunk_summary}\n"
491
- for kp in key_points:
492
- if isinstance(kp, dict):
493
- part += f"- {kp.get('title', '')}: {kp.get('detail', '')} "
494
- part += f"(Insight: {kp.get('insight', '')})\n"
495
- part += f"Topics: {', '.join(topics)}\n"
496
- merged_parts.append(part)
497
-
498
- merged_text = "\n".join(merged_parts)
499
-
500
- # Check if the merged text itself is within single-pass limits
501
- reduce_tokens = _estimate_tokens(merged_text)
502
- logger.info("🔗 REDUCE input: ~%d tokens", reduce_tokens)
503
-
504
- user_prompt = _REDUCE_USER.format(
505
- video_title=video_title,
506
- total_chunks=len(intermediate_results),
507
- merged_summaries=merged_text,
508
- )
509
-
510
- # Sleep before REDUCE to ensure TPM cooldown from last MAP call
511
- if self.chunk_delay > 0:
512
- logger.info(" ⏳ Sleeping %.1fs before REDUCE call...", self.chunk_delay)
513
- time.sleep(self.chunk_delay)
514
-
515
- # REDUCE with retry on rate-limit
516
- raw = None
517
- for attempt in range(1, _RATE_LIMIT_MAX_RETRIES + 1):
518
- raw = self._chat(_REDUCE_SYSTEM, user_prompt, max_tokens=4096)
519
- if raw is not None:
520
- break
521
- logger.warning(
522
- " ⚠️ REDUCE attempt %d/%d failed. Sleeping %ds...",
523
- attempt, _RATE_LIMIT_MAX_RETRIES, _RATE_LIMIT_SLEEP_SECONDS,
524
- )
525
- time.sleep(_RATE_LIMIT_SLEEP_SECONDS)
526
-
527
- if raw is None:
528
- return self._get_error_json("Groq API call failed (REDUCE phase after retries).")
529
-
530
- return self._parse_and_validate(raw)
531
-
532
- # ── JSON parsing + schema validation ────────────────────────────────
533
-
534
- def _parse_and_validate(self, raw_json: str) -> Dict:
535
- """Parse raw JSON string and validate against SummarySchema."""
536
  try:
537
- data = json.loads(raw_json)
538
  validated = SummarySchema(**data)
539
  return validated.model_dump()
540
  except (json.JSONDecodeError, ValidationError) as e:
541
- logger.error("❌ Schema validation failed: %s", e)
542
  return self._get_error_json(f"Validation Error: {str(e)}")
543
 
544
- # ── Public API (unchanged signature) ────────────────────────────────
545
-
546
- def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
547
- """
548
- Generate structured JSON summary from transcript.
549
-
550
- Automatically selects single-pass or Map-Reduce based on estimated
551
- token count. The return type is always a Dict matching SummarySchema.
552
- """
553
- if not self.client:
554
- return self._get_error_json("Groq API Key missing.")
555
-
556
- # Estimate total tokens for the full prompt
557
- full_prompt = _SUMMARY_USER.format(
558
- video_title=video_title,
559
- transcript=transcript_text,
560
- )
561
- total_tokens = _estimate_tokens(_SUMMARY_SYSTEM + full_prompt)
562
-
563
- logger.info(
564
- "📊 Token estimate: ~%d tokens (threshold: %d)",
565
- total_tokens, _SINGLE_PASS_TOKEN_LIMIT,
566
- )
567
-
568
- if total_tokens < _SINGLE_PASS_TOKEN_LIMIT:
569
- return self._single_pass(transcript_text, video_title)
570
- else:
571
- logger.info(
572
- "⚡ Transcript too large for single-pass (%d ≥ %d). "
573
- "Activating Map-Reduce pipeline...",
574
- total_tokens, _SINGLE_PASS_TOKEN_LIMIT,
575
- )
576
- return self._map_reduce(transcript_text, video_title)
577
-
578
- # ── Markdown formatting (unchanged) ─────────────────────────────────
579
-
580
  def format_notes_to_markdown(self, json_notes: Dict) -> str:
581
  """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
582
  lang = json_notes.get("detected_language", "English")
 
1
  import json
2
  import os
3
+ from typing import Dict, Optional
 
 
4
 
5
+ import torch
6
+ from langdetect import detect, LangDetectException
7
  from pydantic import ValidationError
8
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
9
 
10
  from ..utils.logger import setup_logger
11
  from .schemas import SummarySchema
12
+ from .segmenter import TranscriptSegmenter
13
 
14
 
15
  logger = setup_logger(__name__)
16
 
17
 
18
  # ─────────────────────────────────────────────────────────────────────────────
19
+ # PROMPT TEMPLATES
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  # ─────────────────────────────────────────────────────────────────────────────
21
 
22
  _SUMMARY_SYSTEM = """
 
27
  - Detect the primary language of the transcript.
28
  - Every content field (title, summary, segments, conclusion) MUST be written entirely in that SAME detected language.
29
  - Do NOT mix languages. Arabic transcript -> everything in Arabic.
30
+ - Only the "detected_language" field itself is stated in English (e.g. "Arabic").
31
 
32
  TIMELINE RULES — STRICTLY ENFORCED:
33
  - Divide the transcript into chronological segments that follow its natural progression.
 
45
  - Topics should be specific and descriptive (e.g. "Python", "Machine Learning", "Neural Networks").
46
  - Do NOT use generic fixed categories.
47
 
 
 
 
 
 
 
48
  CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
49
  DO NOT CHANGE, OMIT, OR RENAME ANY KEYS.
50
  {
 
60
  }
61
  ],
62
  "conclusion": "Final overall takeaway / closing conclusion",
63
+ "topics": ["Topic1", "Topic2", "Topic3"]
 
64
  }
65
 
66
  OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
 
79
  """.strip()
80
 
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  # ─────────────────────────────────────────────────────────────────────────────
83
  # LANGUAGE LABELS (simplified)
84
  # ─────────────────────────────────────────────────────────────────────────────
 
187
  # ─────────────────────────────────────────────────────────────────────────────
188
 
189
  class NoteGenerator:
190
+ """Generates structured study notes using Groq (Llama-3.3-70b-versatile)."""
 
 
 
 
 
 
 
 
 
 
191
 
192
  def __init__(self):
193
  self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
194
  self.client = Groq(api_key=self.api_key) if self.api_key else None
195
+ self.model_id = "llama-3.3-70b-versatile"
196
+ logger.info(f"🚀 NoteGenerator v4.0 initialized — model: {self.model_id}")
 
 
 
 
 
 
197
 
198
+ def _chat(self, system: str, user: str, max_tokens: int = 4096) -> Optional[str]:
 
 
 
 
 
 
 
 
199
  try:
200
  response = self.client.chat.completions.create(
201
+ model=self.model_id,
202
  max_tokens=max_tokens,
203
  temperature=0.3,
204
  response_format={"type": "json_object"},
 
209
  )
210
  return response.choices[0].message.content
211
  except Exception as e:
212
+ logger.error(f"❌ Groq API call failed: {e}")
213
  return None
214
 
 
 
215
  def _get_error_json(self, error_msg: str) -> Dict:
216
  return {
217
  "title": "Error in Generation",
 
220
  "segments": [],
221
  "conclusion": "",
222
  "topics": [],
 
223
  }
224
 
225
+ def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
226
+ """Generate structured JSON summary from transcript."""
227
+ if not self.client:
228
+ return self._get_error_json("Groq API Key missing.")
 
229
 
230
+ logger.info(f"📝 Summary generation started via {self.model_id}")
231
  user_prompt = _SUMMARY_USER.format(
232
  video_title=video_title,
233
+ transcript=transcript_text[:30000],
234
  )
235
 
236
  raw = self._chat(_SUMMARY_SYSTEM, user_prompt, max_tokens=4096)
237
  if raw is None:
238
+ return self._get_error_json("Groq API call failed.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  try:
241
+ data = json.loads(raw)
242
  validated = SummarySchema(**data)
243
  return validated.model_dump()
244
  except (json.JSONDecodeError, ValidationError) as e:
245
+ logger.error(f"❌ Schema validation failed: {e}")
246
  return self._get_error_json(f"Validation Error: {str(e)}")
247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  def format_notes_to_markdown(self, json_notes: Dict) -> str:
249
  """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
250
  lang = json_notes.get("detected_language", "English")
src/summarization/segmenter.py CHANGED
@@ -149,6 +149,39 @@ class TranscriptSegmenter:
149
 
150
  return segments
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  def segment_transcript(
153
  self,
154
  transcript_data: Dict,
 
149
 
150
  return segments
151
 
152
def segment_text_by_words(
    self,
    text: str,
    chunk_size: int = 350
) -> List[str]:
    """
    Split plain text into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (including newlines) is collapsed to a single space in the output.
    The same input always produces the same chunks.

    NOTE(review): a word count is only a proxy for model tokens. Whether
    350 words fits a 512-token input window depends on the tokenizer and
    the language of the text -- confirm against the model actually used.

    Args:
        text: Full transcript text.
        chunk_size: Maximum words per chunk (default: 350). Must be > 0.

    Returns:
        List of text chunks, each up to ``chunk_size`` words; an empty
        list when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    # A negative step makes range() empty (the whole transcript would be
    # silently dropped) and a zero step raises an opaque range() error,
    # so reject both up front with a clear message.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size}"
        )

    words = text.split()
    if not words:
        return []

    chunks = [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

    logger.info(
        f"Split {len(words)} words into {len(chunks)} chunks "
        f"(max {chunk_size} words each)"
    )
    return chunks
184
+
185
  def segment_transcript(
186
  self,
187
  transcript_data: Dict,