Spaces:

ATInc1
/

AIdea-Server

Running

App Files Files Community

Ali Hashhash committed 18 days ago

Commit

1d88d91

1 Parent(s): e0ffc4f

feat: add note_generator module to handle automated summarization tasks

Browse files

Files changed (1) hide show

src/summarization/note_generator.py +384 -17

src/summarization/note_generator.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import json
 import os
-from typing import Dict, Optional
 from groq import Groq
 from pydantic import ValidationError
@@ -13,7 +15,23 @@ logger = setup_logger(__name__)
 # ─────────────────────────────────────────────────────────────────────────────
-# PROMPT TEMPLATES
 # ─────────────────────────────────────────────────────────────────────────────
 _SUMMARY_SYSTEM = """
@@ -76,6 +94,102 @@ Return ONLY the exact JSON structure requested.
 """.strip()
 # ─────────────────────────────────────────────────────────────────────────────
 # LANGUAGE LABELS (simplified)
 # ─────────────────────────────────────────────────────────────────────────────
@@ -105,23 +219,122 @@ def _labels(language: str) -> dict:
     return _LABELS.get(language, _LABELS["English"])
 # ─────────────────────────────────────────────────────────────────────────────
 # NOTE GENERATOR
 # ─────────────────────────────────────────────────────────────────────────────
 class NoteGenerator:
-    """Generates structured study notes using Groq (Llama-3.3-70b-versatile)."""
     def __init__(self):
         self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
         self.client = Groq(api_key=self.api_key) if self.api_key else None
-        self.model_id = "llama-3.3-70b-versatile"
-        logger.info(f"🚀 NoteGenerator v4.0 initialized — model: {self.model_id}")
-    def _chat(self, system: str, user: str, max_tokens: int = 4096) -> Optional[str]:
         try:
             response = self.client.chat.completions.create(
-                model=self.model_id,
                 max_tokens=max_tokens,
                 temperature=0.3,
                 response_format={"type": "json_object"},
@@ -132,9 +345,11 @@ class NoteGenerator:
             )
             return response.choices[0].message.content
         except Exception as e:
-            logger.error(f"❌ Groq API call failed: {e}")
             return None
     def _get_error_json(self, error_msg: str) -> Dict:
         return {
             "title": "Error in Generation",
@@ -145,29 +360,181 @@ class NoteGenerator:
             "topics": [],
         }
-    def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
-        """Generate structured JSON summary from transcript."""
-        if not self.client:
-            return self._get_error_json("Groq API Key missing.")
-        logger.info(f"📝 Summary generation started via {self.model_id}")
         user_prompt = _SUMMARY_USER.format(
             video_title=video_title,
-            transcript=transcript_text[:30000],
         )
         raw = self._chat(_SUMMARY_SYSTEM, user_prompt, max_tokens=4096)
         if raw is None:
-            return self._get_error_json("Groq API call failed.")
         try:
-            data = json.loads(raw)
             validated = SummarySchema(**data)
             return validated.model_dump()
         except (json.JSONDecodeError, ValidationError) as e:
-            logger.error(f"❌ Schema validation failed: {e}")
             return self._get_error_json(f"Validation Error: {str(e)}")
     def format_notes_to_markdown(self, json_notes: Dict) -> str:
         """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
         lang = json_notes.get("detected_language", "English")

 import json
 import os
+import re
+import time
+from typing import Dict, List, Optional
 from groq import Groq
 from pydantic import ValidationError
 # ─────────────────────────────────────────────────────────────────────────────
+# CONFIGURATION
+# ─────────────────────────────────────────────────────────────────────────────
+# Token threshold: when the estimated size of the system prompt plus the
+# formatted user prompt is below this, a single API call is used; at or above
+# it, the transcript is routed through the Map-Reduce pipeline.
+_SINGLE_PASS_TOKEN_LIMIT = 8_000
+# Target chunk size for MAP phase (estimated tokens, using the same
+# word-count heuristic). Leaves room for prompt + response
+# within the 12K TPM free-tier limit.
+_CHUNK_TARGET_TOKENS = 6_000
+# Models
+_MODEL_PRIMARY = "llama-3.3-70b-versatile"  # REDUCE phase + single-pass
+_MODEL_MAP = "llama-3.1-8b-instant"          # MAP phase (fast, cheap)
+# ─────────────────────────────────────────────────────────────────────────────
+# PROMPT TEMPLATES — SINGLE-PASS (unchanged)
 # ─────────────────────────────────────────────────────────────────────────────
 _SUMMARY_SYSTEM = """
 """.strip()
+# ─────────────────────────────────────────────────────────────────────────────
+# PROMPT TEMPLATES — MAP PHASE
+# ─────────────────────────────────────────────────────────────────────────────
+_MAP_SYSTEM = """
+You are an expert educational content analyst.
+You will receive ONE CHUNK of a longer video transcript.
+Extract the key information from this chunk ONLY.
+LANGUAGE RULE — CRITICAL:
+- Detect the primary language of the text.
+- Write ALL content fields in that SAME detected language.
+- Only "detected_language" is stated in English.
+Return a JSON object with this EXACT structure:
+{
+    "detected_language": "English (or Arabic, etc.)",
+    "chunk_summary": "Concise summary of this chunk (3-5 sentences)",
+    "key_points": [
+        {
+            "title": "Short title for this point",
+            "detail": "1-2 sentence explanation",
+            "insight": "Key takeaway"
+        }
+    ],
+    "topics": ["Topic1", "Topic2"]
+}
+RULES:
+- Extract 2-4 key points from this chunk.
+- Topics should be specific (e.g. "Python", "Neural Networks"), not generic.
+- OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
+""".strip()
+_MAP_USER = """
+Video Title: {video_title}
+Chunk {chunk_index} of {total_chunks}:
+{chunk_text}
+Extract the key information from this chunk. Return ONLY the JSON.
+""".strip()
+# ─────────────────────────────────────────────────────────────────────────────
+# PROMPT TEMPLATES — REDUCE PHASE
+# ─────────────────────────────────────────────────────────────────────────────
+_REDUCE_SYSTEM = """
+You are an expert educational content analyst and structured note-taking specialist.
+You will receive INTERMEDIATE SUMMARIES from multiple chunks of a single video transcript.
+Your job is to MERGE them into ONE final, cohesive, structured summary.
+LANGUAGE RULE — CRITICAL, NEVER VIOLATE:
+- Use the detected language from the intermediate summaries.
+- Every content field MUST be in that SAME language.
+- Only "detected_language" is stated in English.
+TIMELINE RULES — STRICTLY ENFORCED:
+- Merge the chunk summaries into 3-7 chronological segments.
+- Each segment MUST cover a distinct phase or theme; do NOT repeat topics.
+- Segments must follow the natural progression of the video.
+- Each segment must include: title, summary, key_insight, why_it_matters.
+CRITICAL: RETURN A JSON OBJECT EXACTLY MATCHING THIS STRUCTURE.
+{
+    "title": "Inferred video title in transcript language",
+    "detected_language": "English (or Arabic, etc.)",
+    "summary": "Concise overall summary (3-5 sentences)",
+    "segments": [
+        {
+            "title": "Segment title",
+            "summary": "What this section covers (2-3 sentences)",
+            "key_insight": "Most important point from this section",
+            "why_it_matters": "Why this is valuable (1-2 sentences)"
+        }
+    ],
+    "conclusion": "Final overall takeaway / closing conclusion",
+    "topics": ["Topic1", "Topic2", "Topic3"]
+}
+OUTPUT: Return ONLY a valid JSON object. No markdown fences, no extra text.
+""".strip()
+_REDUCE_USER = """
+Video Title: {video_title}
+The following are intermediate summaries extracted from {total_chunks} consecutive chunks
+of the video transcript. Merge them into ONE cohesive final summary.
+{merged_summaries}
+Merge into 3-7 chronological segments. Return ONLY the final JSON structure.
+""".strip()
 # ─────────────────────────────────────────────────────────────────────────────
 # LANGUAGE LABELS (simplified)
 # ─────────────────────────────────────────────────────────────────────────────
     return _LABELS.get(language, _LABELS["English"])
+# ─────────────────────────────────────────────────────────────────────────────
+# TOKEN UTILITIES
+# ─────────────────────────────────────────────────────────────────────────────
+# Heuristic ratio of LLM tokens to whitespace-delimited words.
+_TOKENS_PER_WORD = 1.3
+# Sentence boundary: whitespace that follows ".", "!" or "?", or a run of newlines.
+_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+|\n+')
+def _tokens_for_words(word_count: int) -> int:
+    """Return the estimated token count for `word_count` words (truncated to int)."""
+    return int(word_count * _TOKENS_PER_WORD)
+def _estimate_tokens(text: str) -> int:
+    """
+    Estimate the token count of `text` with a word-count heuristic.
+    The text is split on whitespace and the word count is multiplied by 1.3,
+    truncating to an integer. This is only a rough approximation: text with
+    no whitespace counts as a single word, and scripts that tokenize into
+    more pieces per word may be under-estimated.
+    Args:
+        text: The text to measure.
+    Returns:
+        The estimated number of tokens (0 for empty or whitespace-only text).
+    """
+    return _tokens_for_words(len(text.split()))
+def _split_into_chunks(text: str, target_tokens: int = _CHUNK_TARGET_TOKENS) -> List[str]:
+    """
+    Split text into chunks whose estimated size does not exceed `target_tokens`.
+    Sentences (split after ".", "!" or "?" followed by whitespace, or at
+    newlines) are packed greedily into chunks. A single sentence that is
+    larger than the target on its own is split on word boundaries instead.
+    The budget is checked against the total word count of the candidate chunk
+    rather than by summing per-sentence or per-word estimates. Each of those
+    estimates is truncated to an integer (a lone word estimates to 1 token),
+    so summing them under-counts and lets chunks overshoot the target by up
+    to roughly 30% as measured by `_estimate_tokens`.
+    Args:
+        text: The full text to split.
+        target_tokens: Maximum estimated tokens per chunk.
+    Returns:
+        The chunks in original order; an empty list when `text` contains no
+        non-whitespace content. Sentences within a chunk are joined by a
+        single space.
+    """
+    chunks: List[str] = []
+    pending: List[str] = []  # sentences of the chunk currently being built
+    pending_words = 0        # total word count of `pending`
+    for piece in _SENTENCE_BOUNDARY.split(text):
+        sentence = piece.strip()
+        if not sentence:
+            continue
+        words = sentence.split()
+        if _tokens_for_words(len(words)) > target_tokens:
+            # Oversized sentence: flush what we have, then split it by words.
+            if pending:
+                chunks.append(" ".join(pending))
+                pending, pending_words = [], 0
+            buffer: List[str] = []
+            for word in words:
+                # `buffer` must be non-empty before flushing so that every
+                # chunk carries at least one word, even for tiny targets.
+                if buffer and _tokens_for_words(len(buffer) + 1) > target_tokens:
+                    chunks.append(" ".join(buffer))
+                    buffer = []
+                buffer.append(word)
+            chunks.append(" ".join(buffer))
+            continue
+        if pending and _tokens_for_words(pending_words + len(words)) > target_tokens:
+            chunks.append(" ".join(pending))
+            pending, pending_words = [], 0
+        pending.append(sentence)
+        pending_words += len(words)
+    # Don't forget the last chunk
+    if pending:
+        chunks.append(" ".join(pending))
+    return chunks
 # ─────────────────────────────────────────────────────────────────────────────
 # NOTE GENERATOR
 # ─────────────────────────────────────────────────────────────────────────────
 class NoteGenerator:
+    """
+    Generates structured study notes using Groq.
+    Automatically selects between:
+    - **Single-pass**: for short transcripts (< 8K tokens)
+    - **Map-Reduce**: for long transcripts (≥ 8K tokens), splitting into
+      chunks, summarizing each with a fast model, then merging with the
+      primary model.
+    """
     def __init__(self):
+        """
+        Configure the Groq client, the model names and the inter-call delay.
+        Reads two environment variables:
+        - GROQ_API_KEY: when unset or blank, `self.client` is None.
+        - GROQ_CHUNK_DELAY_SECONDS: seconds to sleep between consecutive API
+          calls in the Map-Reduce pipeline (default 3). A blank, malformed or
+          non-finite value falls back to the default with a warning instead
+          of raising during construction or later inside `time.sleep`.
+        """
         self.api_key = os.environ.get("GROQ_API_KEY", "").strip()
         self.client = Groq(api_key=self.api_key) if self.api_key else None
+        self.model_primary = _MODEL_PRIMARY
+        self.model_map = _MODEL_MAP
+        default_delay = 3.0
+        raw_delay = os.environ.get("GROQ_CHUNK_DELAY_SECONDS")
+        delay = default_delay
+        if raw_delay is not None:
+            try:
+                delay = float(raw_delay)
+            except ValueError:
+                delay = float("nan")
+            # The chained comparison is False for NaN and for +/- infinity.
+            if not (float("-inf") < delay < float("inf")):
+                logger.warning(
+                    "⚠️ Invalid GROQ_CHUNK_DELAY_SECONDS=%r — using default %.1fs",
+                    raw_delay, default_delay,
+                )
+                delay = default_delay
+        self.chunk_delay = delay
+        logger.info(
+            "🚀 NoteGenerator v5.0 initialized — primary: %s, map: %s, delay: %.1fs",
+            self.model_primary, self.model_map, self.chunk_delay,
+        )
+    # ── Low-level API call ──────────────────────────────────────────────
+    def _chat(
+        self,
+        system: str,
+        user: str,
+        model: Optional[str] = None,
+        max_tokens: int = 4096,
+    ) -> Optional[str]:
+        """
+        Send a chat completion request to Groq.
+        Args:
+            system: System prompt (callers pass one of the *_SYSTEM templates).
+            user: User prompt (callers pass a formatted *_USER template).
+            model: Model identifier; `self.model_primary` is used when this
+                is None or empty.
+            max_tokens: Value forwarded as the request's `max_tokens`.
+        Returns:
+            The message content of the first choice, or None when the call
+            raises any exception (the error is logged, not re-raised).
+        """
+        model = model or self.model_primary
         try:
             response = self.client.chat.completions.create(
+                model=model,
                 max_tokens=max_tokens,
                 temperature=0.3,
                 response_format={"type": "json_object"},
             )
             return response.choices[0].message.content
         # Broad catch on purpose: any failure inside the try (including
         # `self.client` being None) is reported to the caller as None.
         except Exception as e:
+            logger.error("❌ Groq API call failed (model=%s): %s", model, e)
             return None
+    # ── Error fallback ──────────────────────────────────────────────────
     def _get_error_json(self, error_msg: str) -> Dict:
+        """Build the fallback result dict that callers return when generation fails."""
         return {
             "title": "Error in Generation",
             "topics": [],
         }
+    # ── Single-pass summarization (short transcripts) ───────────────────
+    def _single_pass(self, transcript_text: str, video_title: str) -> Dict:
+        """
+        Summarize the whole transcript with one request to the primary model.
+        Returns:
+            The parsed and validated summary dict, or the error fallback dict
+            when the API call produced no response.
+        """
+        logger.info("📝 Single-pass summarization via %s", self.model_primary)
+        prompt = _SUMMARY_USER.format(
+            transcript=transcript_text,
+            video_title=video_title,
+        )
+        reply = self._chat(_SUMMARY_SYSTEM, prompt, max_tokens=4096)
+        if reply is not None:
+            return self._parse_and_validate(reply)
+        return self._get_error_json("Groq API call failed (single-pass).")
+    # ── Map-Reduce summarization (long transcripts) ─────────────────────
+    def _map_reduce(self, transcript_text: str, video_title: str) -> Dict:
+        """
+        Split transcript into chunks, summarize each (MAP), then merge (REDUCE).
+        MAP: every chunk is sent to `self.model_map`; responses that are
+        missing, are not valid JSON, or are not a JSON object are skipped
+        with a warning. REDUCE: the surviving results are rendered as text and
+        merged by `self.model_primary` into the final summary.
+        `self.chunk_delay` seconds are slept between consecutive API calls.
+        Args:
+            transcript_text: Full transcript to summarize.
+            video_title: Title inserted into the MAP and REDUCE prompts.
+        Returns:
+            The validated summary dict, or the error fallback dict when no
+            chunk could be summarized or the REDUCE call returned nothing.
+        """
+        chunks = _split_into_chunks(transcript_text)
+        total = len(chunks)
+        logger.info(
+            "🗺️  Map-Reduce activated: %d chunks (delay=%.1fs between calls)",
+            total, self.chunk_delay,
+        )
+        # ── MAP PHASE ───────────────────────────────────────────────────
+        intermediate_results: List[Dict] = []
+        for i, chunk in enumerate(chunks, start=1):
+            logger.info(
+                "  📦 MAP chunk %d/%d (~%d tokens)...", i, total, _estimate_tokens(chunk),
+            )
+            user_prompt = _MAP_USER.format(
+                video_title=video_title,
+                chunk_index=i,
+                total_chunks=total,
+                chunk_text=chunk,
+            )
+            raw = self._chat(
+                _MAP_SYSTEM, user_prompt,
+                model=self.model_map,
+                max_tokens=2048,
+            )
+            if not raw:
+                logger.warning("  ⚠️ MAP chunk %d/%d returned no response.", i, total)
+            else:
+                try:
+                    parsed = json.loads(raw)
+                except json.JSONDecodeError as e:
+                    logger.warning(
+                        "  ⚠️ MAP chunk %d/%d returned invalid JSON: %s", i, total, e,
+                    )
+                else:
+                    # Model output is unvalidated: only a JSON object can be
+                    # read with `.get()` in the REDUCE phase.
+                    if isinstance(parsed, dict):
+                        intermediate_results.append(parsed)
+                        logger.info("  ✅ MAP chunk %d/%d done.", i, total)
+                    else:
+                        logger.warning(
+                            "  ⚠️ MAP chunk %d/%d returned non-object JSON (%s).",
+                            i, total, type(parsed).__name__,
+                        )
+            # Respect TPM limits — delay between consecutive API calls
+            if i < total and self.chunk_delay > 0:
+                logger.info("  ⏳ Sleeping %.1fs (TPM cooldown)...", self.chunk_delay)
+                time.sleep(self.chunk_delay)
+        if not intermediate_results:
+            return self._get_error_json(
+                "Map-Reduce failed: no chunks were successfully summarized."
+            )
+        # ── REDUCE PHASE ────────────────────────────────────────────────
+        logger.info("🔗 REDUCE phase: merging %d intermediate summaries...", len(intermediate_results))
+        # Build a readable merged text for the reduce prompt
+        merged_text = "\n".join(
+            self._render_chunk_summary(idx, result)
+            for idx, result in enumerate(intermediate_results, start=1)
+        )
+        # Logged for diagnostics only; the size is not enforced here.
+        logger.info("🔗 REDUCE input: ~%d tokens", _estimate_tokens(merged_text))
+        user_prompt = _REDUCE_USER.format(
+            video_title=video_title,
+            total_chunks=len(intermediate_results),
+            merged_summaries=merged_text,
+        )
+        # Sleep before REDUCE to ensure TPM cooldown from last MAP call
+        if self.chunk_delay > 0:
+            logger.info("  ⏳ Sleeping %.1fs before REDUCE call...", self.chunk_delay)
+            time.sleep(self.chunk_delay)
+        # REDUCE uses the primary (high-quality) model
+        raw = self._chat(
+            _REDUCE_SYSTEM, user_prompt,
+            model=self.model_primary,
+            max_tokens=4096,
+        )
+        if raw is None:
+            return self._get_error_json("Groq API call failed (REDUCE phase).")
+        return self._parse_and_validate(raw)
+    @staticmethod
+    def _render_chunk_summary(index: int, result: Dict) -> str:
+        """
+        Render one MAP result as the plain-text section fed to the REDUCE prompt.
+        Fields come straight from model output, so unexpected types are
+        tolerated: non-list `key_points` and non-dict entries are ignored, and
+        `topics` entries are stringified. The detected language is included
+        because the REDUCE system prompt tells the model to use the language
+        reported by the intermediate summaries.
+        """
+        lines = [f"--- Chunk {index} ---"]
+        language = result.get("detected_language")
+        if isinstance(language, str) and language.strip():
+            lines.append(f"Detected language: {language.strip()}")
+        lines.append(f"Summary: {result.get('chunk_summary', '')}")
+        key_points = result.get("key_points")
+        for kp in key_points if isinstance(key_points, list) else []:
+            if isinstance(kp, dict):
+                lines.append(
+                    f"- {kp.get('title', '')}: {kp.get('detail', '')} "
+                    f"(Insight: {kp.get('insight', '')})"
+                )
+        topics = result.get("topics")
+        if isinstance(topics, str):
+            topics = [topics]
+        elif not isinstance(topics, list):
+            topics = []
+        lines.append("Topics: " + ", ".join(str(topic) for topic in topics))
+        # Trailing newline keeps a blank line between chunk sections once joined.
+        return "\n".join(lines) + "\n"
+    # ── JSON parsing + schema validation ────────────────────────────────
+    def _parse_and_validate(self, raw_json: str) -> Dict:
+        """
+        Parse a raw JSON string and validate it against SummarySchema.
+        Args:
+            raw_json: JSON text returned by the model.
+        Returns:
+            `validated.model_dump()` on success; otherwise the error fallback
+            dict carrying a "Validation Error: ..." message. This covers
+            malformed JSON, JSON that is not an object, and schema violations.
+        """
         try:
+            data = json.loads(raw_json)
+            if not isinstance(data, dict):
+                # `SummarySchema(**data)` needs a mapping; a list or scalar
+                # would otherwise escape as an unhandled TypeError.
+                raise TypeError(
+                    f"expected a JSON object, got {type(data).__name__}"
+                )
             validated = SummarySchema(**data)
             return validated.model_dump()
+        except (json.JSONDecodeError, ValidationError, TypeError) as e:
+            logger.error("❌ Schema validation failed: %s", e)
             return self._get_error_json(f"Validation Error: {str(e)}")
+    # ── Public API (unchanged signature) ────────────────────────────────
+    def generateSummary(self, transcript_text: str, video_title: str) -> Dict:
+        """
+        Produce the structured JSON summary for a transcript.
+        The size of the complete single-pass prompt (system + user) is
+        estimated first. Below `_SINGLE_PASS_TOKEN_LIMIT` the transcript is
+        summarized in one call; otherwise the Map-Reduce pipeline is used.
+        When no client is configured, the error fallback dict is returned.
+        """
+        if not self.client:
+            return self._get_error_json("Groq API Key missing.")
+        # Measure the prompt exactly as the single-pass path would send it
+        prompt_preview = _SUMMARY_SYSTEM + _SUMMARY_USER.format(
+            transcript=transcript_text,
+            video_title=video_title,
+        )
+        total_tokens = _estimate_tokens(prompt_preview)
+        logger.info(
+            "📊 Token estimate: ~%d tokens (threshold: %d)",
+            total_tokens, _SINGLE_PASS_TOKEN_LIMIT,
+        )
+        if total_tokens >= _SINGLE_PASS_TOKEN_LIMIT:
+            logger.info(
+                "⚡ Transcript too large for single-pass (%d ≥ %d). "
+                "Activating Map-Reduce pipeline...",
+                total_tokens, _SINGLE_PASS_TOKEN_LIMIT,
+            )
+            return self._map_reduce(transcript_text, video_title)
+        return self._single_pass(transcript_text, video_title)
+    # ── Markdown formatting (unchanged) ─────────────────────────────────
     def format_notes_to_markdown(self, json_notes: Dict) -> str:
         """Convert JSON notes to clean Markdown — Summary → Timeline → Conclusion."""
         lang = json_notes.get("detected_language", "English")