""" NotebookLM-style response generator with professional formatting. """ from typing import List, Dict import config import re class SimpleGenerator: """Lightweight generator with NotebookLM-quality formatting.""" def __init__(self): self.ready = True def _clean_and_format_text(self, text: str) -> str: """Clean and format text with proper spacing like NotebookLM.""" # Fix spacing after punctuation text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text) # Remove multiple spaces text = re.sub(r'\s+', ' ', text) # Add proper line breaks after sentences text = re.sub(r'([.!?])\s+', r'\1\n\n', text) return text.strip() def _extract_key_terms(self, text: str) -> List[str]: """Extract key terms that should be bolded.""" # Look for capitalized terms, technical terms terms = [] # Find terms in quotes quoted = re.findall(r'"([^"]+)"', text) terms.extend(quoted) # Find repeated important words (appear 2+ times) words = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text) word_count = {} for word in words: word_count[word] = word_count.get(word, 0) + 1 # Add words that appear multiple times terms.extend([w for w, count in word_count.items() if count >= 2]) return list(set(terms)) def _apply_bold_formatting(self, text: str) -> str: """Apply bold formatting to key terms like NotebookLM.""" key_terms = self._extract_key_terms(text) # Bold key terms for term in key_terms: if len(term) > 3: # Skip very short terms text = re.sub(rf'\b({re.escape(term)})\b', r'**\1**', text, count=1) # Bold specific patterns # Numbers with context text = re.sub(r'\b(\d+)\s+(observations?|years?|months?|quarters?)', r'**\1 \2**', text) return text def _create_structured_response(self, context: str, query: str) -> str: """Create a NotebookLM-style structured response.""" # Split into paragraphs paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50] # Remove duplicates unique_paras = [] seen = set() for para in paragraphs: para_key = para.lower()[:150] if para_key not in seen: unique_paras.append(para) seen.add(para_key) if len(unique_paras) >= 5: break if not unique_paras: return context[:1000] # Build NotebookLM-style response response = "" # Main explanation (first paragraph - cleaned and formatted) main_para = self._clean_and_format_text(unique_paras[0]) main_para = self._apply_bold_formatting(main_para) response += main_para + "\n\n" # Add structured details if more content available if len(unique_paras) > 1: response += "### Key Points:\n\n" for i, para in enumerate(unique_paras[1:4], 1): # Extract first 2-3 sentences sentences = [s.strip() for s in para.split('.') if len(s.strip()) > 20] if sentences: detail = self._clean_and_format_text('. '.join(sentences[:2]) + '.') detail = self._apply_bold_formatting(detail) response += f"{i}. {detail}\n\n" return response.strip() def generate_response( self, prompt: str, context: str = "", use_case: str = "explanation", metadatas: List[Dict] = None, **kwargs ) -> str: """ Generate a NotebookLM-quality response with strict citations. Args: prompt: User query context: Retrieved context from documents use_case: Type of response (explanation, summary, qa,notes) metadatas: Metadata for each context chunk (for citations) Returns: Professional formatted response with inline citations """ if not context: return ( "I don't have enough information from your uploaded documents to answer this question. " "Please upload relevant study materials first, or try rephrasing your question." ) # Use specialized prompts based on use case if use_case == "summary": response = self._create_summary_with_citations(context, prompt, metadatas) elif use_case == "notes": response = self._create_notes_with_citations(context, prompt, metadatas) elif use_case == "qa": response = self._create_qa_with_citations(context, prompt, metadatas) else: # Default to explanation response = self._create_structured_response_with_citations(context, prompt, metadatas) return response def _create_structured_response_with_citations( self, context: str, query: str, metadatas: List[Dict] = None ) -> str: """Create NotebookLM-style response with inline citations.""" # Split into paragraphs paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50] # Remove duplicates unique_paras = [] seen = set() for para in paragraphs: para_key = para.lower()[:150] if para_key not in seen: unique_paras.append(para) seen.add(para_key) if len(unique_paras) >= 5: break if not unique_paras: return context[:1000] # Build response with citations response = "" # Main explanation (first paragraph - cleaned and formatted) main_para = self._clean_and_format_text(unique_paras[0]) main_para = self._apply_bold_formatting(main_para) # Add citation to end of main paragraph cite_text = self._get_citation(0, metadatas) if metadatas else "" response += main_para + cite_text + "\n\n" # Add structured details if more content available if len(unique_paras) > 1: response += "### Key Points:\n\n" for i, para in enumerate(unique_paras[1:4], 1): # Extract first 2-3 sentences sentences = [s.strip() for s in para.split('.') if len(s.strip()) > 20] if sentences: detail = self._clean_and_format_text('. '.join(sentences[:2]) + '.') detail = self._apply_bold_formatting(detail) # Add citation cite_text = self._get_citation(i, metadatas) if metadatas and i < len(metadatas) else "" response += f"{i}. {detail}{cite_text}\n\n" return response.strip() def _get_citation(self, index: int, metadatas: List[Dict] = None) -> str: """Generate inline citation from metadata.""" if not metadatas or index >= len(metadatas): return "" meta = metadatas[index] filename = meta.get('filename', 'Unknown') # Remove file extension for cleaner citation clean_name = filename.replace('.pdf', '').replace('.docx', '').replace('.txt', '') return f" **[{clean_name}]**" def _create_summary_with_citations( self, context: str, query: str, metadatas: List[Dict] = None ) -> str: """Create a summary with citations.""" sentences = [] seen = set() for s in context.split('.'): s_clean = s.strip() if len(s_clean) > 40 and s_clean.lower() not in seen: sentences.append(s_clean) seen.add(s_clean.lower()) if len(sentences) >= 6: break if not sentences: return context[:800] response = "## Summary\n\n" for i, point in enumerate(sentences, 1): cite = self._get_citation(i-1, metadatas) if metadatas else "" response += f"{i}. {point}.{cite}\n\n" return response.strip() def _create_qa_with_citations( self, context: str, query: str, metadatas: List[Dict] = None ) -> str: """Answer with strict source grounding.""" paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50] if not paragraphs: sentences = [s.strip() + '.' for s in context.split('.') if len(s.strip()) > 30] response = ' '.join(sentences[:6]) cite = self._get_citation(0, metadatas) if metadatas else "" return response + cite # Remove duplicates unique_paras = [] seen = set() for para in paragraphs: para_key = para.lower()[:150] if para_key not in seen: unique_paras.append(para) seen.add(para_key) if len(unique_paras) >= 3: break # Fix spacing and add citations response = unique_paras[0] if unique_paras else context[:800] response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response) cite = self._get_citation(0, metadatas) if metadatas else "" response += cite # Add supporting details if available if len(unique_paras) > 1: second_para = re.sub(r'([.!?])([A-Z])', r'\1 \2', unique_paras[1]) cite2 = self._get_citation(1, metadatas) if metadatas and len(metadatas) > 1 else "" response += "\n\n" + second_para + cite2 return response.strip() def _create_notes_with_citations( self, context: str, query: str, metadatas: List[Dict] = None ) -> str: """Create study notes with source attribution.""" sections = [s.strip() for s in context.split('\n\n') if len(s.strip()) > 40] # Remove duplicates unique_sections = [] seen = set() for section in sections: section_key = section.lower()[:100] if section_key not in seen: unique_sections.append(section) seen.add(section_key) if len(unique_sections) >= 6: break if not unique_sections: return context[:1000] response = "## Study Notes\n\n" for i, section in enumerate(unique_sections, 1): sentences = [s.strip() for s in section.split('.') if len(s.strip()) > 20] if sentences: heading = sentences[0] cite = self._get_citation(i-1, metadatas) if metadatas else "" response += f"### {i}. {heading}{cite}\n\n" for sent in sentences[1:3]: response += f"- {sent}\n" response += "\n" return response.strip() def _create_summary(self, context: str, query: str) -> str: """Create a clean summary from retrieved context.""" # Extract key sentences - remove duplicates sentences = [] seen = set() for s in context.split('.'): s_clean = s.strip() # Remove duplicates and filter short/low-quality sentences if len(s_clean) > 40 and s_clean.lower() not in seen: sentences.append(s_clean) seen.add(s_clean.lower()) if len(sentences) >= 6: break if not sentences: return context[:800] response = "## Summary\n\n" for i, point in enumerate(sentences, 1): response += f"{i}. {point}.\n\n" return response.strip() def _create_explanation(self, context: str, query: str) -> str: """Create a well-formatted explanation from retrieved context.""" # Remove duplicate paragraphs paragraphs = [] seen = set() for para in context.split('\n\n'): para_clean = para.strip() # Keep unique, substantial paragraphs if len(para_clean) > 50: para_lower = para_clean.lower()[:200] # Check first 200 chars for duplicates if para_lower not in seen: paragraphs.append(para_clean) seen.add(para_lower) if not paragraphs: # Fallback: split by sentence sentences = [s.strip() + '.' for s in context.split('.') if len(s.strip()) > 30] return ' '.join(sentences[:8]) # Build clean, formatted response with proper spacing response = "" # Add first paragraph as main explanation (ensure spacing between sentences) first_para = paragraphs[0] # Add space after punctuation if missing import re first_para = re.sub(r'([.!?])([A-Z])', r'\1 \2', first_para) response += first_para # Add additional details if available if len(paragraphs) > 1: response += "\n\n### Key Points:\n\n" for i, para in enumerate(paragraphs[1:4], 1): # Max 3 additional points # Extract first sentence as bullet sentences = [s.strip() for s in para.split('.') if len(s.strip()) > 20] if sentences: response += f"• {sentences[0]}.\n" if len(sentences) > 1 and len(sentences[1]) > 20: response += f" {sentences[1]}.\n" response += "\n" return response.strip() def _create_qa(self, context: str, query: str) -> str: """Answer a question with clean formatting.""" # Find most relevant paragraphs paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50] if not paragraphs: sentences = [s.strip() + '.' for s in context.split('.') if len(s.strip()) > 30] return ' '.join(sentences[:6]) # Remove duplicates unique_paras = [] seen = set() for para in paragraphs: para_key = para.lower()[:150] if para_key not in seen: unique_paras.append(para) seen.add(para_key) if len(unique_paras) >= 3: break # Fix spacing in response import re response = unique_paras[0] if unique_paras else context[:800] response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response) # Add supporting details if available if len(unique_paras) > 1: second_para = re.sub(r'([.!?])([A-Z])', r'\1 \2', unique_paras[1]) response += "\n\n" + second_para return response.strip() def _create_notes(self, context: str, query: str) -> str: """Create well-structured study notes.""" # Split and clean sections sections = [s.strip() for s in context.split('\n\n') if len(s.strip()) > 40] # Remove duplicates unique_sections = [] seen = set() for section in sections: section_key = section.lower()[:100] if section_key not in seen: unique_sections.append(section) seen.add(section_key) if len(unique_sections) >= 6: break if not unique_sections: return context[:1000] response = "## Study Notes\n\n" for i, section in enumerate(unique_sections, 1): # Extract key information sentences = [s.strip() for s in section.split('.') if len(s.strip()) > 20] if sentences: # Use first sentence as heading heading = sentences[0] response += f"### {i}. {heading}\n\n" # Add bullet points for remaining content for sent in sentences[1:3]: # Max 2 additional sentences response += f"- {sent}\n" response += "\n" return response.strip()