Spaces:

mohhhhhit
/

notebook-backend

Sleeping

File size: 17,148 Bytes

3736c33

"""

NotebookLM-style response generator with professional formatting.

"""

from typing import List, Dict
import config
import re


class SimpleGenerator:
    """Lightweight generator with NotebookLM-quality formatting."""
    
    def __init__(self):
        self.ready = True
    
    def _clean_and_format_text(self, text: str) -> str:
        """Clean and format text with proper spacing like NotebookLM."""
        # Fix spacing after punctuation
        text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)
        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)
        # Add proper line breaks after sentences
        text = re.sub(r'([.!?])\s+', r'\1\n\n', text)
        return text.strip()
    
    def _extract_key_terms(self, text: str) -> List[str]:
        """Extract key terms that should be bolded."""
        # Look for capitalized terms, technical terms
        terms = []
        
        # Find terms in quotes
        quoted = re.findall(r'"([^"]+)"', text)
        terms.extend(quoted)
        
        # Find repeated important words (appear 2+ times)
        words = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
        word_count = {}
        for word in words:
            word_count[word] = word_count.get(word, 0) + 1
        
        # Add words that appear multiple times
        terms.extend([w for w, count in word_count.items() if count >= 2])
        
        return list(set(terms))
    
    def _apply_bold_formatting(self, text: str) -> str:
        """Apply bold formatting to key terms like NotebookLM."""
        key_terms = self._extract_key_terms(text)
        
        # Bold key terms
        for term in key_terms:
            if len(term) > 3:  # Skip very short terms
                text = re.sub(rf'\b({re.escape(term)})\b', r'**\1**', text, count=1)
        
        # Bold specific patterns
        # Numbers with context
        text = re.sub(r'\b(\d+)\s+(observations?|years?|months?|quarters?)', r'**\1 \2**', text)
        
        return text
    
    def _create_structured_response(self, context: str, query: str) -> str:
        """Create a NotebookLM-style structured response."""
        # Split into paragraphs
        paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50]
        
        # Remove duplicates
        unique_paras = []
        seen = set()
        for para in paragraphs:
            para_key = para.lower()[:150]
            if para_key not in seen:
                unique_paras.append(para)
                seen.add(para_key)
                if len(unique_paras) >= 5:
                    break
        
        if not unique_paras:
            return context[:1000]
        
        # Build NotebookLM-style response
        response = ""
        
        # Main explanation (first paragraph - cleaned and formatted)
        main_para = self._clean_and_format_text(unique_paras[0])
        main_para = self._apply_bold_formatting(main_para)
        response += main_para + "\n\n"
        
        # Add structured details if more content available
        if len(unique_paras) > 1:
            response += "### Key Points:\n\n"
            
            for i, para in enumerate(unique_paras[1:4], 1):
                # Extract first 2-3 sentences
                sentences = [s.strip() for s in para.split('.') if len(s.strip()) > 20]
                if sentences:
                    detail = self._clean_and_format_text('. '.join(sentences[:2]) + '.')
                    detail = self._apply_bold_formatting(detail)
                    response += f"{i}. {detail}\n\n"
        
        return response.strip()
    
    def generate_response(

        self,

        prompt: str,

        context: str = "",

        use_case: str = "explanation",

        metadatas: List[Dict] = None,

        **kwargs

    ) -> str:
        """

        Generate a NotebookLM-quality response with strict citations.

        

        Args:

            prompt: User query

            context: Retrieved context from documents

            use_case: Type of response (explanation, summary, qa,notes)

            metadatas: Metadata for each context chunk (for citations)

            

        Returns:

            Professional formatted response with inline citations

        """
        if not context:
            return (
                "I don't have enough information from your uploaded documents to answer this question. "
                "Please upload relevant study materials first, or try rephrasing your question."
            )
        
        # Use specialized prompts based on use case
        if use_case == "summary":
            response = self._create_summary_with_citations(context, prompt, metadatas)
        elif use_case == "notes":
            response = self._create_notes_with_citations(context, prompt, metadatas)
        elif use_case == "qa":
            response = self._create_qa_with_citations(context, prompt, metadatas)
        else:  # Default to explanation
            response = self._create_structured_response_with_citations(context, prompt, metadatas)
        
        return response
    
    def _create_structured_response_with_citations(

        self, 

        context: str, 

        query: str,

        metadatas: List[Dict] = None

    ) -> str:
        """Create NotebookLM-style response with inline citations."""
        # Split into paragraphs
        paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50]
        
        # Remove duplicates
        unique_paras = []
        seen = set()
        for para in paragraphs:
            para_key = para.lower()[:150]
            if para_key not in seen:
                unique_paras.append(para)
                seen.add(para_key)
                if len(unique_paras) >= 5:
                    break
        
        if not unique_paras:
            return context[:1000]
        
        # Build response with citations
        response = ""
        
        # Main explanation (first paragraph - cleaned and formatted)
        main_para = self._clean_and_format_text(unique_paras[0])
        main_para = self._apply_bold_formatting(main_para)
        
        # Add citation to end of main paragraph
        cite_text = self._get_citation(0, metadatas) if metadatas else ""
        response += main_para + cite_text + "\n\n"
        
        # Add structured details if more content available
        if len(unique_paras) > 1:
            response += "### Key Points:\n\n"
            
            for i, para in enumerate(unique_paras[1:4], 1):
                # Extract first 2-3 sentences
                sentences = [s.strip() for s in para.split('.') if len(s.strip()) > 20]
                if sentences:
                    detail = self._clean_and_format_text('. '.join(sentences[:2]) + '.')
                    detail = self._apply_bold_formatting(detail)
                    
                    # Add citation
                    cite_text = self._get_citation(i, metadatas) if metadatas and i < len(metadatas) else ""
                    response += f"{i}. {detail}{cite_text}\n\n"
        
        return response.strip()
    
    def _get_citation(self, index: int, metadatas: List[Dict] = None) -> str:
        """Generate inline citation from metadata."""
        if not metadatas or index >= len(metadatas):
            return ""
        
        meta = metadatas[index]
        filename = meta.get('filename', 'Unknown')
        
        # Remove file extension for cleaner citation
        clean_name = filename.replace('.pdf', '').replace('.docx', '').replace('.txt', '')
        
        return f" **[{clean_name}]**"
    
    def _create_summary_with_citations(

        self, 

        context: str, 

        query: str,

        metadatas: List[Dict] = None

    ) -> str:
        """Create a summary with citations."""
        sentences = []
        seen = set()
        for s in context.split('.'):
            s_clean = s.strip()
            if len(s_clean) > 40 and s_clean.lower() not in seen:
                sentences.append(s_clean)
                seen.add(s_clean.lower())
                if len(sentences) >= 6:
                    break
        
        if not sentences:
            return context[:800]
        
        response = "## Summary\n\n"
        for i, point in enumerate(sentences, 1):
            cite = self._get_citation(i-1, metadatas) if metadatas else ""
            response += f"{i}. {point}.{cite}\n\n"
        
        return response.strip()
    
    def _create_qa_with_citations(

        self, 

        context: str, 

        query: str,

        metadatas: List[Dict] = None

    ) -> str:
        """Answer with strict source grounding."""
        paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50]
        
        if not paragraphs:
            sentences = [s.strip() + '.' for s in context.split('.') if len(s.strip()) > 30]
            response = ' '.join(sentences[:6])
            cite = self._get_citation(0, metadatas) if metadatas else ""
            return response + cite
        
        # Remove duplicates
        unique_paras = []
        seen = set()
        for para in paragraphs:
            para_key = para.lower()[:150]
            if para_key not in seen:
                unique_paras.append(para)
                seen.add(para_key)
                if len(unique_paras) >= 3:
                    break
        
        # Fix spacing and add citations
        response = unique_paras[0] if unique_paras else context[:800]
        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
        cite = self._get_citation(0, metadatas) if metadatas else ""
        response += cite
        
        # Add supporting details if available
        if len(unique_paras) > 1:
            second_para = re.sub(r'([.!?])([A-Z])', r'\1 \2', unique_paras[1])
            cite2 = self._get_citation(1, metadatas) if metadatas and len(metadatas) > 1 else ""
            response += "\n\n" + second_para + cite2
        
        return response.strip()
    
    def _create_notes_with_citations(

        self, 

        context: str, 

        query: str,

        metadatas: List[Dict] = None

    ) -> str:
        """Create study notes with source attribution."""
        sections = [s.strip() for s in context.split('\n\n') if len(s.strip()) > 40]
        
        # Remove duplicates
        unique_sections = []
        seen = set()
        for section in sections:
            section_key = section.lower()[:100]
            if section_key not in seen:
                unique_sections.append(section)
                seen.add(section_key)
                if len(unique_sections) >= 6:
                    break
        
        if not unique_sections:
            return context[:1000]
        
        response = "## Study Notes\n\n"
        
        for i, section in enumerate(unique_sections, 1):
            sentences = [s.strip() for s in section.split('.') if len(s.strip()) > 20]
            
            if sentences:
                heading = sentences[0]
                cite = self._get_citation(i-1, metadatas) if metadatas else ""
                response += f"### {i}. {heading}{cite}\n\n"
                
                for sent in sentences[1:3]:
                    response += f"- {sent}\n"
                response += "\n"
        
        return response.strip()
    
    def _create_summary(self, context: str, query: str) -> str:
        """Create a clean summary from retrieved context."""
        # Extract key sentences - remove duplicates
        sentences = []
        seen = set()
        for s in context.split('.'):
            s_clean = s.strip()
            # Remove duplicates and filter short/low-quality sentences
            if len(s_clean) > 40 and s_clean.lower() not in seen:
                sentences.append(s_clean)
                seen.add(s_clean.lower())
                if len(sentences) >= 6:
                    break
        
        if not sentences:
            return context[:800]
        
        response = "## Summary\n\n"
        for i, point in enumerate(sentences, 1):
            response += f"{i}. {point}.\n\n"
        
        return response.strip()
    
    def _create_explanation(self, context: str, query: str) -> str:
        """Create a well-formatted explanation from retrieved context."""
        # Remove duplicate paragraphs
        paragraphs = []
        seen = set()
        for para in context.split('\n\n'):
            para_clean = para.strip()
            # Keep unique, substantial paragraphs
            if len(para_clean) > 50:
                para_lower = para_clean.lower()[:200]  # Check first 200 chars for duplicates
                if para_lower not in seen:
                    paragraphs.append(para_clean)
                    seen.add(para_lower)
        
        if not paragraphs:
            # Fallback: split by sentence
            sentences = [s.strip() + '.' for s in context.split('.') if len(s.strip()) > 30]
            return ' '.join(sentences[:8])
        
        # Build clean, formatted response with proper spacing
        response = ""
        
        # Add first paragraph as main explanation (ensure spacing between sentences)
        first_para = paragraphs[0]
        # Add space after punctuation if missing
        import re
        first_para = re.sub(r'([.!?])([A-Z])', r'\1 \2', first_para)
        response += first_para
        
        # Add additional details if available
        if len(paragraphs) > 1:
            response += "\n\n### Key Points:\n\n"
            for i, para in enumerate(paragraphs[1:4], 1):  # Max 3 additional points
                # Extract first sentence as bullet
                sentences = [s.strip() for s in para.split('.') if len(s.strip()) > 20]
                if sentences:
                    response += f"• {sentences[0]}.\n"
                    if len(sentences) > 1 and len(sentences[1]) > 20:
                        response += f"  {sentences[1]}.\n"
                    response += "\n"
        
        return response.strip()
    
    def _create_qa(self, context: str, query: str) -> str:
        """Answer a question with clean formatting."""
        # Find most relevant paragraphs
        paragraphs = [p.strip() for p in context.split('\n\n') if len(p.strip()) > 50]
        
        if not paragraphs:
            sentences = [s.strip() + '.' for s in context.split('.') if len(s.strip()) > 30]
            return ' '.join(sentences[:6])
        
        # Remove duplicates
        unique_paras = []
        seen = set()
        for para in paragraphs:
            para_key = para.lower()[:150]
            if para_key not in seen:
                unique_paras.append(para)
                seen.add(para_key)
                if len(unique_paras) >= 3:
                    break
        
        # Fix spacing in response
        import re
        response = unique_paras[0] if unique_paras else context[:800]
        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)
        
        # Add supporting details if available
        if len(unique_paras) > 1:
            second_para = re.sub(r'([.!?])([A-Z])', r'\1 \2', unique_paras[1])
            response += "\n\n" + second_para
        
        return response.strip()
    
    def _create_notes(self, context: str, query: str) -> str:
        """Create well-structured study notes."""
        # Split and clean sections
        sections = [s.strip() for s in context.split('\n\n') if len(s.strip()) > 40]
        
        # Remove duplicates
        unique_sections = []
        seen = set()
        for section in sections:
            section_key = section.lower()[:100]
            if section_key not in seen:
                unique_sections.append(section)
                seen.add(section_key)
                if len(unique_sections) >= 6:
                    break
        
        if not unique_sections:
            return context[:1000]
        
        response = "## Study Notes\n\n"
        
        for i, section in enumerate(unique_sections, 1):
            # Extract key information
            sentences = [s.strip() for s in section.split('.') if len(s.strip()) > 20]
            
            if sentences:
                # Use first sentence as heading
                heading = sentences[0]
                response += f"### {i}. {heading}\n\n"
                
                # Add bullet points for remaining content
                for sent in sentences[1:3]:  # Max 2 additional sentences
                    response += f"- {sent}\n"
                response += "\n"
        
        return response.strip()