# ===============================================
# SCHOLAR LENS - RAG STREAMLIT APP
# ===============================================

# ===============================================
# IMPORTS & CONFIGURATION
# ===============================================
import streamlit as st  # Main web app framework
import os  # For environment variables
import pypdf  # For PDF text extraction
import numpy as np  # For numerical operations
import chromadb  # Vector database for storing embeddings
from sentence_transformers import SentenceTransformer  # For creating text embeddings
import google.generativeai as genai  # For the Gemini LLM API
from typing import List, Dict, Any, Optional  # Type hints for better code clarity
import re  # For text processing

# EXPLANATION
# This section imports all the libraries we need. Streamlit creates our web interface,
# pypdf reads PDF files, sentence-transformers creates embeddings (numerical
# representations of text), ChromaDB stores and searches those embeddings, and
# google.generativeai connects to the Gemini API.

# ===============================================
# CONFIGURABLE CONSTANTS
# ===============================================
SIMILARITY_THRESHOLD = 0.25  # Minimum similarity score for a chunk to count as relevant
TOP_K_CHUNKS = 3  # Number of most relevant chunks to retrieve
CHUNK_SIZE = 300  # Target number of words per text chunk
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Free embedding model

# EXPLANATION
# These are settings you can easily change. SIMILARITY_THRESHOLD controls how relevant
# a piece of text must be to your question: lower values (like 0.1) are more lenient,
# higher values (like 0.5) are stricter. TOP_K_CHUNKS is how many pieces of text
# the app will consider when answering your question.

# ===============================================
# PDF EXTRACTION FUNCTION
# ===============================================
def extract_text_from_pdf(pdf_file) -> Dict[str, Any]:
    """Extract text from an uploaded PDF file, keeping page numbers."""
    try:
        pdf_reader = pypdf.PdfReader(pdf_file)  # Create the PDF reader object
        pages_text = []  # List to store the text of each page
        for page_num, page in enumerate(pdf_reader.pages):  # Loop through each page
            page_text = page.extract_text()  # Extract text from the current page
            if page_text.strip():  # Only keep non-empty pages
                pages_text.append({
                    'page_number': page_num + 1,  # Page numbers start from 1
                    'text': page_text.strip()  # Remove surrounding whitespace
                })
        return {
            'success': True,  # Indicate successful extraction
            'pages': pages_text,  # List of page dictionaries
            'total_pages': len(pages_text)  # Number of non-empty pages extracted
        }
    except Exception as e:  # Handle any errors during PDF processing
        return {
            'success': False,  # Indicate failure
            'error': str(e)  # Store the error message
        }

# EXPLANATION
# This function turns a PDF file into plain text that the rest of the program can use.
# It goes through each page one by one, extracts the text, and records which page
# each piece of text came from. This is important for citing sources later.
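# EXAMPLE
# A minimal sketch, not called anywhere in the app, of the dictionary shape that
# extract_text_from_pdf returns. The helper name and the file path "paper.pdf"
# are hypothetical; in the app, Streamlit's uploader supplies the file object.
def _demo_extract_shape(path: str = "paper.pdf") -> None:
    with open(path, "rb") as f:  # pypdf accepts file-like objects
        result = extract_text_from_pdf(f)
    if result['success']:
        # e.g. {'success': True, 'pages': [{'page_number': 1, 'text': '...'}, ...], 'total_pages': 12}
        print(f"Extracted {result['total_pages']} non-empty pages")
        print(result['pages'][0]['text'][:100])  # First 100 characters of page 1
    else:
        print(f"Extraction failed: {result['error']}")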
# ===============================================
# CHUNKING FUNCTION
# ===============================================
def create_chunks(pages_text: List[Dict]) -> List[Dict]:
    """Split text into smaller chunks while preserving page information."""
    chunks = []  # List to store all text chunks
    chunk_id = 0  # Unique identifier for each chunk
    for page_data in pages_text:  # Process each page
        page_num = page_data['page_number']  # Get the page number
        text = page_data['text']  # Get the page text
        words = text.split()  # Split the text into individual words
        # Create chunks of approximately CHUNK_SIZE words
        for i in range(0, len(words), CHUNK_SIZE):  # Step through the words in chunks
            chunk_words = words[i:i + CHUNK_SIZE]  # Get the next group of words
            chunk_text = ' '.join(chunk_words)  # Join the words back into text
            if len(chunk_words) > 20:  # Only keep substantial chunks (more than 20 words)
                chunks.append({
                    'id': chunk_id,  # Unique chunk identifier
                    'text': chunk_text,  # The actual text content
                    'page_number': page_num,  # Which page this came from
                    'word_count': len(chunk_words)  # How many words are in this chunk
                })
                chunk_id += 1  # Increment for the next chunk
    return chunks  # Return the list of all chunks

# EXPLANATION
# This function breaks down long pages of text into smaller, manageable pieces called
# chunks. Think of it like cutting a long article into paragraphs. Each chunk remembers
# which page it came from. This helps the AI find relevant information more accurately
# and cite sources properly.

# ===============================================
# EMBEDDING MODEL LOADING FUNCTION
# ===============================================
@st.cache_resource  # Cache the model so it only loads once
def load_embedding_model():
    """Load the sentence-transformer model used to create embeddings."""
    try:
        model = SentenceTransformer(EMBEDDING_MODEL)  # Load the embedding model
        return model  # Return the loaded model
    except Exception as e:  # Handle loading errors
        st.error(f"Failed to load embedding model: {e}")  # Show the error to the user
        return None  # Return None to indicate failure

# EXPLANATION
# This function loads the AI model that converts text into numbers (embeddings).
# These numbers capture the meaning of the text, allowing the computer to judge
# which pieces of text are similar to your question. The @st.cache_resource decorator
# ensures the model only loads once, making the app faster.
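# EXAMPLE
# A minimal sketch, not called anywhere in the app, of how create_chunks splits a page.
# The page below is fabricated: 650 repeated words, so with CHUNK_SIZE = 300 we expect
# chunks of 300, 300, and 50 words, all above the 20-word minimum.
def _demo_chunking() -> None:
    fake_pages = [{'page_number': 1, 'text': ' '.join(['word'] * 650)}]
    for chunk in create_chunks(fake_pages):
        print(chunk['id'], chunk['page_number'], chunk['word_count'])
    # Prints:
    #   0 1 300
    #   1 1 300
    #   2 1 50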
# ===============================================
# VECTOR DATABASE CREATION & QUERY FUNCTIONS
# ===============================================
def create_vector_database(chunks: List[Dict], embedding_model) -> Optional[Any]:
    """Create a ChromaDB vector database with embeddings for every chunk."""
    try:
        client = chromadb.Client()  # Create the ChromaDB client
        try:
            client.delete_collection("pdf_chunks")  # Drop any collection left from a previous upload
        except Exception:
            pass  # No existing collection to remove
        collection = client.create_collection("pdf_chunks")  # Create a fresh collection for our chunks
        texts = [chunk['text'] for chunk in chunks]  # Extract the text from each chunk
        embeddings = embedding_model.encode(texts).tolist()  # Convert the texts to embeddings
        # Add the chunks to the database with embeddings and metadata
        collection.add(
            embeddings=embeddings,  # The numerical representations
            documents=texts,  # The actual text content
            metadatas=[{  # Additional information about each chunk
                'page_number': chunk['page_number'],
                'chunk_id': chunk['id'],
                'word_count': chunk['word_count']
            } for chunk in chunks],
            ids=[str(chunk['id']) for chunk in chunks]  # Unique identifiers
        )
        return collection  # Return the created database
    except Exception as e:  # Handle database creation errors
        st.error(f"Failed to create vector database: {e}")  # Show the error to the user
        return None  # Return None to indicate failure

def query_vector_database(collection, query: str, embedding_model,
                          k: int = TOP_K_CHUNKS) -> List[Dict]:
    """Query the vector database for chunks relevant to the question."""
    try:
        query_embedding = embedding_model.encode([query]).tolist()  # Convert the question to an embedding
        results = collection.query(  # Search the database
            query_embeddings=query_embedding,
            n_results=k  # Get the top k most similar chunks
        )
        relevant_chunks = []  # List to store the results
        for i in range(len(results['documents'][0])):  # Loop through the returned documents
            distance = results['distances'][0][i]  # Get the distance to the query
            similarity = max(0, 1 - distance)  # Convert the distance to a similarity score
            # Only include chunks that meet our similarity threshold
            if similarity >= SIMILARITY_THRESHOLD:
                relevant_chunks.append({
                    'text': results['documents'][0][i],  # The chunk text
                    'page_number': results['metadatas'][0][i]['page_number'],  # Source page
                    'similarity': similarity,  # How relevant this chunk is
                    'chunk_id': results['metadatas'][0][i]['chunk_id']  # Unique identifier
                })
        return relevant_chunks  # Return the list of relevant chunks
    except Exception as e:  # Handle query errors
        st.error(f"Failed to query database: {e}")  # Show the error to the user
        return []  # Return an empty list

# EXPLANATION
# These functions create and search our vector database. The database stores the meaning
# of each text chunk as numbers. When you ask a question, it converts your question to
# numbers and finds chunks with similar numbers (similar meaning). The similarity threshold
# determines how closely related the text must be to your question.
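# EXAMPLE
# A minimal sketch, not called anywhere in the app, of the distance-to-similarity
# conversion used in query_vector_database. The distances are made up; with
# SIMILARITY_THRESHOLD = 0.25, the first two would be kept and the third dropped.
def _demo_similarity_filter() -> None:
    for distance in (0.3, 0.7, 1.2):  # Hypothetical ChromaDB distances
        similarity = max(0, 1 - distance)  # Same formula as above
        kept = similarity >= SIMILARITY_THRESHOLD
        print(f"distance={distance:.1f} -> similarity={similarity:.2f} kept={kept}")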
# ===============================================
# LLM WRAPPER FOR GEMINI
# ===============================================
def setup_gemini() -> bool:
    """Configure the Gemini API if an API key is available."""
    try:
        # Try to get the API key from Streamlit secrets first
        api_key = st.secrets.get("GEMINI_API_KEY")  # Check secrets
        if not api_key:  # If not in secrets, try the environment variable
            api_key = os.getenv("GEMINI_API_KEY")  # Check the environment
        if api_key:  # If we found an API key
            genai.configure(api_key=api_key)  # Configure Gemini with the key
            return True  # Indicate success
        else:
            return False  # No API key found
    except Exception as e:  # Handle setup errors
        st.error(f"Failed to set up Gemini: {e}")  # Show the error to the user
        return False  # Indicate failure

def generate_answer_with_gemini(query: str, relevant_chunks: List[Dict]) -> str:
    """Generate an answer with Gemini, using the retrieved chunks as context."""
    try:
        # Build the context from the relevant chunks
        context_parts = []  # List used to assemble the context
        for chunk in relevant_chunks:  # Add each relevant chunk to the context
            context_parts.append(f"[Page {chunk['page_number']}]: {chunk['text']}")
        context = "\n\n".join(context_parts)  # Join all context parts

        # Create the prompt for Gemini
        prompt = f"""Based ONLY on the following context from a PDF document, answer the user's question.

Context:
{context}

Question: {query}

Instructions:
- Answer using ONLY the information provided in the context above
- If the context does not contain enough information to answer the question, reply exactly: ❌ Insufficient evidence
- Always include page citations in your answer using the format [Page X]
- Be accurate and concise
- Do not add information not present in the context

Answer:"""

        model = genai.GenerativeModel('gemini-pro')  # Create a Gemini model instance
        response = model.generate_content(  # Generate the response
            prompt,
            generation_config=genai.types.GenerationConfig(
                temperature=0.1,  # Low temperature for consistent, factual responses
                max_output_tokens=500  # Limit the response length
            )
        )
        return response.text  # Return the generated answer
    except Exception as e:  # Handle generation errors
        return f"Error generating answer: {str(e)}"  # Return the error message

# EXPLANATION
# These functions handle the AI that generates answers. Gemini reads your question and
# the relevant chunks we found, then writes an answer based only on that information.
# The low temperature setting makes the model more factual and less creative, which is
# important for accurate answers.

# ===============================================
# ANSWER GENERATION FUNCTION
# ===============================================
def generate_answer(query: str, relevant_chunks: List[Dict]) -> str:
    """Main function for generating answers with whichever LLM is available."""
    if not relevant_chunks:  # If no relevant chunks were found
        return "❌ Insufficient evidence"  # Return the standard message
    if setup_gemini():  # Use Gemini if it is configured
        return generate_answer_with_gemini(query, relevant_chunks)
    # Fallback response when no LLM is available
    return "❌ No LLM configured. Please add GEMINI_API_KEY to your secrets."

# EXPLANATION
# This is the main function that decides which AI to use for generating answers.
# It first checks whether any relevant information was found, then tries to use Gemini.
# If no LLM is available, it asks you to add an API key.
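# EXAMPLE
# A minimal sketch, not called anywhere in the app, of the context string that
# generate_answer_with_gemini builds before prompting the model. The two chunks
# below are fabricated.
def _demo_context_format() -> None:
    chunks = [
        {'page_number': 2, 'text': 'The study surveyed 120 participants.'},
        {'page_number': 5, 'text': 'Results were significant at p < 0.05.'},
    ]
    context = "\n\n".join(f"[Page {c['page_number']}]: {c['text']}" for c in chunks)
    print(context)
    # [Page 2]: The study surveyed 120 participants.
    #
    # [Page 5]: Results were significant at p < 0.05.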
# ===============================================
# STREAMLIT UI
# ===============================================
def main():
    """Main Streamlit application."""
    # Page configuration
    st.set_page_config(  # Configure the web page
        page_title="Scholar Lens",  # Browser tab title
        page_icon="🔍",  # Browser tab icon
        layout="wide"  # Use the full width of the browser
    )

    # Header
    st.title("🔍 Scholar Lens")  # Main app title
    st.markdown("**AI-Powered Document Q&A System**")  # Subtitle
    st.markdown("Upload a PDF and ask questions about its content!")  # Description

    # Initialize session state for storing data across reruns
    if 'vector_db' not in st.session_state:  # Check if the database exists in the session
        st.session_state.vector_db = None  # Initialize as None
    if 'embedding_model' not in st.session_state:  # Check if the model exists in the session
        st.session_state.embedding_model = None  # Initialize as None

    # Load the embedding model
    if st.session_state.embedding_model is None:  # If the model is not loaded yet
        with st.spinner("Loading embedding model..."):  # Show a loading spinner
            st.session_state.embedding_model = load_embedding_model()  # Load the model

    # File upload section
    st.header("📄 Upload Your PDF")  # Section header
    uploaded_file = st.file_uploader(  # File upload widget
        "Choose a PDF file",
        type="pdf",  # Only allow PDF files
        help="Upload a PDF document to analyze"  # Help text
    )

    # Process the uploaded file
    if uploaded_file is not None:  # If the user uploaded a file
        if st.button("🔄 Process PDF"):  # Process button
            with st.spinner("Processing PDF..."):  # Show a processing spinner
                pdf_result = extract_text_from_pdf(uploaded_file)  # Extract the text
                if pdf_result['success']:  # If extraction succeeded
                    st.success(f"✅ Successfully processed {pdf_result['total_pages']} pages")
                    chunks = create_chunks(pdf_result['pages'])  # Split the text into chunks
                    st.info(f"📝 Created {len(chunks)} text chunks")  # Show the chunk count
                    if st.session_state.embedding_model:  # If the embedding model is loaded
                        st.session_state.vector_db = create_vector_database(  # Create the database
                            chunks, st.session_state.embedding_model
                        )
                        if st.session_state.vector_db:  # If the database was created successfully
                            st.success("✅ Vector database created successfully!")
                        else:
                            st.error("❌ Failed to create vector database")
                    else:
                        st.error("❌ Embedding model not available")
                else:
                    st.error(f"❌ Failed to process PDF: {pdf_result['error']}")

    # Question answering section
    if st.session_state.vector_db is not None:  # If the database is ready
        st.header("❓ Ask Questions")  # Section header
        question = st.text_input(  # Text input for questions
            "Enter your question:",
            placeholder="What is the main topic of this document?",  # Placeholder text
            help="Ask specific questions about the content of your PDF"  # Help text
        )
        if st.button("🔍 Ask") and question.strip():  # Ask button with a non-empty question
            with st.spinner("Finding answer..."):  # Show a searching spinner
                relevant_chunks = query_vector_database(  # Search for relevant chunks
                    st.session_state.vector_db,
                    question,
                    st.session_state.embedding_model
                )
                if relevant_chunks:  # If relevant chunks were found
                    answer = generate_answer(question, relevant_chunks)  # Get the AI answer
                    st.subheader("💬 Answer")  # Answer section header
                    st.write(answer)  # Display the answer
                    st.subheader("📚 Sources")  # Sources section header
                    for i, chunk in enumerate(relevant_chunks):  # Loop through the source chunks
                        with st.expander(f"Source {i+1} - Page {chunk['page_number']} (Similarity: {chunk['similarity']:.2f})"):  # Expandable source
                            st.write(chunk['text'])  # Display the chunk text
                else:
                    st.warning("❌ Insufficient evidence")  # No relevant chunks found
    else:
        st.info("👆 Please upload and process a PDF to start asking questions")  # Instruction message

    # Configuration section in the sidebar
    with st.sidebar:
        st.header("⚙️ Configuration")  # Sidebar header
        st.write(f"**Similarity Threshold:** {SIMILARITY_THRESHOLD}")  # Current threshold
        st.write(f"**Top K Chunks:** {TOP_K_CHUNKS}")  # Current top k
        st.write(f"**Chunk Size:** {CHUNK_SIZE} words")  # Current chunk size
        st.markdown("---")  # Horizontal separator
        st.markdown("**How to adjust settings:**")  # Instructions header
        st.markdown("- Edit constants at the top of `app.py`")
        st.markdown("- Lower threshold = more lenient matching")
        st.markdown("- Higher threshold = stricter matching")

        # Debug section
        st.markdown("---")  # Horizontal separator
        st.header("🔧 Debug Info")  # Debug section header
        # Check whether secrets are accessible
        try:
            if hasattr(st, 'secrets'):  # Check that the secrets object exists
                available_secrets = list(st.secrets.keys()) if st.secrets else []  # Get the secret keys
                st.write(f"**Available secrets:** {len(available_secrets)}")  # Show the count
                if 'GEMINI_API_KEY' in available_secrets:  # Check whether our key exists
                    st.success("✅ GEMINI_API_KEY found in secrets")  # Success message
                else:
                    st.error("❌ GEMINI_API_KEY not found in secrets")  # Error message
                    st.write(f"Available keys: {available_secrets}")  # Show what is available
            else:
                st.warning("⚠️ st.secrets not accessible")  # Warning message
        except Exception as e:
            st.error(f"Debug error: {e}")  # Show debug errors

    # Professional footer
    st.markdown("""
""", unsafe_allow_html=True) # Display centered professional footer # EXPLANATION # The UI improvements transform the app into a professional, modern interface: # 1. **Centered Layout**: Custom CSS limits max-width to 900px and centers content # 2. **Professional Theme**: Gradient header, white containers with shadows, soft gray background # 3. **Chat-Style Display**: Answers appear in rounded, shaded containers like chat bubbles # 4. **Enhanced Sidebar**: Organized into clear sections (About App, Settings, Developer info) # 5. **Better Visual Hierarchy**: Icons, proper spacing, styled buttons and containers # 6. **Responsive Design**: Uses columns for button centering and better mobile experience # 7. **Color Coding**: Different colors for different types of information (success=green, warning=yellow) # These changes make the app more visually appealing and professional while keeping all backend logic intact. # EXPLANATION # This is the main user interface of our app. It creates the web page with sections for # uploading PDFs, processing them, asking questions, and showing answers. The sidebar # shows current settings. Everything is organized in a logical flow from upload to Q&A. # =============================================== # RUN THE APPLICATION # =============================================== if __name__ == "__main__": # Only run if this file is executed directly main() # Start the Streamlit app # EXPLANATION # This final section starts our app when the Python file is run. It's like pressing # the "start" button for our Scholar Lens application.