Spaces:

anamjafar6
/

study

Sleeping

App Files Files Community

anamjafar6 committed on Sep 27, 2025

Commit

b38cebf

verified ·

1 Parent(s): 7357dc8

Update app.py

Browse files

Files changed (1) hide show

app.py +420 -103

app.py CHANGED Viewed

@@ -1,178 +1,495 @@
-import streamlit as st
-import os
-import pypdf
-import chromadb
-from sentence_transformers import SentenceTransformer
-from groq import Groq
-from typing import List, Dict, Any, Optional
-# CONFIG
-SIMILARITY_THRESHOLD = 0.2
-TOP_K_CHUNKS = 3
-CHUNK_SIZE = 300
-EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-# PDF extraction
 def extract_text_from_pdf(pdf_file) -> Dict[str, Any]:
     try:
-        pdf_reader = pypdf.PdfReader(pdf_file)
-        pages_text = []
-        for page_num, page in enumerate(pdf_reader.pages):
-            page_text = page.extract_text()
-            if page_text and page_text.strip():
                 pages_text.append({
-                    'page_number': page_num + 1,
-                    'text': page_text.strip()
                 })
-        return {"success": True, "pages": pages_text, "total_pages": len(pages_text)}
-    except Exception as e:
-        return {"success": False, "error": str(e)}
-# Chunking
 def create_chunks(pages_text: List[Dict]) -> List[Dict]:
     chunks = []
     chunk_id = 0
     for page_data in pages_text:
-        words = page_data['text'].split()
         for i in range(0, len(words), CHUNK_SIZE):
             chunk_words = words[i:i + CHUNK_SIZE]
-            if len(chunk_words) > 20:
                 chunks.append({
-                    "id": chunk_id,
-                    "text": " ".join(chunk_words),
-                    "page_number": page_data['page_number'],
-                    "word_count": len(chunk_words)
                 })
                 chunk_id += 1
     return chunks
-# Embedding model
 @st.cache_resource
 def load_embedding_model():
-    return SentenceTransformer(EMBEDDING_MODEL)
-# Vector database
 def create_vector_database(chunks: List[Dict], embedding_model) -> Optional[Any]:
     try:
         client = chromadb.Client()
-        # use get_or_create instead of create
-        collection = client.get_or_create_collection("pdf_chunks")
-        texts = [c['text'] for c in chunks]
         embeddings = embedding_model.encode(texts).tolist()
         collection.add(
             embeddings=embeddings,
             documents=texts,
             metadatas=[{
-                "page_number": c["page_number"],
-                "chunk_id": c["id"],
-                "word_count": c["word_count"]
-            } for c in chunks],
-            ids=[str(c["id"]) for c in chunks]
         )
         return collection
     except Exception as e:
-        st.error(f"Vector DB error: {e}")
         return None
 def query_vector_database(collection, query: str, embedding_model, k: int = TOP_K_CHUNKS) -> List[Dict]:
     try:
-        query_emb = embedding_model.encode([query]).tolist()
-        results = collection.query(query_embeddings=query_emb, n_results=k)
         relevant_chunks = []
-        for i in range(len(results['documents'][0])):
-            distance = results['distances'][0][i]
-            similarity = max(0, 1 - distance)
             if similarity >= SIMILARITY_THRESHOLD:
                 relevant_chunks.append({
-                    "text": results['documents'][0][i],
-                    "page_number": results['metadatas'][0][i]["page_number"],
-                    "similarity": similarity,
-                    "chunk_id": results['metadatas'][0][i]["chunk_id"]
                 })
         return relevant_chunks
     except Exception as e:
-        st.error(f"Query error: {e}")
         return []
-# Groq setup
 def setup_groq():
-    api_key = st.secrets.get("GROQ_API_KEY") or os.getenv("GROQ_API_KEY")
     if not api_key:
-        st.error("❌ No GROQ_API_KEY found. Please add it to secrets or env.")
         return None
-    return Groq(api_key=api_key)
 def generate_answer_with_groq(client, query: str, relevant_chunks: List[Dict]) -> str:
     try:
-        context = "\n\n".join([f"[Page {c['page_number']}]: {c['text']}" for c in relevant_chunks])
-        prompt = f"""
-        Based ONLY on the following context from a PDF document, answer the user's question.
-        Context:
-        {context}
-        Question: {query}
-        Instructions:
-        - Answer ONLY using info from the context above
-        - If not enough info, reply: ❌ Insufficient evidence
-        - Always include page citations like [Page X]
-        """
-        chat = client.chat.completions.create(
             model="llama3-8b-8192",
             messages=[
-                {"role": "system", "content": "You are a helpful tutor AI."},
                 {"role": "user", "content": prompt}
             ],
             temperature=0.1,
             max_tokens=500
         )
-        return chat.choices[0].message.content
     except Exception as e:
         return f"Error generating answer: {e}"
-# Main answer pipeline
 def generate_answer(query: str, relevant_chunks: List[Dict]) -> str:
     if not relevant_chunks:
         return "❌ Insufficient evidence"
     client = setup_groq()
-    if client:
-        return generate_answer_with_groq(client, query, relevant_chunks)
-    return "❌ No LLM configured."
-# -----------------------------
-# STREAMLIT MAIN
-# -----------------------------
 def main():
-    st.set_page_config(page_title="PageMentor", layout="wide")
-    st.title("📚 PageMentor")
-    if "vector_db" not in st.session_state:
         st.session_state.vector_db = None
-        st.session_state.embedding_model = load_embedding_model()
-    uploaded_file = st.file_uploader("Upload PDF", type="pdf")
-    if uploaded_file and st.button("🚀 Process PDF"):
-        pdf_result = extract_text_from_pdf(uploaded_file)
-        if pdf_result["success"]:
-            chunks = create_chunks(pdf_result["pages"])
-            st.session_state.vector_db = create_vector_database(chunks, st.session_state.embedding_model)
-            if st.session_state.vector_db:
-                st.success(f"✅ Processed {pdf_result['total_pages']} pages, {len(chunks)} chunks ready!")
-        else:
-            st.error(pdf_result["error"])
-    if st.session_state.vector_db:
-        query = st.text_input("Ask a question:")
-        if query and st.button("🔍 Get Answer"):
-            relevant_chunks = query_vector_database(st.session_state.vector_db, query, st.session_state.embedding_model)
-            answer = generate_answer(query, relevant_chunks)
-            st.markdown("### 🎯 Answer")
-            st.write(answer)
 if __name__ == "__main__":
     main()

+# PAGEMENTOR - ENHANCED UI/UX RAG STREAMLIT APP
+# IMPORTS & CONFIGURATION
+import streamlit as st  # Main web app framework
+import os  # For environment variables
+import pypdf  # For PDF text extraction
+import numpy as np  # For numerical operations
+import chromadb  # Vector database for storing embeddings
+from sentence_transformers import SentenceTransformer  # For creating text embeddings
+# Groq client (LLM) - will be used if available
+try:
+    from groq import Groq
+except Exception:
+    Groq = None
+from typing import List, Dict, Any, Optional  # Type hints for better code clarity
+import re  # For text processing
+from uuid import uuid4
+import time
+# CONFIGURABLE CONSTANTS
+SIMILARITY_THRESHOLD = 0.2  # Slightly lower so relevant chunks are not missed
+TOP_K_CHUNKS = 3  # Number of most relevant chunks to retrieve
+CHUNK_SIZE = 300  # Target number of words per text chunk
+EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Free embedding model
+# PDF EXTRACTION FUNCTION
 def extract_text_from_pdf(pdf_file) -> Dict[str, Any]:
+    """Extract text from uploaded PDF file with page numbers."""
     try:
+        pdf_reader = pypdf.PdfReader(pdf_file)  # Create PDF reader object
+        pages_text = []  # List to store text from each page
+        for page_num, page in enumerate(pdf_reader.pages):  # Loop through each page
+            page_text = page.extract_text() or ""  # Extract text (may return None)
+            if page_text and page_text.strip():  # Only add non-empty pages
                 pages_text.append({
+                    'page_number': page_num + 1,  # Page numbers start from 1
+                    'text': page_text.strip()  # Remove extra whitespace
                 })
+        return {
+            'success': True,
+            'pages': pages_text,
+            'total_pages': len(pages_text)
+        }
+    except Exception as e:  # Handle any errors during PDF processing
+        return {
+            'success': False,
+            'error': str(e)
+        }
+# CHUNKING FUNCTION
 def create_chunks(pages_text: List[Dict]) -> List[Dict]:
+    """Split text into smaller chunks while preserving page information."""
     chunks = []
     chunk_id = 0
     for page_data in pages_text:
+        page_num = page_data['page_number']
+        text = page_data['text']
+        words = text.split()
+        # Create chunks of approximately CHUNK_SIZE words
         for i in range(0, len(words), CHUNK_SIZE):
             chunk_words = words[i:i + CHUNK_SIZE]
+            chunk_text = ' '.join(chunk_words)
+            if len(chunk_words) > 20:  # Only keep substantial chunks (more than 20 words)
                 chunks.append({
+                    'id': chunk_id,
+                    'text': chunk_text,
+                    'page_number': page_num,
+                    'word_count': len(chunk_words)
                 })
                 chunk_id += 1
     return chunks
+# EMBEDDING LOADING FUNCTION
 @st.cache_resource
 def load_embedding_model():
+    """Load the sentence transformer model for creating embeddings."""
+    try:
+        model = SentenceTransformer(EMBEDDING_MODEL)
+        return model
+    except Exception as e:
+        st.error(f"Failed to load embedding model: {e}")
+        return None
+# VECTOR DATABASE CREATION & QUERY FUNCTIONS
 def create_vector_database(chunks: List[Dict], embedding_model) -> Optional[Any]:
+    """Create ChromaDB vector database with embeddings.
+    FIXES:
+    - Use a unique collection name per uploaded file to avoid "already exists" errors.
+    - Store collection reference and name in session_state so later queries use the right collection.
+    """
     try:
         client = chromadb.Client()
+        # create a unique collection name per upload to avoid conflicts
+        collection_name = f"pdf_chunks_{uuid4().hex[:8]}"
+        collection = client.create_collection(collection_name)
+        texts = [chunk['text'] for chunk in chunks]
         embeddings = embedding_model.encode(texts).tolist()
+        # Add chunks to database with embeddings and metadata
         collection.add(
             embeddings=embeddings,
             documents=texts,
             metadatas=[{
+                'page_number': chunk['page_number'],
+                'chunk_id': chunk['id'],
+                'word_count': chunk['word_count']
+            } for chunk in chunks],
+            ids=[str(chunk['id']) for chunk in chunks]
         )
+        # store collection name in session state so queries can reference it
+        st.session_state.collection_name = collection_name
         return collection
     except Exception as e:
+        st.error(f"Failed to create vector database: {e}")
         return None
 def query_vector_database(collection, query: str, embedding_model, k: int = TOP_K_CHUNKS) -> List[Dict]:
+    """Query the vector database for relevant chunks."""
     try:
+        query_embedding = embedding_model.encode([query]).tolist()
+        results = collection.query(
+            query_embeddings=query_embedding,
+            n_results=k
+        )
         relevant_chunks = []
+        # Chroma returns lists in results; careful with indexing
+        docs = results.get('documents', [])
+        dists = results.get('distances', [])
+        metas = results.get('metadatas', [])
+        if not docs:
+            return []
+        for i in range(len(docs[0])):
+            distance = dists[0][i] if dists else 0
+            # Convert distance to similarity (works if distances in [0,1])
+            similarity = max(0, 1 - distance) if isinstance(distance, (int, float)) else 0
             if similarity >= SIMILARITY_THRESHOLD:
                 relevant_chunks.append({
+                    'text': docs[0][i],
+                    'page_number': metas[0][i].get('page_number') if metas else None,
+                    'similarity': similarity,
+                    'chunk_id': metas[0][i].get('chunk_id') if metas else None
                 })
         return relevant_chunks
     except Exception as e:
+        st.error(f"Failed to query database: {e}")
         return []
+# LLM WRAPPER FOR GROQ
 def setup_groq():
+    """Configure Groq client using GROQ_API_KEY from secrets or env."""
+    api_key = None
+    # Hugging Face / Streamlit secrets: try st.secrets first (HF sets as env, but we'll check both)
+    try:
+        api_key = st.secrets.get('GROQ_API_KEY')  # type: ignore
+    except Exception:
+        api_key = None
+    if not api_key:
+        api_key = os.getenv('GROQ_API_KEY')
     if not api_key:
+        st.error("❌ GROQ_API_KEY not found. Please add it to Hugging Face secrets or environment variables.")
         return None
+    if Groq is None:
+        st.error("❌ groq package not installed or failed to import. Add 'groq' to requirements.txt")
+        return None
+    try:
+        client = Groq(api_key=api_key)
+        return client
+    except Exception as e:
+        st.error(f"Failed to initialize Groq client: {e}")
+        return None
 def generate_answer_with_groq(client, query: str, relevant_chunks: List[Dict]) -> str:
+    """Generate answer using Groq (chat/completions). Keep prompt strict to only use context.
+    NOTE: Groq client libraries and method names can change. This implementation uses a generic
+    chat completions call pattern; when deploying, if the Groq client exposes a different API you may need
+    to adjust the call accordingly. We surface clear error messages to help debugging.
+    """
     try:
+        # Build strict context with page citations
+        context_parts = [f"[Page {c['page_number']}]: {c['text']}" for c in relevant_chunks]
+        context = "\n\n".join(context_parts)
+        prompt = f"""Based ONLY on the following context from a PDF document, answer the user's question.
+Context:
+{context}
+Question: {query}
+Instructions:
+- Answer using ONLY the information provided in the context above
+- If the context does not contain enough information to answer the question, reply exactly: ❌ Insufficient evidence
+- Always include page citations in your answer using the format [Page X]
+- Be accurate and concise
+- Do not add information not present in the context
+Answer:"""
+        # Example chat-style call — adjust if Groq client exposes a different interface
+        chat_resp = client.chat.completions.create(
             model="llama3-8b-8192",
             messages=[
+                {"role": "system", "content": "You are a strict assistant that only uses provided context."},
                 {"role": "user", "content": prompt}
             ],
             temperature=0.1,
             max_tokens=500
         )
+        # Parse response depending on returned structure
+        if hasattr(chat_resp, 'choices'):
+            # SDK-style response
+            return chat_resp.choices[0].message.content
+        elif isinstance(chat_resp, dict):
+            # dict-style response
+            choices = chat_resp.get('choices') or []
+            if choices:
+                # try common paths
+                return choices[0].get('message', {}).get('content') or choices[0].get('text') or str(choices[0])
+        return str(chat_resp)
     except Exception as e:
         return f"Error generating answer: {e}"
+# ANSWER GENERATION FUNCTION
 def generate_answer(query: str, relevant_chunks: List[Dict]) -> str:
+    """Main function to generate answers using Groq; fallback to safe messages."""
     if not relevant_chunks:
         return "❌ Insufficient evidence"
     client = setup_groq()
+    if not client:
+        return "❌ No LLM configured. Please add GROQ_API_KEY to your secrets."
+    return generate_answer_with_groq(client, query, relevant_chunks)
+# STREAMLIT UI
 def main():
+    """Main Streamlit application."""
+    # Page configuration with wide layout for centered design
+    st.set_page_config(
+        page_title="PageMentor",
+        page_icon="📚",
+        layout="wide"
+    )
+    # Custom CSS (kept exactly as your original UI)
+    st.markdown("""
+        <style>
+        /* Center the main container with max width */
+        .main > div {
+            max-width: 900px;
+            margin: 0 auto;
+            padding: 2rem 1rem;
+        }
+        .stApp { background-color: #f8f9fa; }
+        .header-container { text-align: center; padding: 2rem 0; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 2rem; box-shadow: 0 4px 6px rgba(0,0,0,0.1); }
+        .header-title { color: white; font-size: 2.5rem; font-weight: 700; margin-bottom: 0.5rem; }
+        .header-subtitle { color: rgba(255,255,255,0.9); font-size: 1.1rem; }
+        .answer-box { background-color: white; border-radius: 15px; padding: 1.5rem; margin: 1rem 0; box-shadow: 0 2px 8px rgba(0,0,0,0.08); border-left: 4px solid #667eea; }
+        .source-card { background-color: #f0f2f6; border-radius: 10px; padding: 1rem; margin: 0.5rem 0; border-left: 3px solid #764ba2; }
+        .stButton > button { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border: none; border-radius: 8px; padding: 0.5rem 2rem; font-weight: 600; }
+        .stTextInput > div > div > input { border-radius: 8px; border: 2px solid #e0e0e0; padding: 0.75rem; }
+        .stTextInput > div > div > input:focus { border-color: #667eea; box-shadow: 0 0 0 2px rgba(102,126,234,0.1); }
+        .footer { text-align: center; padding: 2rem 0; margin-top: 3rem; border-top: 1px solid #e0e0e0; color: #666; }
+        </style>
+    """, unsafe_allow_html=True)
+    st.markdown("""
+        <div class="header-container">
+            <div class="header-title">📚 PageMentor</div>
+            <div class="header-subtitle">Book-based AI Tutor - Learn from any PDF document</div>
+        </div>
+    """, unsafe_allow_html=True)
+    st.markdown("---")
+    # Initialize session state for storing data
+    if 'vector_db' not in st.session_state:
         st.session_state.vector_db = None
+    if 'embedding_model' not in st.session_state:
+        st.session_state.embedding_model = None
+    if 'processed_file' not in st.session_state:
+        st.session_state.processed_file = None
+    if 'collection_name' not in st.session_state:
+        st.session_state.collection_name = None
+    # Load embedding model
+    if st.session_state.embedding_model is None:
+        with st.spinner("🔄 Loading AI models..."):
+            st.session_state.embedding_model = load_embedding_model()
+    col1, col2 = st.columns([2, 1])
+    with col1:
+        with st.container():
+            st.markdown("### 📄 Upload Your Document")
+            st.markdown("*Select a PDF file to start learning*")
+            uploaded_file = st.file_uploader(
+                "Choose a PDF file",
+                type="pdf",
+                help="Upload any PDF document - textbooks, research papers, articles, etc.",
+                label_visibility="collapsed"
+            )
+            # When a new file is uploaded we clear previous DB to avoid accidental cross-document queries
+            if uploaded_file is not None:
+                st.info(f"📎 **File:** {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)")
+                if st.button("🚀 Process Document", use_container_width=True):
+                    # Reset previous DB and state before processing new file
+                    if st.session_state.get('vector_db') is not None:
+                        try:
+                            # best-effort: attempt to delete old collection if name stored
+                            old_name = st.session_state.get('collection_name')
+                            if old_name:
+                                client = chromadb.Client()
+                                try:
+                                    client.delete_collection(old_name)
+                                except Exception:
+                                    # if SDK doesn't support delete or fails, ignore and continue
+                                    pass
+                        except Exception:
+                            pass
+                        st.session_state.vector_db = None
+                        st.session_state.collection_name = None
+                        st.session_state.processed_file = None
+                    with st.spinner("📖 Reading and analyzing your document..."):
+                        pdf_result = extract_text_from_pdf(uploaded_file)
+                        if pdf_result['success']:
+                            st.success(f"✅ Successfully processed **{pdf_result['total_pages']} pages**")
+                            with st.spinner("🔍 Creating searchable chunks..."):
+                                chunks = create_chunks(pdf_result['pages'])
+                                st.info(f"📝 Created **{len(chunks)}** searchable text segments")
+                            # Create vector database using a unique collection name
+                            if st.session_state.embedding_model:
+                                with st.spinner("🧠 Building knowledge base..."):
+                                    collection = create_vector_database(chunks, st.session_state.embedding_model)
+                                    if collection:
+                                        st.session_state.vector_db = collection
+                                        st.success("✅ **Ready to answer your questions!**")
+                                        st.session_state.processed_file = uploaded_file.name
+                                        st.balloons()
+                                    else:
+                                        st.error("❌ Failed to create knowledge base")
+                            else:
+                                st.error("❌ AI model not available")
+                        else:
+                            st.error(f"❌ Failed to process PDF: {pdf_result['error']}")
+    # Question answering section
+    if st.session_state.vector_db is not None:
+        st.markdown("---")
+        st.markdown("### 💬 Ask Your Questions")
+        if st.session_state.processed_file:
+            st.markdown(f"*Currently learning from: **{st.session_state.processed_file}***")
+        with st.form(key="question_form"):
+            question = st.text_input(
+                "What would you like to know?",
+                placeholder="e.g., What is the main topic? Summarize chapter 3. Explain the key concepts.",
+                help="Ask any question about the content of your document",
+                label_visibility="collapsed"
+            )
+            submit_button = st.form_submit_button(
+                "🔍 Get Answer",
+                use_container_width=True
+            )
+        if submit_button and question.strip():
+            with st.spinner("🤔 Thinking..."):
+                relevant_chunks = query_vector_database(
+                    st.session_state.vector_db,
+                    question,
+                    st.session_state.embedding_model
+                )
+                if relevant_chunks:
+                    answer = generate_answer(question, relevant_chunks)
+                    st.markdown("#### 🎯 Answer")
+                    st.markdown(f'<div class="answer-box">{answer}</div>', unsafe_allow_html=True)
+                    st.markdown("#### 📚 Top Sources")
+                    st.markdown("*Most relevant passages from your document:*")
+                    for i, chunk in enumerate(relevant_chunks, 1):
+                        with st.expander(
+                            f"**Source {i}** | 📄 Page {chunk['page_number']} | "
+                            f"🎯 Relevance: {chunk['similarity']*100:.0f}%"
+                        ):
+                            st.markdown(f'<div class="source-card">{chunk["text"][:500]}...</div>', unsafe_allow_html=True)
+                else:
+                    st.warning("❌ No relevant information found for your question. Try rephrasing or asking about topics covered in the document.")
+    else:
+        st.markdown("""
+        <div style='text-align: center; padding: 3rem; background-color: white; border-radius: 15px; margin: 2rem 0;'>
+            <h3>👋 Welcome to PageMentor!</h3>
+            <p style='color: #666; font-size: 1.1rem;'>Upload a PDF document above to start your learning journey.</p>
+            <p style='color: #999;'>Support for textbooks, research papers, articles, and more!</p>
+        </div>
+        """, unsafe_allow_html=True)
+    # Sidebar with About sections
+    with st.sidebar:
+        st.markdown("### 📱 About This App")
+        st.markdown("""
+        PageMentor is an AI-powered learning assistant that helps you understand any PDF document through intelligent Q&A.
+        **Features:**
+        - 🔍 Smart document analysis
+        - 💡 Instant answers with citations
+        - 📚 Source verification
+        - 🎯 High accuracy responses
+        """)
+        st.markdown("---")
+        st.markdown("### ⚙️ Current Settings")
+        st.markdown(f"""
+        - **Similarity Threshold:** {SIMILARITY_THRESHOLD}
+        - **Retrieved Chunks:** {TOP_K_CHUNKS}
+        - **Chunk Size:** {CHUNK_SIZE} words
+        """)
+        st.markdown("---")
+        st.markdown("### 👨‍💻 About Developer")
+        st.markdown("""
+        **© 2025 Anam Jafar**
+        Connect with me:
+        - 💼 [LinkedIn](https://www.linkedin.com/in/anam-jafar6/)
+        - 🚀 AI/ML Engineer & Developer
+        """)
+    st.markdown("""
+    <div class="footer">
+        <p>Built with ❤️ using Streamlit | Powered by AI | © 2025 PageMentor</p>
+        <p style='font-size: 0.9rem; color: #999;'>Transform any document into your personal tutor</p>
+    </div>
+    """, unsafe_allow_html=True)
+# RUN THE APPLICATION
 if __name__ == "__main__":
     main()