anamjafar6 committed on
Commit
b38cebf
·
verified ·
1 Parent(s): 7357dc8

Update app.py

Files changed (1)
  1. app.py +420 -103
app.py CHANGED
@@ -1,178 +1,495 @@
- import streamlit as st
- import os
- import pypdf
- import chromadb
- from sentence_transformers import SentenceTransformer
- from groq import Groq
- from typing import List, Dict, Any, Optional
-
- # CONFIG
- SIMILARITY_THRESHOLD = 0.2
- TOP_K_CHUNKS = 3
- CHUNK_SIZE = 300
- EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-
- # PDF extraction
  def extract_text_from_pdf(pdf_file) -> Dict[str, Any]:
      try:
-         pdf_reader = pypdf.PdfReader(pdf_file)
-         pages_text = []
-         for page_num, page in enumerate(pdf_reader.pages):
-             page_text = page.extract_text()
-             if page_text and page_text.strip():
                  pages_text.append({
-                     'page_number': page_num + 1,
-                     'text': page_text.strip()
                  })
-         return {"success": True, "pages": pages_text, "total_pages": len(pages_text)}
-     except Exception as e:
-         return {"success": False, "error": str(e)}

- # Chunking
  def create_chunks(pages_text: List[Dict]) -> List[Dict]:
      chunks = []
      chunk_id = 0
      for page_data in pages_text:
-         words = page_data['text'].split()
          for i in range(0, len(words), CHUNK_SIZE):
              chunk_words = words[i:i + CHUNK_SIZE]
-             if len(chunk_words) > 20:
                  chunks.append({
-                     "id": chunk_id,
-                     "text": " ".join(chunk_words),
-                     "page_number": page_data['page_number'],
-                     "word_count": len(chunk_words)
                  })
                  chunk_id += 1
      return chunks

- # Embedding model
  @st.cache_resource
  def load_embedding_model():
-     return SentenceTransformer(EMBEDDING_MODEL)

- # Vector database
  def create_vector_database(chunks: List[Dict], embedding_model) -> Optional[Any]:
      try:
          client = chromadb.Client()
-         # use get_or_create instead of create
-         collection = client.get_or_create_collection("pdf_chunks")

-         texts = [c['text'] for c in chunks]
          embeddings = embedding_model.encode(texts).tolist()

          collection.add(
              embeddings=embeddings,
              documents=texts,
              metadatas=[{
-                 "page_number": c["page_number"],
-                 "chunk_id": c["id"],
-                 "word_count": c["word_count"]
-             } for c in chunks],
-             ids=[str(c["id"]) for c in chunks]
          )
          return collection
      except Exception as e:
-         st.error(f"Vector DB error: {e}")
          return None

  def query_vector_database(collection, query: str, embedding_model, k: int = TOP_K_CHUNKS) -> List[Dict]:
      try:
-         query_emb = embedding_model.encode([query]).tolist()
-         results = collection.query(query_embeddings=query_emb, n_results=k)
          relevant_chunks = []
-         for i in range(len(results['documents'][0])):
-             distance = results['distances'][0][i]
-             similarity = max(0, 1 - distance)
              if similarity >= SIMILARITY_THRESHOLD:
                  relevant_chunks.append({
-                     "text": results['documents'][0][i],
-                     "page_number": results['metadatas'][0][i]["page_number"],
-                     "similarity": similarity,
-                     "chunk_id": results['metadatas'][0][i]["chunk_id"]
                  })
          return relevant_chunks
      except Exception as e:
-         st.error(f"Query error: {e}")
          return []

- # Groq setup
  def setup_groq():
-     api_key = st.secrets.get("GROQ_API_KEY") or os.getenv("GROQ_API_KEY")
      if not api_key:
-         st.error("❌ No GROQ_API_KEY found. Please add it to secrets or env.")
          return None
-     return Groq(api_key=api_key)

  def generate_answer_with_groq(client, query: str, relevant_chunks: List[Dict]) -> str:
      try:
-         context = "\n\n".join([f"[Page {c['page_number']}]: {c['text']}" for c in relevant_chunks])
-         prompt = f"""
- Based ONLY on the following context from a PDF document, answer the user's question.

- Context:
- {context}

- Question: {query}

- Instructions:
- - Answer ONLY using info from the context above
- - If not enough info, reply: ❌ Insufficient evidence
- - Always include page citations like [Page X]
- """
-         chat = client.chat.completions.create(
              model="llama3-8b-8192",
              messages=[
-                 {"role": "system", "content": "You are a helpful tutor AI."},
                  {"role": "user", "content": prompt}
              ],
              temperature=0.1,
              max_tokens=500
          )
-         return chat.choices[0].message.content
      except Exception as e:
          return f"Error generating answer: {e}"

- # Main answer pipeline
  def generate_answer(query: str, relevant_chunks: List[Dict]) -> str:
      if not relevant_chunks:
          return "❌ Insufficient evidence"
      client = setup_groq()
-     if client:
-         return generate_answer_with_groq(client, query, relevant_chunks)
-     return "❌ No LLM configured."

- # -----------------------------
- # STREAMLIT MAIN
- # -----------------------------
  def main():
-     st.set_page_config(page_title="PageMentor", layout="wide")

-     st.title("📚 PageMentor")

-     if "vector_db" not in st.session_state:
          st.session_state.vector_db = None
-         st.session_state.embedding_model = load_embedding_model()
-
-     uploaded_file = st.file_uploader("Upload PDF", type="pdf")
-
-     if uploaded_file and st.button("🚀 Process PDF"):
-         pdf_result = extract_text_from_pdf(uploaded_file)
-         if pdf_result["success"]:
-             chunks = create_chunks(pdf_result["pages"])
-             st.session_state.vector_db = create_vector_database(chunks, st.session_state.embedding_model)
-             if st.session_state.vector_db:
-                 st.success(f"✅ Processed {pdf_result['total_pages']} pages, {len(chunks)} chunks ready!")
-         else:
-             st.error(pdf_result["error"])
-
-     if st.session_state.vector_db:
-         query = st.text_input("Ask a question:")
-         if query and st.button("🔍 Get Answer"):
-             relevant_chunks = query_vector_database(st.session_state.vector_db, query, st.session_state.embedding_model)
-             answer = generate_answer(query, relevant_chunks)
-             st.markdown("### 🎯 Answer")
-             st.write(answer)

  if __name__ == "__main__":
      main()
 
+ # PAGEMENTOR - ENHANCED UI/UX RAG STREAMLIT APP
+
+ # IMPORTS & CONFIGURATION
+ import streamlit as st  # Main web app framework
+ import os  # For environment variables
+ import pypdf  # For PDF text extraction
+ import numpy as np  # For numerical operations
+ import chromadb  # Vector database for storing embeddings
+ from sentence_transformers import SentenceTransformer  # For creating text embeddings
+ # Groq client (LLM) - will be used if available
+ try:
+     from groq import Groq
+ except Exception:
+     Groq = None
+
+ from typing import List, Dict, Any, Optional  # Type hints for better code clarity
+ import re  # For text processing
+ from uuid import uuid4
+ import time
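+ # NOTE: numpy, re, and time are imported here but are not referenced anywhere below yet.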
+
+ # CONFIGURABLE CONSTANTS
+ SIMILARITY_THRESHOLD = 0.2  # Slightly lower so relevant chunks are not missed
+ TOP_K_CHUNKS = 3  # Number of most relevant chunks to retrieve
+ CHUNK_SIZE = 300  # Target number of words per text chunk
+ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Free embedding model
+
+ # PDF EXTRACTION FUNCTION
+
  def extract_text_from_pdf(pdf_file) -> Dict[str, Any]:
+     """Extract text from uploaded PDF file with page numbers."""
      try:
+         pdf_reader = pypdf.PdfReader(pdf_file)  # Create PDF reader object
+         pages_text = []  # List to store text from each page
+
+         for page_num, page in enumerate(pdf_reader.pages):  # Loop through each page
+             page_text = page.extract_text() or ""  # Extract text (may return None)
+             if page_text and page_text.strip():  # Only add non-empty pages
                  pages_text.append({
+                     'page_number': page_num + 1,  # Page numbers start from 1
+                     'text': page_text.strip()  # Remove extra whitespace
                  })
+
+         return {
+             'success': True,
+             'pages': pages_text,
+             'total_pages': len(pages_text)
+         }
+
+     except Exception as e:  # Handle any errors during PDF processing
+         return {
+             'success': False,
+             'error': str(e)
+         }
+
+ # CHUNKING FUNCTION
+
  def create_chunks(pages_text: List[Dict]) -> List[Dict]:
+     """Split text into smaller chunks while preserving page information."""
      chunks = []
      chunk_id = 0
+
      for page_data in pages_text:
+         page_num = page_data['page_number']
+         text = page_data['text']
+         words = text.split()
+
+         # Create chunks of approximately CHUNK_SIZE words
          for i in range(0, len(words), CHUNK_SIZE):
              chunk_words = words[i:i + CHUNK_SIZE]
+             chunk_text = ' '.join(chunk_words)
+
+             if len(chunk_words) > 20:  # Only keep substantial chunks (more than 20 words)
                  chunks.append({
+                     'id': chunk_id,
+                     'text': chunk_text,
+                     'page_number': page_num,
+                     'word_count': len(chunk_words)
                  })
                  chunk_id += 1
+
      return chunks

+ # EMBEDDING LOADING FUNCTION
+
  @st.cache_resource
  def load_embedding_model():
+     """Load the sentence transformer model for creating embeddings."""
+     try:
+         model = SentenceTransformer(EMBEDDING_MODEL)
+         return model
+     except Exception as e:
+         st.error(f"Failed to load embedding model: {e}")
+         return None
+
+ # VECTOR DATABASE CREATION & QUERY FUNCTIONS

  def create_vector_database(chunks: List[Dict], embedding_model) -> Optional[Any]:
+     """Create ChromaDB vector database with embeddings.
+
+     FIXES:
+     - Use a unique collection name per uploaded file to avoid "already exists" errors.
+     - Store collection reference and name in session_state so later queries use the right collection.
+     """
      try:
          client = chromadb.Client()

+         # create a unique collection name per upload to avoid conflicts
+         collection_name = f"pdf_chunks_{uuid4().hex[:8]}"
+         collection = client.create_collection(collection_name)
+
+         texts = [chunk['text'] for chunk in chunks]
          embeddings = embedding_model.encode(texts).tolist()

+         # Add chunks to database with embeddings and metadata
          collection.add(
              embeddings=embeddings,
              documents=texts,
              metadatas=[{
+                 'page_number': chunk['page_number'],
+                 'chunk_id': chunk['id'],
+                 'word_count': chunk['word_count']
+             } for chunk in chunks],
+             ids=[str(chunk['id']) for chunk in chunks]
          )
+
+         # store collection name in session state so queries can reference it
+         st.session_state.collection_name = collection_name
          return collection
+
      except Exception as e:
+         st.error(f"Failed to create vector database: {e}")
          return None

+
  def query_vector_database(collection, query: str, embedding_model, k: int = TOP_K_CHUNKS) -> List[Dict]:
+     """Query the vector database for relevant chunks."""
      try:
+         query_embedding = embedding_model.encode([query]).tolist()
+         results = collection.query(
+             query_embeddings=query_embedding,
+             n_results=k
+         )
+
          relevant_chunks = []
+
+         # Chroma returns lists in results; careful with indexing
+         docs = results.get('documents', [])
+         dists = results.get('distances', [])
+         metas = results.get('metadatas', [])
+
+         if not docs:
+             return []
+
+         for i in range(len(docs[0])):
+             distance = dists[0][i] if dists else 0
+             # Convert distance to similarity (works if distances in [0,1])
+             similarity = max(0, 1 - distance) if isinstance(distance, (int, float)) else 0
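+             # NOTE: with Chroma's default collection settings, distances are squared L2 and can exceed 1,
+             # so this is only a rough relevance score; creating the collection with
+             # metadata={"hnsw:space": "cosine"} would make 1 - distance a true cosine similarity.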
              if similarity >= SIMILARITY_THRESHOLD:
                  relevant_chunks.append({
+                     'text': docs[0][i],
+                     'page_number': metas[0][i].get('page_number') if metas else None,
+                     'similarity': similarity,
+                     'chunk_id': metas[0][i].get('chunk_id') if metas else None
                  })
+
          return relevant_chunks
+
      except Exception as e:
+         st.error(f"Failed to query database: {e}")
          return []

+ # LLM WRAPPER FOR GROQ
+
  def setup_groq():
+     """Configure Groq client using GROQ_API_KEY from secrets or env."""
+     api_key = None
+     # Hugging Face / Streamlit secrets: try st.secrets first (HF sets it as an env var, but we check both)
+     try:
+         api_key = st.secrets.get('GROQ_API_KEY')  # type: ignore
+     except Exception:
+         api_key = None
+
+     if not api_key:
+         api_key = os.getenv('GROQ_API_KEY')
+
      if not api_key:
+         st.error("❌ GROQ_API_KEY not found. Please add it to Hugging Face secrets or environment variables.")
          return None
+
+     if Groq is None:
+         st.error("❌ groq package not installed or failed to import. Add 'groq' to requirements.txt")
+         return None
+
+     try:
+         client = Groq(api_key=api_key)
+         return client
+     except Exception as e:
+         st.error(f"Failed to initialize Groq client: {e}")
+         return None
+

  def generate_answer_with_groq(client, query: str, relevant_chunks: List[Dict]) -> str:
+     """Generate an answer using Groq (chat/completions), keeping the prompt strict to the provided context.
+
+     NOTE: Groq client libraries and method names can change. This implementation uses a generic
+     chat completions call pattern; when deploying, if the Groq client exposes a different API you may need
+     to adjust the call accordingly. We surface clear error messages to help debugging.
+     """
      try:
+         # Build strict context with page citations
+         context_parts = [f"[Page {c['page_number']}]: {c['text']}" for c in relevant_chunks]
+         context = "\n\n".join(context_parts)
+
+ prompt = f"""Based ONLY on the following context from a PDF document, answer the user's question.
218
 
219
+ Context:
220
+ {context}
221
+
222
+ Question: {query}
223
+
224
+ Instructions:
225
+ - Answer using ONLY the information provided in the context above
226
+ - If the context does not contain enough information to answer the question, reply exactly: ❌ Insufficient evidence
227
+ - Always include page citations in your answer using the format [Page X]
228
+ - Be accurate and concise
229
+ - Do not add information not present in the context
230
+
231
+ Answer:"""
232
+
233
+ # Example chat-style call β€” adjust if Groq client exposes a different interface
234
+ chat_resp = client.chat.completions.create(
235
  model="llama3-8b-8192",
236
  messages=[
237
+ {"role": "system", "content": "You are a strict assistant that only uses provided context."},
238
  {"role": "user", "content": prompt}
239
  ],
240
  temperature=0.1,
241
  max_tokens=500
242
  )
243
+
244
+ # Parse response depending on returned structure
245
+ if hasattr(chat_resp, 'choices'):
246
+ # SDK-style response
247
+ return chat_resp.choices[0].message.content
248
+ elif isinstance(chat_resp, dict):
249
+ # dict-style response
250
+ choices = chat_resp.get('choices') or []
251
+ if choices:
252
+ # try common paths
253
+ return choices[0].get('message', {}).get('content') or choices[0].get('text') or str(choices[0])
254
+ return str(chat_resp)
255
+
256
  except Exception as e:
257
  return f"Error generating answer: {e}"
258
 
259
+ # ANSWER GENERATION FUNCTION
260
+
261
  def generate_answer(query: str, relevant_chunks: List[Dict]) -> str:
262
+ """Main function to generate answers using Groq; fallback to safe messages."""
263
  if not relevant_chunks:
264
  return "❌ Insufficient evidence"
265
+
266
  client = setup_groq()
267
+ if not client:
268
+ return "❌ No LLM configured. Please add GROQ_API_KEY to your secrets."
269
+
270
+ return generate_answer_with_groq(client, query, relevant_chunks)
271
+
272
+ # STREAMLIT UI
273
 
 
 
 
274
  def main():
275
+ """Main Streamlit application."""
276
+
277
+ # Page configuration with wide layout for centered design
278
+ st.set_page_config(
279
+ page_title="PageMentor",
280
+ page_icon="πŸ“š",
281
+ layout="wide"
282
+ )
283
+
284
+ # Custom CSS (kept exactly as your original UI)
285
+ st.markdown("""
286
+ <style>
287
+ /* Center the main container with max width */
288
+ .main > div {
289
+ max-width: 900px;
290
+ margin: 0 auto;
291
+ padding: 2rem 1rem;
292
+ }
293
+ .stApp { background-color: #f8f9fa; }
294
+ .header-container { text-align: center; padding: 2rem 0; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 2rem; box-shadow: 0 4px 6px rgba(0,0,0,0.1); }
295
+ .header-title { color: white; font-size: 2.5rem; font-weight: 700; margin-bottom: 0.5rem; }
296
+ .header-subtitle { color: rgba(255,255,255,0.9); font-size: 1.1rem; }
297
+ .answer-box { background-color: white; border-radius: 15px; padding: 1.5rem; margin: 1rem 0; box-shadow: 0 2px 8px rgba(0,0,0,0.08); border-left: 4px solid #667eea; }
298
+ .source-card { background-color: #f0f2f6; border-radius: 10px; padding: 1rem; margin: 0.5rem 0; border-left: 3px solid #764ba2; }
299
+ .stButton > button { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border: none; border-radius: 8px; padding: 0.5rem 2rem; font-weight: 600; }
300
+ .stTextInput > div > div > input { border-radius: 8px; border: 2px solid #e0e0e0; padding: 0.75rem; }
301
+ .stTextInput > div > div > input:focus { border-color: #667eea; box-shadow: 0 0 0 2px rgba(102,126,234,0.1); }
302
+ .footer { text-align: center; padding: 2rem 0; margin-top: 3rem; border-top: 1px solid #e0e0e0; color: #666; }
303
+ </style>
304
+ """, unsafe_allow_html=True)
305
 
306
+ st.markdown("""
307
+ <div class="header-container">
308
+ <div class="header-title">πŸ“š PageMentor</div>
309
+ <div class="header-subtitle">Book-based AI Tutor - Learn from any PDF document</div>
310
+ </div>
311
+ """, unsafe_allow_html=True)
312
 
313
+ st.markdown("---")
314
+
315
+ # Initialize session state for storing data
316
+ if 'vector_db' not in st.session_state:
317
  st.session_state.vector_db = None
318
+ if 'embedding_model' not in st.session_state:
319
+ st.session_state.embedding_model = None
320
+ if 'processed_file' not in st.session_state:
321
+ st.session_state.processed_file = None
322
+ if 'collection_name' not in st.session_state:
323
+ st.session_state.collection_name = None
324
+
325
+ # Load embedding model
326
+ if st.session_state.embedding_model is None:
327
+ with st.spinner("πŸ”„ Loading AI models..."):
328
+ st.session_state.embedding_model = load_embedding_model()
329
+
330
+ col1, col2 = st.columns([2, 1])
331
+
332
+ with col1:
333
+ with st.container():
334
+ st.markdown("### πŸ“„ Upload Your Document")
335
+ st.markdown("*Select a PDF file to start learning*")
336
+
337
+ uploaded_file = st.file_uploader(
338
+ "Choose a PDF file",
339
+ type="pdf",
340
+ help="Upload any PDF document - textbooks, research papers, articles, etc.",
341
+ label_visibility="collapsed"
342
+ )
343
+
344
+ # When a new file is uploaded we clear previous DB to avoid accidental cross-document queries
345
+ if uploaded_file is not None:
346
+ st.info(f"πŸ“Ž **File:** {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)")
347
+
348
+ if st.button("πŸš€ Process Document", use_container_width=True):
349
+ # Reset previous DB and state before processing new file
350
+ if st.session_state.get('vector_db') is not None:
351
+ try:
352
+ # best-effort: attempt to delete old collection if name stored
353
+ old_name = st.session_state.get('collection_name')
354
+ if old_name:
355
+ client = chromadb.Client()
356
+ try:
357
+ client.delete_collection(old_name)
358
+ except Exception:
359
+ # if SDK doesn't support delete or fails, ignore and continue
360
+ pass
361
+ except Exception:
362
+ pass
363
+
364
+ st.session_state.vector_db = None
365
+ st.session_state.collection_name = None
366
+ st.session_state.processed_file = None
367
+
368
+ with st.spinner("πŸ“– Reading and analyzing your document..."):
369
+ pdf_result = extract_text_from_pdf(uploaded_file)
370
+
371
+ if pdf_result['success']:
372
+ st.success(f"βœ… Successfully processed **{pdf_result['total_pages']} pages**")
373
+
374
+ with st.spinner("πŸ” Creating searchable chunks..."):
375
+ chunks = create_chunks(pdf_result['pages'])
376
+ st.info(f"πŸ“ Created **{len(chunks)}** searchable text segments")
377
+
378
+ # Create vector database using a unique collection name
379
+ if st.session_state.embedding_model:
380
+ with st.spinner("🧠 Building knowledge base..."):
381
+ collection = create_vector_database(chunks, st.session_state.embedding_model)
382
+ if collection:
383
+ st.session_state.vector_db = collection
384
+ st.success("βœ… **Ready to answer your questions!**")
385
+ st.session_state.processed_file = uploaded_file.name
386
+ st.balloons()
387
+ else:
388
+ st.error("❌ Failed to create knowledge base")
389
+ else:
390
+ st.error("❌ AI model not available")
391
+
392
+ else:
393
+ st.error(f"❌ Failed to process PDF: {pdf_result['error']}")
394
+
395
+     # Question answering section
+     if st.session_state.vector_db is not None:
+         st.markdown("---")
+         st.markdown("### 💬 Ask Your Questions")
+
+         if st.session_state.processed_file:
+             st.markdown(f"*Currently learning from: **{st.session_state.processed_file}***")
+
+         with st.form(key="question_form"):
+             question = st.text_input(
+                 "What would you like to know?",
+                 placeholder="e.g., What is the main topic? Summarize chapter 3. Explain the key concepts.",
+                 help="Ask any question about the content of your document",
+                 label_visibility="collapsed"
+             )
+
+             submit_button = st.form_submit_button(
+                 "🔍 Get Answer",
+                 use_container_width=True
+             )
+
+         if submit_button and question.strip():
+             with st.spinner("🤔 Thinking..."):
+                 relevant_chunks = query_vector_database(
+                     st.session_state.vector_db,
+                     question,
+                     st.session_state.embedding_model
+                 )
+
+             if relevant_chunks:
+                 answer = generate_answer(question, relevant_chunks)
+
+                 st.markdown("#### 🎯 Answer")
+                 st.markdown(f'<div class="answer-box">{answer}</div>', unsafe_allow_html=True)
+
+                 st.markdown("#### 📚 Top Sources")
+                 st.markdown("*Most relevant passages from your document:*")
+
+                 for i, chunk in enumerate(relevant_chunks, 1):
+                     with st.expander(
+                         f"**Source {i}** | 📄 Page {chunk['page_number']} | "
+                         f"🎯 Relevance: {chunk['similarity']*100:.0f}%"
+                     ):
+                         st.markdown(f'<div class="source-card">{chunk["text"][:500]}...</div>', unsafe_allow_html=True)
+
+             else:
+                 st.warning("❌ No relevant information found for your question. Try rephrasing or asking about topics covered in the document.")
+
+     else:
+         st.markdown("""
+ <div style='text-align: center; padding: 3rem; background-color: white; border-radius: 15px; margin: 2rem 0;'>
+     <h3>👋 Welcome to PageMentor!</h3>
+     <p style='color: #666; font-size: 1.1rem;'>Upload a PDF document above to start your learning journey.</p>
+     <p style='color: #999;'>Support for textbooks, research papers, articles, and more!</p>
+ </div>
+         """, unsafe_allow_html=True)
+
+     # Sidebar with About sections
+     with st.sidebar:
+         st.markdown("### 📱 About This App")
+         st.markdown("""
+ PageMentor is an AI-powered learning assistant that helps you understand any PDF document through intelligent Q&A.
+
+ **Features:**
+ - 🔍 Smart document analysis
+ - 💡 Instant answers with citations
+ - 📚 Source verification
+ - 🎯 High accuracy responses
+         """)
+
+         st.markdown("---")
+
+         st.markdown("### ⚙️ Current Settings")
+         st.markdown(f"""
+ - **Similarity Threshold:** {SIMILARITY_THRESHOLD}
+ - **Retrieved Chunks:** {TOP_K_CHUNKS}
+ - **Chunk Size:** {CHUNK_SIZE} words
+         """)
+
+         st.markdown("---")
+
+         st.markdown("### 👨‍💻 About Developer")
+         st.markdown("""
+ **© 2025 Anam Jafar**
+
+ Connect with me:
+ - 💼 [LinkedIn](https://www.linkedin.com/in/anam-jafar6/)
+ - 🚀 AI/ML Engineer & Developer
+         """)
+
+     st.markdown("""
+ <div class="footer">
+     <p>Built with ❤️ using Streamlit | Powered by AI | © 2025 PageMentor</p>
+     <p style='font-size: 0.9rem; color: #999;'>Transform any document into your personal tutor</p>
+ </div>
+     """, unsafe_allow_html=True)
+
+ # RUN THE APPLICATION

  if __name__ == "__main__":
      main()