anamjafar6 committed on
Commit
cb18ce7
·
verified ·
1 Parent(s): 5b5b3f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -411
app.py CHANGED
@@ -1,218 +1,185 @@
1
- # PAGEMENTOR - ENHANCED UI/UX RAG STREAMLIT APP
2
-
3
- # IMPORTS & CONFIGURATION
4
- import streamlit as st # Main web app framework
5
- import os # For environment variables
6
- import pypdf # For PDF text extraction
7
- import numpy as np # For numerical operations
8
- import chromadb # Vector database for storing embeddings
9
- from sentence_transformers import SentenceTransformer # For creating text embeddings
10
- # Groq client (LLM) - will be used if available
11
  try:
12
  from groq import Groq
13
- except Exception:
14
  Groq = None
15
 
16
- from typing import List, Dict, Any, Optional # Type hints for better code clarity
17
- import re # For text processing
18
- from uuid import uuid4
19
- import time
20
 
21
- # CONFIGURABLE CONSTANTS
22
- SIMILARITY_THRESHOLD = 0.2 # Slightly lower so relevant chunks are not missed
23
- TOP_K_CHUNKS = 3 # Number of most relevant chunks to retrieve
24
- CHUNK_SIZE = 300 # Target number of words per text chunk
25
- EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # Free embedding model
 
 
 
 
 
 
 
 
26
 
27
- # PDF EXTRACTION FUNCTION
28
 
29
- def extract_text_from_pdf(pdf_file) -> Dict[str, Any]:
30
- """Extract text from uploaded PDF file with page numbers."""
 
 
 
 
 
 
 
31
  try:
32
- pdf_reader = pypdf.PdfReader(pdf_file) # Create PDF reader object
33
- pages_text = [] # List to store text from each page
34
-
35
- for page_num, page in enumerate(pdf_reader.pages): # Loop through each page
36
- page_text = page.extract_text() or "" # Extract text (may return None)
37
- if page_text and page_text.strip(): # Only add non-empty pages
38
- pages_text.append({
39
- 'page_number': page_num + 1, # Page numbers start from 1
40
- 'text': page_text.strip() # Remove extra whitespace
41
- })
42
 
43
- return {
44
- 'success': True,
45
- 'pages': pages_text,
46
- 'total_pages': len(pages_text)
47
- }
48
 
49
- except Exception as e: # Handle any errors during PDF processing
50
- return {
51
- 'success': False,
52
- 'error': str(e)
53
- }
54
 
55
- # CHUNKING FUNCTION
 
 
 
 
 
 
56
 
57
- def create_chunks(pages_text: List[Dict]) -> List[Dict]:
58
- """Split text into smaller chunks while preserving page information."""
59
  chunks = []
60
- chunk_id = 0
 
 
 
 
 
 
61
 
62
- for page_data in pages_text:
63
- page_num = page_data['page_number']
64
- text = page_data['text']
65
  words = text.split()
66
-
67
- # Create chunks of approximately CHUNK_SIZE words
68
- for i in range(0, len(words), CHUNK_SIZE):
69
- chunk_words = words[i:i + CHUNK_SIZE]
70
- chunk_text = ' '.join(chunk_words)
71
-
72
- if len(chunk_words) > 20: # Only keep substantial chunks (more than 20 words)
73
  chunks.append({
74
- 'id': chunk_id,
75
- 'text': chunk_text,
76
- 'page_number': page_num,
77
- 'word_count': len(chunk_words)
78
  })
79
- chunk_id += 1
80
-
81
  return chunks
82
 
83
- # EMBEDDING LOADING FUNCTION
84
 
85
- @st.cache_resource
86
- def load_embedding_model():
87
- """Load the sentence transformer model for creating embeddings."""
88
- try:
89
- model = SentenceTransformer(EMBEDDING_MODEL)
90
- return model
91
- except Exception as e:
92
- st.error(f"Failed to load embedding model: {e}")
93
  return None
94
 
95
- # VECTOR DATABASE CREATION & QUERY FUNCTIONS
96
-
97
- def create_vector_database(chunks: List[Dict], embedding_model) -> Optional[Any]:
98
- """Create ChromaDB vector database with embeddings.
99
 
100
- FIXES:
101
- - Use a unique collection name per uploaded file to avoid "already exists" errors.
102
- - Store collection reference and name in session_state so later queries use the right collection.
103
- """
104
  try:
105
- client = chromadb.Client()
106
-
107
- # create a unique collection name per upload to avoid conflicts
108
- collection_name = f"pdf_chunks_{uuid4().hex[:8]}"
109
  collection = client.create_collection(collection_name)
 
 
 
 
 
 
110
 
111
- texts = [chunk['text'] for chunk in chunks]
112
- embeddings = embedding_model.encode(texts).tolist()
 
 
 
 
 
113
 
114
- # Add chunks to database with embeddings and metadata
115
  collection.add(
116
  embeddings=embeddings,
117
  documents=texts,
118
- metadatas=[{
119
- 'page_number': chunk['page_number'],
120
- 'chunk_id': chunk['id'],
121
- 'word_count': chunk['word_count']
122
- } for chunk in chunks],
123
- ids=[str(chunk['id']) for chunk in chunks]
124
  )
125
-
126
- # store collection name in session state so queries can reference it
127
- st.session_state.collection_name = collection_name
128
- return collection
129
-
130
  except Exception as e:
131
- st.error(f"Failed to create vector database: {e}")
132
  return None
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- def query_vector_database(collection, query: str, embedding_model, k: int = TOP_K_CHUNKS) -> List[Dict]:
136
- """Query the vector database for relevant chunks."""
137
  try:
138
  query_embedding = embedding_model.encode([query]).tolist()
 
 
 
 
 
139
  results = collection.query(
140
  query_embeddings=query_embedding,
141
- n_results=k
142
  )
143
-
144
- relevant_chunks = []
145
-
146
- # Chroma returns lists in results; careful with indexing
147
- docs = results.get('documents', [])
148
- dists = results.get('distances', [])
149
- metas = results.get('metadatas', [])
150
-
151
- if not docs:
152
- return []
153
-
154
- for i in range(len(docs[0])):
155
- distance = dists[0][i] if dists else 0
156
- # Convert distance to similarity (works if distances in [0,1])
157
- similarity = max(0, 1 - distance) if isinstance(distance, (int, float)) else 0
158
-
159
- if similarity >= SIMILARITY_THRESHOLD:
160
- relevant_chunks.append({
161
- 'text': docs[0][i],
162
- 'page_number': metas[0][i].get('page_number') if metas else None,
163
- 'similarity': similarity,
164
- 'chunk_id': metas[0][i].get('chunk_id') if metas else None
165
- })
166
-
167
- return relevant_chunks
168
-
169
  except Exception as e:
170
- st.error(f"Failed to query database: {e}")
171
  return []
172
 
173
- # LLM WRAPPER FOR GROQ
 
 
174
 
175
- def setup_groq():
176
- """Configure Groq client using GROQ_API_KEY from secrets or env."""
177
- api_key = None
178
- # Hugging Face / Streamlit secrets: try st.secrets first (HF sets as env, but we'll check both)
179
- try:
180
- api_key = st.secrets.get('GROQ_API_KEY') # type: ignore
181
- except Exception:
182
- api_key = None
183
 
184
- if not api_key:
185
- api_key = os.getenv('GROQ_API_KEY')
 
 
 
 
186
 
187
- if not api_key:
188
- st.error(" GROQ_API_KEY not found. Please add it to Hugging Face secrets or environment variables.")
189
- return None
190
-
191
- if Groq is None:
192
- st.error("❌ groq package not installed or failed to import. Add 'groq' to requirements.txt")
193
- return None
194
 
195
- try:
196
- client = Groq(api_key=api_key)
197
- return client
198
- except Exception as e:
199
- st.error(f"Failed to initialize Groq client: {e}")
200
- return None
201
 
202
 
203
  def generate_answer_with_groq(client, query: str, relevant_chunks: List[Dict]) -> str:
204
- """Generate answer using Groq (chat/completions). Keep prompt strict to only use context.
205
-
206
- NOTE: Groq client libraries and method names can change. This implementation uses a generic
207
- chat completions call pattern; when deploying, if Groq client has different API you may need
208
- to adjust the call accordingly. We surface clear error messages to help debugging.
209
- """
210
  try:
211
- # Build strict context with page citations
212
  context_parts = [f"[Page {c['page_number']}]: {c['text']}" for c in relevant_chunks]
213
- context = ""
214
-
215
- .join(context_parts)
216
 
217
  prompt = f"""Based ONLY on the following context from a PDF document, answer the user's question.
218
 
@@ -230,266 +197,85 @@ Instructions:
230
 
231
  Answer:"""
232
 
233
- # Example chat-style call — adjust if Groq client exposes a different interface
234
- chat_resp = client.chat.completions.create(
235
- model="llama3-8b-8192",
236
- messages=[
237
- {"role": "system", "content": "You are a strict assistant that only uses provided context."},
238
- {"role": "user", "content": prompt}
239
- ],
240
- temperature=0.1,
241
- max_tokens=500
242
- )
 
 
243
 
244
- # Parse response depending on returned structure
245
- if hasattr(chat_resp, 'choices'):
246
- # SDK-style response
247
  return chat_resp.choices[0].message.content
248
  elif isinstance(chat_resp, dict):
249
- # dict-style response
250
- choices = chat_resp.get('choices') or []
251
  if choices:
252
- # try common paths
253
- return choices[0].get('message', {}).get('content') or choices[0].get('text') or str(choices[0])
 
254
  return str(chat_resp)
255
 
256
  except Exception as e:
257
  return f"Error generating answer: {e}"
258
 
259
- # ANSWER GENERATION FUNCTION
260
-
261
- def generate_answer(query: str, relevant_chunks: List[Dict]) -> str:
262
- """Main function to generate answers using Groq; fallback to safe messages."""
263
- if not relevant_chunks:
264
- return "❌ Insufficient evidence"
265
-
266
- client = setup_groq()
267
- if not client:
268
- return "❌ No LLM configured. Please add GROQ_API_KEY to your secrets."
269
-
270
- return generate_answer_with_groq(client, query, relevant_chunks)
271
-
272
- # STREAMLIT UI
273
 
 
 
 
274
  def main():
275
- """Main Streamlit application."""
276
-
277
- # Page configuration with wide layout for centered design
278
- st.set_page_config(
279
- page_title="PageMentor",
280
- page_icon="📚",
281
- layout="wide"
282
- )
283
-
284
- # Custom CSS (kept exactly as your original UI)
285
- st.markdown("""
286
- <style>
287
- /* Center the main container with max width */
288
- .main > div {
289
- max-width: 900px;
290
- margin: 0 auto;
291
- padding: 2rem 1rem;
292
- }
293
- .stApp { background-color: #f8f9fa; }
294
- .header-container { text-align: center; padding: 2rem 0; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 2rem; box-shadow: 0 4px 6px rgba(0,0,0,0.1); }
295
- .header-title { color: white; font-size: 2.5rem; font-weight: 700; margin-bottom: 0.5rem; }
296
- .header-subtitle { color: rgba(255,255,255,0.9); font-size: 1.1rem; }
297
- .answer-box { background-color: white; border-radius: 15px; padding: 1.5rem; margin: 1rem 0; box-shadow: 0 2px 8px rgba(0,0,0,0.08); border-left: 4px solid #667eea; }
298
- .source-card { background-color: #f0f2f6; border-radius: 10px; padding: 1rem; margin: 0.5rem 0; border-left: 3px solid #764ba2; }
299
- .stButton > button { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border: none; border-radius: 8px; padding: 0.5rem 2rem; font-weight: 600; }
300
- .stTextInput > div > div > input { border-radius: 8px; border: 2px solid #e0e0e0; padding: 0.75rem; }
301
- .stTextInput > div > div > input:focus { border-color: #667eea; box-shadow: 0 0 0 2px rgba(102,126,234,0.1); }
302
- .footer { text-align: center; padding: 2rem 0; margin-top: 3rem; border-top: 1px solid #e0e0e0; color: #666; }
303
- </style>
304
- """, unsafe_allow_html=True)
305
-
306
- st.markdown("""
307
- <div class="header-container">
308
- <div class="header-title">📚 PageMentor</div>
309
- <div class="header-subtitle">Book-based AI Tutor - Learn from any PDF document</div>
310
- </div>
311
- """, unsafe_allow_html=True)
312
-
313
- st.markdown("---")
314
-
315
- # Initialize session state for storing data
316
- if 'vector_db' not in st.session_state:
317
- st.session_state.vector_db = None
318
- if 'embedding_model' not in st.session_state:
319
- st.session_state.embedding_model = None
320
- if 'processed_file' not in st.session_state:
321
- st.session_state.processed_file = None
322
- if 'collection_name' not in st.session_state:
323
- st.session_state.collection_name = None
324
-
325
- # Load embedding model
326
- if st.session_state.embedding_model is None:
327
- with st.spinner("🔄 Loading AI models..."):
328
- st.session_state.embedding_model = load_embedding_model()
329
-
330
- col1, col2 = st.columns([2, 1])
331
-
332
- with col1:
333
- with st.container():
334
- st.markdown("### 📄 Upload Your Document")
335
- st.markdown("*Select a PDF file to start learning*")
336
-
337
- uploaded_file = st.file_uploader(
338
- "Choose a PDF file",
339
- type="pdf",
340
- help="Upload any PDF document - textbooks, research papers, articles, etc.",
341
- label_visibility="collapsed"
342
- )
343
-
344
- # When a new file is uploaded we clear previous DB to avoid accidental cross-document queries
345
- if uploaded_file is not None:
346
- st.info(f"📎 **File:** {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)")
347
-
348
- if st.button("🚀 Process Document", use_container_width=True):
349
- # Reset previous DB and state before processing new file
350
- if st.session_state.get('vector_db') is not None:
351
- try:
352
- # best-effort: attempt to delete old collection if name stored
353
- old_name = st.session_state.get('collection_name')
354
- if old_name:
355
- client = chromadb.Client()
356
- try:
357
- client.delete_collection(old_name)
358
- except Exception:
359
- # if SDK doesn't support delete or fails, ignore and continue
360
- pass
361
- except Exception:
362
- pass
363
-
364
- st.session_state.vector_db = None
365
- st.session_state.collection_name = None
366
- st.session_state.processed_file = None
367
-
368
- with st.spinner("📖 Reading and analyzing your document..."):
369
- pdf_result = extract_text_from_pdf(uploaded_file)
370
-
371
- if pdf_result['success']:
372
- st.success(f"✅ Successfully processed **{pdf_result['total_pages']} pages**")
373
-
374
- with st.spinner("🔍 Creating searchable chunks..."):
375
- chunks = create_chunks(pdf_result['pages'])
376
- st.info(f"📝 Created **{len(chunks)}** searchable text segments")
377
-
378
- # Create vector database using a unique collection name
379
- if st.session_state.embedding_model:
380
- with st.spinner("🧠 Building knowledge base..."):
381
- collection = create_vector_database(chunks, st.session_state.embedding_model)
382
- if collection:
383
- st.session_state.vector_db = collection
384
- st.success("✅ **Ready to answer your questions!**")
385
- st.session_state.processed_file = uploaded_file.name
386
- st.balloons()
387
- else:
388
- st.error("❌ Failed to create knowledge base")
389
- else:
390
- st.error("❌ AI model not available")
391
-
392
- else:
393
- st.error(f"❌ Failed to process PDF: {pdf_result['error']}")
394
-
395
- # Question answering section
396
- if st.session_state.vector_db is not None:
397
- st.markdown("---")
398
- st.markdown("### 💬 Ask Your Questions")
399
-
400
- if st.session_state.processed_file:
401
- st.markdown(f"*Currently learning from: **{st.session_state.processed_file}***")
402
-
403
- with st.form(key="question_form"):
404
- question = st.text_input(
405
- "What would you like to know?",
406
- placeholder="e.g., What is the main topic? Summarize chapter 3. Explain the key concepts.",
407
- help="Ask any question about the content of your document",
408
- label_visibility="collapsed"
409
- )
410
-
411
- submit_button = st.form_submit_button(
412
- "🔍 Get Answer",
413
- use_container_width=True
414
- )
415
 
416
- if submit_button and question.strip():
417
- with st.spinner("🤔 Thinking..."):
418
- relevant_chunks = query_vector_database(
419
- st.session_state.vector_db,
420
- question,
421
- st.session_state.embedding_model
422
- )
423
-
424
- if relevant_chunks:
425
- answer = generate_answer(question, relevant_chunks)
426
-
427
- st.markdown("#### 🎯 Answer")
428
- st.markdown(f'<div class="answer-box">{answer}</div>', unsafe_allow_html=True)
429
-
430
- st.markdown("#### 📚 Top Sources")
431
- st.markdown("*Most relevant passages from your document:*")
432
-
433
- for i, chunk in enumerate(relevant_chunks, 1):
434
- with st.expander(
435
- f"**Source {i}** | 📄 Page {chunk['page_number']} | "
436
- f"���� Relevance: {chunk['similarity']*100:.0f}%"
437
- ):
438
- st.markdown(f'<div class="source-card">{chunk["text"][:500]}...</div>', unsafe_allow_html=True)
439
-
440
- else:
441
- st.warning("❌ No relevant information found for your question. Try rephrasing or asking about topics covered in the document.")
442
-
443
- else:
444
- st.markdown("""
445
- <div style='text-align: center; padding: 3rem; background-color: white; border-radius: 15px; margin: 2rem 0;'>
446
- <h3>👋 Welcome to PageMentor!</h3>
447
- <p style='color: #666; font-size: 1.1rem;'>Upload a PDF document above to start your learning journey.</p>
448
- <p style='color: #999;'>Support for textbooks, research papers, articles, and more!</p>
449
- </div>
450
- """, unsafe_allow_html=True)
451
-
452
- # Sidebar with About sections
453
- with st.sidebar:
454
- st.markdown("### 📱 About This App")
455
- st.markdown("""
456
- PageMentor is an AI-powered learning assistant that helps you understand any PDF document through intelligent Q&A.
457
-
458
- **Features:**
459
- - 🔍 Smart document analysis
460
- - 💡 Instant answers with citations
461
- - 📚 Source verification
462
- - 🎯 High accuracy responses
463
- """)
464
-
465
- st.markdown("---")
466
-
467
- st.markdown("### ⚙️ Current Settings")
468
- st.markdown(f"""
469
- - **Similarity Threshold:** {SIMILARITY_THRESHOLD}
470
- - **Retrieved Chunks:** {TOP_K_CHUNKS}
471
- - **Chunk Size:** {CHUNK_SIZE} words
472
- """)
473
-
474
- st.markdown("---")
475
-
476
- st.markdown("### 👨‍💻 About Developer")
477
- st.markdown("""
478
- **© 2025 Anam Jafar**
479
-
480
- Connect with me:
481
- - 💼 [LinkedIn](https://www.linkedin.com/in/anam-jafar6/)
482
- - 🚀 AI/ML Engineer & Developer
483
- """)
484
-
485
- st.markdown("""
486
- <div class="footer">
487
- <p>Built with ❤️ using Streamlit | Powered by AI | © 2025 PageMentor</p>
488
- <p style='font-size: 0.9rem; color: #999;'>Transform any document into your personal tutor</p>
489
- </div>
490
- """, unsafe_allow_html=True)
491
-
492
- # RUN THE APPLICATION
493
 
494
  if __name__ == "__main__":
495
  main()
 
1
+ import os
2
+ import streamlit as st
3
+ import numpy as np
4
+ from pypdf import PdfReader
5
+ from typing import List, Dict
6
+ from sentence_transformers import SentenceTransformer
7
+ import chromadb
8
+
9
+ # Try importing Groq client
 
10
  try:
11
  from groq import Groq
12
+ except ImportError:
13
  Groq = None
14
 
 
 
 
 
15
 
16
+ # -----------------------------
17
+ # Utility Functions
18
+ # -----------------------------
19
def load_api_key() -> "str | None":
    """Return the Groq API key, or ``None`` when none is configured.

    Lookup order:

    1. The ``GROQ_API_KEY`` environment variable.
    2. ``st.secrets["GROQ_API_KEY"]``, for runs that keep the key in a
       Streamlit secrets file.

    The Hugging Face Hub token is deliberately NOT used as a fallback. It is
    a credential for a different service: it can never authenticate against
    Groq, and passing it to the Groq client would disclose it to a third
    party.

    Returns:
        The key with surrounding whitespace removed, or ``None`` if neither
        source provides a non-empty value.
    """
    api_key = os.environ.get("GROQ_API_KEY", "").strip()
    if api_key:
        return api_key

    try:
        # Best-effort lookup: reading st.secrets raises when no secrets file
        # is present, and that simply means "no key configured here".
        secret = st.secrets.get("GROQ_API_KEY")
    except Exception:
        return None

    secret = str(secret).strip() if secret else ""
    return secret or None
29
 
 
30
 
31
def setup_groq() -> Groq:
    """Create a Groq client from the configured API key.

    Every failure is reported with ``st.error`` and yields ``None``: a
    missing key, a ``groq`` package that could not be imported (``Groq`` is
    then ``None``), or a client constructor that raises.
    """
    key = load_api_key()

    # Work out which precondition (if any) is unmet; the key is checked first.
    problem = None
    if not key:
        problem = "❌ Missing GROQ_API_KEY in environment or Hugging Face secrets."
    elif Groq is None:
        problem = "❌ Groq library not installed. Please add `groq` to requirements.txt."

    if problem is not None:
        st.error(problem)
        return None

    try:
        return Groq(api_key=key)
    except Exception as exc:
        st.error(f"Failed to initialize Groq client: {exc}")
        return None
46
+
 
 
 
 
47
 
48
@st.cache_resource
def load_embedding_model(model_name: str = "all-MiniLM-L6-v2") -> SentenceTransformer:
    """Build the ``SentenceTransformer`` identified by ``model_name``.

    The function is decorated with ``st.cache_resource`` so the model object
    is cached rather than rebuilt on each call.
    """
    model = SentenceTransformer(model_name)
    return model
 
52
 
 
 
 
 
 
53
 
54
def pdf_to_chunks(uploaded_file, chunk_size: int = 500, overlap: int = 50) -> List[Dict]:
    """Split the text of a PDF into overlapping word windows, page by page.

    Args:
        uploaded_file: The PDF source handed to ``PdfReader``.
        chunk_size: Maximum number of whitespace-separated words per chunk.
        overlap: Number of words each chunk shares with the previous chunk
            of the same page.

    Returns:
        A list of ``{"page_number": int, "text": str}`` dicts in reading
        order, with 1-based page numbers. The list is empty when the PDF
        cannot be opened (the error is shown with ``st.error``) or when no
        page yields any text.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
            in ``[0, chunk_size)``. Such values would make the window step
            zero or negative, or would skip words between chunks.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )
    step = chunk_size - overlap

    try:
        reader = PdfReader(uploaded_file)
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        return []

    chunks = []
    for page_num, page in enumerate(reader.pages, start=1):
        try:
            text = page.extract_text() or ""
        except Exception:
            # Best effort: one unreadable page must not discard the rest of
            # the document, so it is skipped.
            continue

        words = text.split()  # empty for blank / whitespace-only pages
        for start in range(0, len(words), step):
            chunks.append({
                "page_number": page_num,
                "text": " ".join(words[start:start + chunk_size]),
            })
            if start + chunk_size >= len(words):
                # This window already reaches the end of the page; another
                # one would contain only words repeated from the overlap.
                break
    return chunks
80
 
 
81
 
82
def _drop_collection(client, name) -> None:
    """Delete collection ``name`` if possible; never raise."""
    try:
        client.delete_collection(name)
    except Exception:
        # Best-effort cleanup: the collection may already be gone, and a
        # failed delete must not mask the outcome being reported.
        pass


def create_vector_database(chunks: List[Dict],
                           embedding_model: SentenceTransformer) -> "str | None":
    """Index ``chunks`` in a new ChromaDB collection and return its name.

    Args:
        chunks: Dicts with a ``"text"`` entry and, normally, a
            ``"page_number"`` entry (as produced by ``pdf_to_chunks``).
        embedding_model: Model whose ``encode`` turns texts into vectors.

    Returns:
        The name of the new collection, also stored in
        ``st.session_state.collection_name``. ``None`` if there are no
        chunks or any step fails; the failure is shown with ``st.error`` and
        a partially built collection is removed.

    Side effects:
        On success, the collection previously recorded in the session state
        (if any) is deleted so repeated uploads do not accumulate indexes.
    """
    # Local import keeps this block self-contained; the file does not
    # import uuid at module level.
    from uuid import uuid4

    if not chunks:
        st.error("No text chunks extracted from PDF.")
        return None

    client = chromadb.Client()
    # A UUID avoids the name collisions a small random integer produces.
    collection_name = f"pdf_chunks_{uuid4().hex}"

    try:
        collection = client.create_collection(collection_name)
    except Exception as e:
        st.error(f"Error creating collection: {e}")
        return None

    texts = [c["text"] for c in chunks]
    ids = [str(i) for i in range(len(chunks))]
    # Only the page number goes into metadata; the text is already stored as
    # the document, so copying it here would double the memory used.
    metadatas = [{"page_number": c.get("page_number", "N/A")} for c in chunks]

    try:
        # encode() batches internally; batch_size bounds peak memory.
        embeddings = np.asarray(
            embedding_model.encode(texts, batch_size=64)
        ).tolist()
    except Exception as e:
        st.error(f"Error encoding chunks: {e}")
        _drop_collection(client, collection_name)
        return None

    try:
        collection.add(
            embeddings=embeddings,
            documents=texts,
            ids=ids,
            metadatas=metadatas,
        )
    except Exception as e:
        st.error(f"Error adding embeddings: {e}")
        _drop_collection(client, collection_name)
        return None

    # Store only the collection name (not the object) in session_state, and
    # release the index it replaces.
    previous = st.session_state.get("collection_name")
    st.session_state.collection_name = collection_name
    if previous and previous != collection_name:
        _drop_collection(client, previous)
    return collection_name
122
+
123
+
124
def _distance_to_similarity(distance) -> float:
    """Map a Chroma distance to a score in ``(0, 1]``; smaller distance, higher score.

    ``1 / (1 + d)`` is used because it is monotone for every non-negative
    distance, so the ordering of the scores always matches the ordering of
    the matches regardless of the collection's distance metric. A missing
    distance is reported as ``1.0``.
    """
    if distance is None:
        return 1.0
    return 1.0 / (1.0 + max(0.0, float(distance)))


def _first_query_result(results, key) -> list:
    """Return the per-query list stored under ``key`` for the first query.

    Tolerates the key being absent, ``None`` or an empty list.
    """
    per_query = results.get(key) or [[]]
    return per_query[0] or []


def query_vector_database(query: str, embedding_model: SentenceTransformer,
                          top_k: int = 5) -> List[Dict]:
    """Return the chunks of the active collection closest to ``query``.

    Args:
        query: The user's question.
        embedding_model: Model whose ``encode`` embeds the question.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of ``{"text", "page_number", "similarity"}`` dicts in the
        order Chroma returns them. ``similarity`` is a relevance score in
        ``(0, 1]`` derived from the distance (higher is closer); it is not a
        cosine similarity. The list is empty when no collection is active,
        the collection is empty, ``top_k`` is not positive, or a step fails
        (failures are shown with ``st.error``).
    """
    if "collection_name" not in st.session_state:
        st.error("No active collection found. Upload and process a PDF first.")
        return []

    try:
        client = chromadb.Client()
        collection = client.get_collection(st.session_state.collection_name)
    except Exception as e:
        st.error(f"Error accessing collection: {e}")
        return []

    try:
        query_embedding = embedding_model.encode([query]).tolist()
    except Exception as e:
        st.error(f"Error encoding query: {e}")
        return []

    try:
        # Never ask for more results than the collection holds.
        n_results = min(top_k, collection.count())
        if n_results <= 0:
            return []
        results = collection.query(
            query_embeddings=query_embedding,
            n_results=n_results,
        )
    except Exception as e:
        st.error(f"Error querying database: {e}")
        return []

    documents = _first_query_result(results, "documents")
    metadatas = _first_query_result(results, "metadatas")
    dists = _first_query_result(results, "distances")

    relevant_chunks = []
    for i, doc in enumerate(documents):
        meta = (metadatas[i] if i < len(metadatas) else None) or {}
        distance = dists[i] if i < len(dists) else None
        relevant_chunks.append({
            "text": doc,
            "page_number": meta.get("page_number", "N/A"),
            "similarity": _distance_to_similarity(distance),
        })

    return relevant_chunks
 
 
 
 
 
176
 
177
 
178
  def generate_answer_with_groq(client, query: str, relevant_chunks: List[Dict]) -> str:
179
+ """Generate answer from Groq LLM using retrieved context."""
 
 
 
 
 
180
  try:
 
181
  context_parts = [f"[Page {c['page_number']}]: {c['text']}" for c in relevant_chunks]
182
+ context = "\n\n".join(context_parts) if context_parts else ""
 
 
183
 
184
  prompt = f"""Based ONLY on the following context from a PDF document, answer the user's question.
185
 
 
197
 
198
  Answer:"""
199
 
200
+ if hasattr(client, "chat") and hasattr(client.chat, "completions"):
201
+ chat_resp = client.chat.completions.create(
202
+ model="llama3-8b-8192",
203
+ messages=[
204
+ {"role": "system", "content": "You are a strict assistant that only uses provided context."},
205
+ {"role": "user", "content": prompt}
206
+ ],
207
+ temperature=0.1,
208
+ max_tokens=500
209
+ )
210
+ else:
211
+ chat_resp = client.create(prompt=prompt, max_tokens=500)
212
 
213
+ if hasattr(chat_resp, "choices"):
 
 
214
  return chat_resp.choices[0].message.content
215
  elif isinstance(chat_resp, dict):
216
+ choices = chat_resp.get("choices") or []
 
217
  if choices:
218
+ return choices[0].get("message", {}).get("content") \
219
+ or choices[0].get("text") \
220
+ or str(choices[0])
221
  return str(chat_resp)
222
 
223
  except Exception as e:
224
  return f"Error generating answer: {e}"
225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
+ # -----------------------------
228
+ # Streamlit UI
229
+ # -----------------------------
230
def main():
    """Render the page: PDF upload and indexing in the sidebar, then Q&A.

    A newly selected file (one whose name differs from
    ``st.session_state.processed_file``) is chunked and indexed. A question
    typed in the sidebar is answered from the active collection, and the
    retrieved chunks are listed under the answer.
    """
    st.set_page_config(page_title="PDF Chatbot with Groq", layout="wide")
    st.title("📚 PDF Chatbot with Groq")

    sidebar = st.sidebar

    # --- Upload & indexing -------------------------------------------------
    sidebar.header("Upload PDF")
    uploaded_file = sidebar.file_uploader("Choose a PDF file", type="pdf")

    if uploaded_file:
        already_indexed = (
            st.session_state.get("processed_file") == uploaded_file.name
        )
        if not already_indexed:
            with st.spinner("Processing PDF..."):
                model = load_embedding_model()
                pieces = pdf_to_chunks(uploaded_file)

                if not pieces:
                    st.error("No text extracted from PDF.")
                    return

                created = create_vector_database(pieces, model)
                if created:
                    st.session_state.processed_file = uploaded_file.name
                    st.success("PDF processed and vector database created!")

    # --- Question answering ------------------------------------------------
    sidebar.header("Ask a Question")
    question = sidebar.text_input("Enter your question:")

    if not question:
        return

    if "collection_name" not in st.session_state:
        st.warning("Please upload and process a PDF first.")
        return

    model = load_embedding_model()
    llm = setup_groq()
    if not llm:
        # setup_groq has already reported why no client is available.
        return

    with st.spinner("Generating answer..."):
        hits = query_vector_database(question, model)
        if not hits:
            st.error("No relevant chunks found.")
            return

        reply = generate_answer_with_groq(llm, question, hits)
        st.subheader("Answer:")
        st.write(reply)

        st.subheader("Relevant Chunks:")
        for hit in hits:
            st.markdown(
                f"**Page {hit['page_number']} (Score: {hit['similarity']:.2f})**\n\n"
                f"{hit['text'][:500]}..."
            )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
280
  if __name__ == "__main__":
281
  main()