SurajJha21 commited on
Commit
45899de
·
verified ·
1 Parent(s): 76d8983

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +525 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import tempfile
4
+ import google.generativeai as genai
5
+ from pypdf import PdfReader
6
+ from pinecone import Pinecone
7
+ import uuid
8
+ import time
9
+ import json
10
+ from dotenv import load_dotenv
11
+ from langchain_community.embeddings import HuggingFaceEmbeddings
12
+
13
# Pull API keys and other settings from a local .env file.
load_dotenv()

# Page-level Streamlit configuration.
st.set_page_config(page_title="PDF Learning Assistant", layout="wide")

# Seed session state with the defaults every feature relies on.
# "index1" is the specific Pinecone index this app targets; 1024 is the
# assumed embedding dimension of that index (re-detected at init time).
_SESSION_DEFAULTS = {
    "messages": [],
    "current_pdf_content": "",
    "current_pdf_name": "",
    "index_name": "index1",
    "is_initialized": False,
    "index_dimensions": 1024,
}
for _key, _value in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _value
32
+
33
+ # Functions for PDF processing
34
def extract_text_from_pdf(pdf_file):
    """Extract plain text from every page of a PDF.

    Args:
        pdf_file: A path or file-like object accepted by ``pypdf.PdfReader``.

    Returns:
        The concatenated text of all pages, one trailing newline per page.
    """
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages with no extractable text
        # (e.g. scanned images); guard so concatenation never raises TypeError.
        text += (page.extract_text() or "") + "\n"
    return text
40
+
41
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split *text* into overlapping chunks of roughly ``chunk_size`` characters.

    Chunk boundaries prefer the last period or newline found in the second
    half of the window, so chunks tend to break at natural points.
    Consecutive chunks share about ``overlap`` characters of context.

    Args:
        text: The text to split.
        chunk_size: Target maximum characters per chunk (must be > 0).
        overlap: Characters shared between consecutive chunks (must be >= 0).

    Returns:
        A list of chunk strings (empty list for empty input).

    Raises:
        ValueError: If ``chunk_size`` <= 0 or ``overlap`` < 0.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap < 0:
        raise ValueError("overlap must be non-negative")

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = min(start + chunk_size, text_length)
        if end < text_length and end - start == chunk_size:
            # Find the last period or newline to make more natural chunks,
            # but only accept it if it falls in the second half of the window.
            last_period = text.rfind('.', start, end)
            last_newline = text.rfind('\n', start, end)
            if last_period > start + chunk_size // 2:
                end = last_period + 1
            elif last_newline > start + chunk_size // 2:
                end = last_newline + 1

        chunks.append(text[start:end])
        # Step forward with overlap. max(..., start + 1) guarantees forward
        # progress even when overlap is large relative to chunk_size, which
        # previously could loop forever (e.g. overlap >= chunk_size).
        start = max(end - overlap, start + 1) if end < text_length else text_length

    return chunks
61
+
62
+ # Embeddings and Vector Store functions
63
@st.cache_resource
def get_embedding_model():
    """Load the sentence-transformer embedder once per server process.

    all-roberta-large-v1 produces 1024-dimensional vectors, matching the
    expected dimension of the Pinecone index.
    """
    model_name = "sentence-transformers/all-roberta-large-v1"
    return HuggingFaceEmbeddings(model_name=model_name)
67
+
68
+
69
def initialize_pinecone():
    """Connect to Pinecone, validate the configured index, and cache the client.

    Side effects: stores the client in ``st.session_state.pinecone_client``
    and, when detectable, updates ``st.session_state.index_dimensions``.
    Returns True on success, False (with a UI error message) otherwise.
    """
    # Get API key from environment variables
    api_key = os.getenv("PINECONE_API_KEY")

    if not api_key:
        st.error("Pinecone API key not found. Please add it to your .env file as PINECONE_API_KEY=your_api_key")
        return False

    try:
        # Initialize Pinecone with your specific configuration
        pc = Pinecone(api_key=api_key)

        # Store Pinecone client in session state so other helpers can reuse it
        st.session_state.pinecone_client = pc

        # Check if your index exists before trying to use it
        index_list = [idx.name for idx in pc.list_indexes()]
        if st.session_state.index_name not in index_list:
            st.error(f"Index '{st.session_state.index_name}' not found in your Pinecone account.")
            st.info("Available indexes: " + ", ".join(index_list))
            return False

        # Get index details to check dimensions — an embedding/index dimension
        # mismatch is the most common upsert failure, so detect it up front.
        try:
            index = pc.Index(st.session_state.index_name)
            index_stats = index.describe_index_stats()
            if 'dimension' in index_stats:
                st.session_state.index_dimensions = index_stats['dimension']
                st.info(f"Detected index dimension: {st.session_state.index_dimensions}")
            else:
                st.warning("Could not detect index dimensions. Using default: 1024")
        except Exception as e:
            # Dimension detection is best-effort; keep going with the default.
            st.warning(f"Could not get index details: {str(e)}")

        return True
    except Exception as e:
        st.error(f"Error initializing Pinecone: {str(e)}")
        return False
107
+
108
def get_pinecone_index():
    """Return a handle to the already-validated Pinecone index.

    Assumes ``initialize_pinecone`` has stored the client in session state.
    """
    client = st.session_state.pinecone_client
    return client.Index(st.session_state.index_name)
111
+
112
def embed_chunks(chunks):
    """Embed each text chunk with the cached sentence-transformer model.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        A list of embedding vectors, one per chunk, in input order.
    """
    model = get_embedding_model()
    # embed_documents accepts the whole batch in one call — previously this
    # invoked the model once per chunk, paying the call overhead N times for
    # an identical result.
    return model.embed_documents(list(chunks))
120
+
121
def store_embeddings(chunks, embeddings, pdf_name):
    """Upsert chunk embeddings into Pinecone in batches of 100.

    Each vector gets a unique id plus metadata carrying the chunk text, the
    source PDF name, and the chunk's position. Returns True when every batch
    is stored, False as soon as one upsert fails.
    """
    index = get_pinecone_index()
    batch_size = 100

    for batch_start in range(0, len(chunks), batch_size):
        batch_end = min(batch_start + batch_size, len(chunks))

        # Build (id, vector, metadata) triples for this slice of chunks.
        batch = []
        for chunk_id in range(batch_start, batch_end):
            vector_id = f"{pdf_name}-{uuid.uuid4()}"
            meta = {"text": chunks[chunk_id], "pdf_name": pdf_name, "chunk_id": chunk_id}
            batch.append((vector_id, embeddings[chunk_id], meta))

        try:
            index.upsert(vectors=batch)
            st.success(f"Successfully stored batch {batch_start//batch_size + 1} of chunks to Pinecone")
        except Exception as e:
            st.error(f"Error storing embeddings: {str(e)}")
            # Surface the embedding dimension for debugging — a mismatch with
            # the index dimension is the usual cause of upsert failures.
            if embeddings:
                st.info(f"Embedding dimension: {len(embeddings[0])}")
            return False

    st.success(f"Successfully stored all {len(chunks)} chunks to Pinecone")
    return True
143
+
144
def search_similar_chunks(query, top_k=5, pdf_name=None):
    """Return the ``top_k`` stored chunks most similar to *query*.

    When ``pdf_name`` is given, the search is restricted to chunks whose
    metadata came from that document. Returns Pinecone match objects
    (each carrying ``metadata["text"]``).
    """
    embedder = get_embedding_model()
    query_vector = embedder.embed_query(query)

    # Restrict the search to one document when a PDF name is given.
    metadata_filter = None
    if pdf_name:
        metadata_filter = {"pdf_name": pdf_name}

    response = get_pinecone_index().query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
        filter=metadata_filter,
    )
    return response.matches
159
+
160
+ # Gemini LLM Integration
161
@st.cache_resource
def initialize_gemini():
    """Configure the google-generativeai client from the GOOGLE_API_KEY env var.

    Returns True on success; shows a Streamlit error and returns False when
    the key is missing or configuration fails.
    """
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        st.error("Google API key not found. Please add it to your .env file as GOOGLE_API_KEY=your_api_key")
        return False

    try:
        genai.configure(api_key=api_key)
    except Exception as e:
        st.error(f"Error initializing Google Generative AI: {str(e)}")
        return False
    return True
175
+
176
def get_gemini_response(prompt, context=None, temperature=0.7):
    """Send a prompt (optionally grounded in retrieved context) to Gemini.

    Args:
        prompt: The user question or instruction.
        context: Optional retrieved text to ground the answer in; when given,
            the model is told to answer only from this context.
        temperature: Sampling temperature passed to generate_content.

    Returns:
        The model's text response, or a fixed apology string on any failure.
    """
    try:
        model = genai.GenerativeModel('gemini-2.0-flash')

        if context:
            # RAG-style prompt: context first, then the question, with an
            # instruction to admit when the context is insufficient.
            full_prompt = f"""
            Context information:
            {context}

            Question: {prompt}

            Please provide a helpful, accurate response based on the context information provided.
            If the answer cannot be determined from the context, please state that clearly.
            """
        else:
            full_prompt = prompt

        response = model.generate_content(full_prompt, generation_config={"temperature": temperature})
        return response.text
    except Exception as e:
        st.error(f"Error getting response from Gemini: {str(e)}")
        return "Sorry, I couldn't generate a response at this time."
198
+
199
+ # Quiz and Assignment Generation
200
def extract_json_payload(response):
    """Return the JSON portion of a model response, stripping a markdown fence.

    Handles ```json ...``` and plain ``` ...``` fences; returns the raw
    response unchanged when no fence is present.
    """
    if "```json" in response:
        json_start = response.find("```json") + 7
    elif "```" in response:
        json_start = response.find("```") + 3
    else:
        return response
    json_end = response.find("```", json_start)
    if json_end == -1:
        # Unterminated fence: find() returns -1, and the old slice
        # response[start:-1] silently dropped the last character, corrupting
        # the JSON. Take everything after the opening fence instead.
        json_end = len(response)
    return response[json_start:json_end].strip()


def generate_quiz(pdf_content, num_questions=5):
    """Ask Gemini for a multiple-choice quiz about the PDF content.

    Args:
        pdf_content: Full extracted text of the PDF (only the first 2000
            characters are sent to keep the prompt small).
        num_questions: How many questions to request.

    Returns:
        A list of question dicts (question/options/correct_answer/explanation),
        or [] when the model's response cannot be parsed as JSON.
    """
    prompt = f"""
    Based on the following content, generate a quiz with {num_questions} multiple-choice questions.
    For each question, provide 4 options and indicate the correct answer.
    Format the response as a JSON array of question objects with the structure:
    [
        {{
            "question": "Question text",
            "options": ["Option A", "Option B", "Option C", "Option D"],
            "correct_answer": "Correct option (A, B, C, or D)",
            "explanation": "Brief explanation of why this is the correct answer"
        }},
        // more questions...
    ]

    Content: {pdf_content[:2000]}... (truncated for brevity)
    """

    # Low temperature: we want well-formed JSON, not creativity.
    response = get_gemini_response(prompt, temperature=0.2)

    try:
        quiz_data = json.loads(extract_json_payload(response))
        return quiz_data
    except Exception as e:
        st.error(f"Error parsing quiz response: {str(e)}")
        return []
238
+
239
def generate_assignment(pdf_content, assignment_type="short_answer", num_questions=3):
    """Ask Gemini for a JSON-formatted assignment based on the PDF content.

    ``assignment_type`` is one of 'short_answer', 'essay' or 'research'.
    Returns a list of question dicts (question/hints/key_points), or []
    when the model's response cannot be parsed as JSON.
    """
    prompt = f"""
    Based on the following content, generate a {assignment_type} assignment with {num_questions} questions.
    If the assignment type is 'short_answer', create questions that require brief explanations.
    If the assignment type is 'essay', create deeper questions that require longer responses.
    If the assignment type is 'research', create questions that encourage further exploration of the topics.

    Format the response as a JSON array with the structure:
    [
        {{
            "question": "Question text",
            "hints": ["Hint 1", "Hint 2"],
            "key_points": ["Key point 1", "Key point 2", "Key point 3"]
        }},
        // more questions...
    ]

    Content: {pdf_content[:2000]}... (truncated for brevity)
    """

    response = get_gemini_response(prompt, temperature=0.3)

    try:
        # The model often wraps its JSON in a markdown code fence; strip it.
        for fence, skip in (("```json", 7), ("```", 3)):
            if fence in response:
                payload_start = response.find(fence) + skip
                payload_end = response.find("```", payload_start)
                json_str = response[payload_start:payload_end].strip()
                break
        else:
            json_str = response

        return json.loads(json_str)
    except Exception as e:
        st.error(f"Error parsing assignment response: {str(e)}")
        return []
279
+
280
+ # Streamlit UI
281
def main():
    """Render the Streamlit app: service setup, PDF-upload sidebar, and the
    Chatbot / Quiz / Assignment tabs."""
    st.title("📚 PDF Learning Assistant")

    # Initialize services once; on failure, show setup instructions and stop
    # rendering the rest of the page.
    if not st.session_state.is_initialized:
        with st.spinner("Initializing services..."):
            pinecone_init = initialize_pinecone()
            gemini_init = initialize_gemini()

        if pinecone_init and gemini_init:
            st.session_state.is_initialized = True
            st.success("Services initialized successfully!")

            # Display Pinecone connection info.
            # NOTE(review): host/region below are hard-coded display text —
            # confirm they still match the actual index configuration.
            st.info(f"""
            Connected to Pinecone index:
            - Index name: {st.session_state.index_name}
            - Dimension: {st.session_state.index_dimensions}
            - Host: https://index1-mwog0w0.svc.aped-4627-b74a.pinecone.io
            - Region: us-east-1
            - Type: Dense
            - Capacity: Serverless
            """)
        else:
            st.error("Failed to initialize all required services. Please check your API keys in the .env file.")

            # Show .env file template
            st.code("""
            # Create a .env file in the same directory with the following content:
            PINECONE_API_KEY=your_pinecone_api_key
            GOOGLE_API_KEY=your_google_api_key
            """)
            return

    # Sidebar for PDF upload and main actions
    with st.sidebar:
        st.header("Upload PDF")
        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

        if uploaded_file:
            with st.spinner("Processing PDF..."):
                # Save uploaded file to a temp location so pypdf can read it
                # from a path.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                    temp_file.write(uploaded_file.getvalue())
                    temp_path = temp_file.name

                # Extract text
                pdf_text = extract_text_from_pdf(temp_path)
                os.unlink(temp_path)  # Delete temp file

                # Store in session state
                st.session_state.current_pdf_content = pdf_text
                st.session_state.current_pdf_name = uploaded_file.name

                # Chunk and embed
                chunks = chunk_text(pdf_text)
                embeddings = embed_chunks(chunks)

                # Store in Pinecone
                # NOTE(review): this re-embeds and re-upserts on every rerun
                # while a file is attached to the uploader — verify whether
                # duplicate vectors in the index are acceptable here.
                success = store_embeddings(chunks, embeddings, st.session_state.current_pdf_name)

                if success:
                    st.success(f"Successfully processed {uploaded_file.name}")
                else:
                    st.error(f"Failed to process {uploaded_file.name}")

        st.divider()
        st.header("Learning Tools")

        # Only enable these tools if a PDF is loaded
        if st.session_state.current_pdf_content:
            # Quiz generation
            quiz_questions = st.slider("Number of quiz questions", min_value=3, max_value=10, value=5)
            if st.button("Generate Quiz"):
                with st.spinner("Generating quiz..."):
                    quiz_data = generate_quiz(st.session_state.current_pdf_content, num_questions=quiz_questions)
                    st.session_state.quiz_data = quiz_data

            # Assignment generation
            assignment_type = st.selectbox(
                "Assignment Type",
                ["short_answer", "essay", "research"]
            )
            assignment_questions = st.slider("Number of assignment questions", min_value=1, max_value=5, value=3)

            if st.button("Generate Assignment"):
                with st.spinner("Generating assignment..."):
                    assignment_data = generate_assignment(
                        st.session_state.current_pdf_content,
                        assignment_type,
                        num_questions=assignment_questions
                    )
                    st.session_state.assignment_data = assignment_data
        else:
            st.info("Please upload a PDF first to use these features")

    # Main content area
    tab1, tab2, tab3 = st.tabs(["Chatbot", "Quiz", "Assignment"])

    # Tab 1: Chatbot — RAG loop: retrieve similar chunks, then ask Gemini
    # with that context.
    with tab1:
        st.header("Chat with your PDF")

        # Replay chat history
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.write(message["content"])

        # Chat input
        if st.session_state.current_pdf_content:
            user_input = st.chat_input("Ask a question about your PDF...")

            if user_input:
                # Add user message to chat history
                st.session_state.messages.append({"role": "user", "content": user_input})
                with st.chat_message("user"):
                    st.write(user_input)

                # Generate response
                with st.chat_message("assistant"):
                    with st.spinner("Thinking..."):
                        # Search for relevant context in the current PDF only
                        similar_chunks = search_similar_chunks(
                            user_input,
                            top_k=3,
                            pdf_name=st.session_state.current_pdf_name
                        )

                        # Extract text from the matches to build the context
                        context = "\n\n".join([match.metadata["text"] for match in similar_chunks])

                        # Get response from Gemini
                        response = get_gemini_response(user_input, context)

                        st.write(response)

                # Add assistant message to chat history
                st.session_state.messages.append({"role": "assistant", "content": response})
        else:
            st.info("Please upload a PDF to start chatting")

    # Tab 2: Quiz — two phases (answering, then results) toggled by
    # quiz_submitted in session state.
    with tab2:
        st.header("Quiz")

        if "quiz_data" in st.session_state and st.session_state.quiz_data:
            quiz_data = st.session_state.quiz_data

            if "quiz_answers" not in st.session_state:
                st.session_state.quiz_answers = {}
                st.session_state.quiz_submitted = False

            if not st.session_state.quiz_submitted:
                # Answering phase: one radio group per question.
                for i, question in enumerate(quiz_data):
                    st.subheader(f"Question {i+1}")
                    st.write(question["question"])

                    options = question["options"]
                    option_labels = ["A", "B", "C", "D"]

                    # Radio buttons carry only the letter; the option text is
                    # written out below.
                    answer = st.radio(
                        "Select your answer:",
                        options=option_labels[:len(options)],
                        key=f"q{i}",
                        index=None
                    )

                    # Display options
                    for j, option in enumerate(options):
                        st.write(f"{option_labels[j]}: {option}")

                    st.session_state.quiz_answers[i] = answer
                    st.divider()

                if st.button("Submit Quiz"):
                    st.session_state.quiz_submitted = True
                    # NOTE(review): st.experimental_rerun() is deprecated in
                    # newer Streamlit releases in favor of st.rerun() —
                    # confirm the pinned Streamlit version still supports it.
                    st.experimental_rerun()
            else:
                # Results phase: grade against correct_answer letters.
                correct_count = 0

                for i, question in enumerate(quiz_data):
                    st.subheader(f"Question {i+1}")
                    st.write(question["question"])

                    options = question["options"]
                    option_labels = ["A", "B", "C", "D"]
                    correct_letter = question["correct_answer"]
                    user_answer = st.session_state.quiz_answers.get(i)

                    # Display options with correct/incorrect indicators
                    for j, option in enumerate(options):
                        current_label = option_labels[j]
                        if current_label == correct_letter:
                            st.success(f"{current_label}: {option} ✓")
                            if user_answer == current_label:
                                correct_count += 1
                        elif user_answer == current_label:
                            st.error(f"{current_label}: {option} ✗")
                        else:
                            st.write(f"{current_label}: {option}")

                    # Show explanation
                    st.info(f"Explanation: {question['explanation']}")
                    st.divider()

                st.subheader(f"Your Score: {correct_count}/{len(quiz_data)}")

                if st.button("Retake Quiz"):
                    st.session_state.quiz_submitted = False
                    st.session_state.quiz_answers = {}
                    st.experimental_rerun()
        else:
            st.info("Generate a quiz from the sidebar to see it here")

    # Tab 3: Assignment — one expander per question with a free-text answer
    # area and an on-demand key-points reveal.
    with tab3:
        st.header("Assignment")

        if "assignment_data" in st.session_state and st.session_state.assignment_data:
            assignment_data = st.session_state.assignment_data

            for i, question in enumerate(assignment_data):
                with st.expander(f"Question {i+1}", expanded=True):
                    st.write(question["question"])

                    if "hints" in question and question["hints"]:
                        st.subheader("Hints")
                        for hint in question["hints"]:
                            st.write(f"- {hint}")

                    # Input area for answers (kept in widget state by key)
                    st.text_area("Your Answer:", key=f"assignment_q{i}", height=150)

                    # Reveal key points button
                    if st.button("Show Key Points", key=f"key_points_btn_{i}"):
                        st.subheader("Key Points to Include")
                        for point in question["key_points"]:
                            st.write(f"- {point}")
        else:
            st.info("Generate an assignment from the sidebar to see it here")
523
+
524
# Script entry point: render the app when run directly (streamlit run app.py).
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
streamlit
pypdf
sentence-transformers
pinecone
google-generativeai
python-dotenv
langchain-community