SurajJha21 committed on
Commit
c125ec2
·
verified ·
1 Parent(s): 54126f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +552 -525
app.py CHANGED
@@ -1,525 +1,552 @@
1
- import streamlit as st
2
- import os
3
- import tempfile
4
- import google.generativeai as genai
5
- from pypdf import PdfReader
6
- from pinecone import Pinecone
7
- import uuid
8
- import time
9
- import json
10
- from dotenv import load_dotenv
11
- from langchain_community.embeddings import HuggingFaceEmbeddings
12
-
13
- # Load environment variables from .env file
14
- load_dotenv()
15
-
16
- # Configuration
17
- st.set_page_config(page_title="PDF Learning Assistant", layout="wide")
18
-
19
- # Initialize session state
20
- if "messages" not in st.session_state:
21
- st.session_state.messages = []
22
- if "current_pdf_content" not in st.session_state:
23
- st.session_state.current_pdf_content = ""
24
- if "current_pdf_name" not in st.session_state:
25
- st.session_state.current_pdf_name = ""
26
- if "index_name" not in st.session_state:
27
- st.session_state.index_name = "index1" # Using your specific index name
28
- if "is_initialized" not in st.session_state:
29
- st.session_state.is_initialized = False
30
- if "index_dimensions" not in st.session_state:
31
- st.session_state.index_dimensions = 1024 # Set this based on your Pinecone index
32
-
33
- # Functions for PDF processing
34
- def extract_text_from_pdf(pdf_file):
35
- reader = PdfReader(pdf_file)
36
- text = ""
37
- for page in reader.pages:
38
- text += page.extract_text() + "\n"
39
- return text
40
-
41
- def chunk_text(text, chunk_size=1000, overlap=200):
42
- chunks = []
43
- start = 0
44
- text_length = len(text)
45
-
46
- while start < text_length:
47
- end = min(start + chunk_size, text_length)
48
- if end < text_length and end - start == chunk_size:
49
- # Find the last period or newline to make more natural chunks
50
- last_period = text.rfind('.', start, end)
51
- last_newline = text.rfind('\n', start, end)
52
- if last_period > start + chunk_size // 2:
53
- end = last_period + 1
54
- elif last_newline > start + chunk_size // 2:
55
- end = last_newline + 1
56
-
57
- chunks.append(text[start:end])
58
- start = end - overlap if end < text_length else text_length
59
-
60
- return chunks
61
-
62
- # Embeddings and Vector Store functions
63
- @st.cache_resource
64
- def get_embedding_model():
65
- # Using a model that produces 1024-dimensional embeddings
66
- return HuggingFaceEmbeddings(model_name="sentence-transformers/all-roberta-large-v1")
67
-
68
-
69
- def initialize_pinecone():
70
- # Get API key from environment variables
71
- api_key = os.getenv("PINECONE_API_KEY")
72
-
73
- if not api_key:
74
- st.error("Pinecone API key not found. Please add it to your .env file as PINECONE_API_KEY=your_api_key")
75
- return False
76
-
77
- try:
78
- # Initialize Pinecone with your specific configuration
79
- pc = Pinecone(api_key=api_key)
80
-
81
- # Store Pinecone client in session state
82
- st.session_state.pinecone_client = pc
83
-
84
- # Check if your index exists
85
- index_list = [idx.name for idx in pc.list_indexes()]
86
- if st.session_state.index_name not in index_list:
87
- st.error(f"Index '{st.session_state.index_name}' not found in your Pinecone account.")
88
- st.info("Available indexes: " + ", ".join(index_list))
89
- return False
90
-
91
- # Get index details to check dimensions
92
- try:
93
- index = pc.Index(st.session_state.index_name)
94
- index_stats = index.describe_index_stats()
95
- if 'dimension' in index_stats:
96
- st.session_state.index_dimensions = index_stats['dimension']
97
- st.info(f"Detected index dimension: {st.session_state.index_dimensions}")
98
- else:
99
- st.warning("Could not detect index dimensions. Using default: 1024")
100
- except Exception as e:
101
- st.warning(f"Could not get index details: {str(e)}")
102
-
103
- return True
104
- except Exception as e:
105
- st.error(f"Error initializing Pinecone: {str(e)}")
106
- return False
107
-
108
- def get_pinecone_index():
109
- # Connect to your existing index
110
- return st.session_state.pinecone_client.Index(st.session_state.index_name)
111
-
112
- def embed_chunks(chunks):
113
- model = get_embedding_model()
114
- embeddings = []
115
- for chunk in chunks:
116
- # HuggingFaceEmbeddings returns a list with a single embedding
117
- embed = model.embed_documents([chunk])[0]
118
- embeddings.append(embed)
119
- return embeddings
120
-
121
- def store_embeddings(chunks, embeddings, pdf_name):
122
- index = get_pinecone_index()
123
- batch_size = 100
124
-
125
- for i in range(0, len(chunks), batch_size):
126
- i_end = min(i + batch_size, len(chunks))
127
- ids = [f"{pdf_name}-{uuid.uuid4()}" for _ in range(i, i_end)]
128
- metadata = [{"text": chunks[j], "pdf_name": pdf_name, "chunk_id": j} for j in range(i, i_end)]
129
- vectors = [(ids[j-i], embeddings[j], metadata[j-i]) for j in range(i, i_end)]
130
-
131
- try:
132
- index.upsert(vectors=vectors)
133
- st.success(f"Successfully stored batch {i//batch_size + 1} of chunks to Pinecone")
134
- except Exception as e:
135
- st.error(f"Error storing embeddings: {str(e)}")
136
- # Display the first embedding's dimension for debugging
137
- if embeddings and len(embeddings) > 0:
138
- st.info(f"Embedding dimension: {len(embeddings[0])}")
139
- return False
140
-
141
- st.success(f"Successfully stored all {len(chunks)} chunks to Pinecone")
142
- return True
143
-
144
- def search_similar_chunks(query, top_k=5, pdf_name=None):
145
- model = get_embedding_model()
146
- query_embedding = model.embed_query(query)
147
- index = get_pinecone_index()
148
-
149
- filter_query = {"pdf_name": pdf_name} if pdf_name else None
150
-
151
- results = index.query(
152
- vector=query_embedding,
153
- top_k=top_k,
154
- include_metadata=True,
155
- filter=filter_query
156
- )
157
-
158
- return results.matches
159
-
160
- # Gemini LLM Integration
161
- @st.cache_resource
162
- def initialize_gemini():
163
- api_key = os.getenv("GOOGLE_API_KEY")
164
-
165
- if not api_key:
166
- st.error("Google API key not found. Please add it to your .env file as GOOGLE_API_KEY=your_api_key")
167
- return False
168
-
169
- try:
170
- genai.configure(api_key=api_key)
171
- return True
172
- except Exception as e:
173
- st.error(f"Error initializing Google Generative AI: {str(e)}")
174
- return False
175
-
176
- def get_gemini_response(prompt, context=None, temperature=0.7):
177
- try:
178
- model = genai.GenerativeModel('gemini-2.0-flash')
179
-
180
- if context:
181
- full_prompt = f"""
182
- Context information:
183
- {context}
184
-
185
- Question: {prompt}
186
-
187
- Please provide a helpful, accurate response based on the context information provided.
188
- If the answer cannot be determined from the context, please state that clearly.
189
- """
190
- else:
191
- full_prompt = prompt
192
-
193
- response = model.generate_content(full_prompt, generation_config={"temperature": temperature})
194
- return response.text
195
- except Exception as e:
196
- st.error(f"Error getting response from Gemini: {str(e)}")
197
- return "Sorry, I couldn't generate a response at this time."
198
-
199
- # Quiz and Assignment Generation
200
- def generate_quiz(pdf_content, num_questions=5):
201
- prompt = f"""
202
- Based on the following content, generate a quiz with {num_questions} multiple-choice questions.
203
- For each question, provide 4 options and indicate the correct answer.
204
- Format the response as a JSON array of question objects with the structure:
205
- [
206
- {{
207
- "question": "Question text",
208
- "options": ["Option A", "Option B", "Option C", "Option D"],
209
- "correct_answer": "Correct option (A, B, C, or D)",
210
- "explanation": "Brief explanation of why this is the correct answer"
211
- }},
212
- // more questions...
213
- ]
214
-
215
- Content: {pdf_content[:2000]}... (truncated for brevity)
216
- """
217
-
218
- response = get_gemini_response(prompt, temperature=0.2)
219
-
220
- try:
221
- # Extract JSON from response if it's embedded in markdown or text
222
- if "```json" in response:
223
- json_start = response.find("```json") + 7
224
- json_end = response.find("```", json_start)
225
- json_str = response[json_start:json_end].strip()
226
- elif "```" in response:
227
- json_start = response.find("```") + 3
228
- json_end = response.find("```", json_start)
229
- json_str = response[json_start:json_end].strip()
230
- else:
231
- json_str = response
232
-
233
- quiz_data = json.loads(json_str)
234
- return quiz_data
235
- except Exception as e:
236
- st.error(f"Error parsing quiz response: {str(e)}")
237
- return []
238
-
239
- def generate_assignment(pdf_content, assignment_type="short_answer", num_questions=3):
240
- prompt = f"""
241
- Based on the following content, generate a {assignment_type} assignment with {num_questions} questions.
242
- If the assignment type is 'short_answer', create questions that require brief explanations.
243
- If the assignment type is 'essay', create deeper questions that require longer responses.
244
- If the assignment type is 'research', create questions that encourage further exploration of the topics.
245
-
246
- Format the response as a JSON array with the structure:
247
- [
248
- {{
249
- "question": "Question text",
250
- "hints": ["Hint 1", "Hint 2"],
251
- "key_points": ["Key point 1", "Key point 2", "Key point 3"]
252
- }},
253
- // more questions...
254
- ]
255
-
256
- Content: {pdf_content[:2000]}... (truncated for brevity)
257
- """
258
-
259
- response = get_gemini_response(prompt, temperature=0.3)
260
-
261
- try:
262
- # Extract JSON from response if it's embedded in markdown or text
263
- if "```json" in response:
264
- json_start = response.find("```json") + 7
265
- json_end = response.find("```", json_start)
266
- json_str = response[json_start:json_end].strip()
267
- elif "```" in response:
268
- json_start = response.find("```") + 3
269
- json_end = response.find("```", json_start)
270
- json_str = response[json_start:json_end].strip()
271
- else:
272
- json_str = response
273
-
274
- assignment_data = json.loads(json_str)
275
- return assignment_data
276
- except Exception as e:
277
- st.error(f"Error parsing assignment response: {str(e)}")
278
- return []
279
-
280
- # Streamlit UI
281
- def main():
282
- st.title("📚 PDF Learning Assistant")
283
-
284
- # Initialize services
285
- if not st.session_state.is_initialized:
286
- with st.spinner("Initializing services..."):
287
- pinecone_init = initialize_pinecone()
288
- gemini_init = initialize_gemini()
289
-
290
- if pinecone_init and gemini_init:
291
- st.session_state.is_initialized = True
292
- st.success("Services initialized successfully!")
293
-
294
- # Display Pinecone connection info
295
- st.info(f"""
296
- Connected to Pinecone index:
297
- - Index name: {st.session_state.index_name}
298
- - Dimension: {st.session_state.index_dimensions}
299
- - Host: https://index1-mwog0w0.svc.aped-4627-b74a.pinecone.io
300
- - Region: us-east-1
301
- - Type: Dense
302
- - Capacity: Serverless
303
- """)
304
- else:
305
- st.error("Failed to initialize all required services. Please check your API keys in the .env file.")
306
-
307
- # Show .env file template
308
- st.code("""
309
- # Create a .env file in the same directory with the following content:
310
- PINECONE_API_KEY=your_pinecone_api_key
311
- GOOGLE_API_KEY=your_google_api_key
312
- """)
313
- return
314
-
315
- # Sidebar for PDF upload and main actions
316
- with st.sidebar:
317
- st.header("Upload PDF")
318
- uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
319
-
320
- if uploaded_file:
321
- with st.spinner("Processing PDF..."):
322
- # Save uploaded file to temp location
323
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
324
- temp_file.write(uploaded_file.getvalue())
325
- temp_path = temp_file.name
326
-
327
- # Extract text
328
- pdf_text = extract_text_from_pdf(temp_path)
329
- os.unlink(temp_path) # Delete temp file
330
-
331
- # Store in session state
332
- st.session_state.current_pdf_content = pdf_text
333
- st.session_state.current_pdf_name = uploaded_file.name
334
-
335
- # Chunk and embed
336
- chunks = chunk_text(pdf_text)
337
- embeddings = embed_chunks(chunks)
338
-
339
- # Store in Pinecone
340
- success = store_embeddings(chunks, embeddings, st.session_state.current_pdf_name)
341
-
342
- if success:
343
- st.success(f"Successfully processed {uploaded_file.name}")
344
- else:
345
- st.error(f"Failed to process {uploaded_file.name}")
346
-
347
- st.divider()
348
- st.header("Learning Tools")
349
-
350
- # Only enable these buttons if a PDF is loaded
351
- if st.session_state.current_pdf_content:
352
- # Quiz generation
353
- quiz_questions = st.slider("Number of quiz questions", min_value=3, max_value=10, value=5)
354
- if st.button("Generate Quiz"):
355
- with st.spinner("Generating quiz..."):
356
- quiz_data = generate_quiz(st.session_state.current_pdf_content, num_questions=quiz_questions)
357
- st.session_state.quiz_data = quiz_data
358
-
359
- # Assignment generation
360
- assignment_type = st.selectbox(
361
- "Assignment Type",
362
- ["short_answer", "essay", "research"]
363
- )
364
- assignment_questions = st.slider("Number of assignment questions", min_value=1, max_value=5, value=3)
365
-
366
- if st.button("Generate Assignment"):
367
- with st.spinner("Generating assignment..."):
368
- assignment_data = generate_assignment(
369
- st.session_state.current_pdf_content,
370
- assignment_type,
371
- num_questions=assignment_questions
372
- )
373
- st.session_state.assignment_data = assignment_data
374
- else:
375
- st.info("Please upload a PDF first to use these features")
376
-
377
- # Main content area
378
- tab1, tab2, tab3 = st.tabs(["Chatbot", "Quiz", "Assignment"])
379
-
380
- # Tab 1: Chatbot
381
- with tab1:
382
- st.header("Chat with your PDF")
383
-
384
- # Display chat messages
385
- for message in st.session_state.messages:
386
- with st.chat_message(message["role"]):
387
- st.write(message["content"])
388
-
389
- # Chat input
390
- if st.session_state.current_pdf_content:
391
- user_input = st.chat_input("Ask a question about your PDF...")
392
-
393
- if user_input:
394
- # Add user message to chat history
395
- st.session_state.messages.append({"role": "user", "content": user_input})
396
- with st.chat_message("user"):
397
- st.write(user_input)
398
-
399
- # Generate response
400
- with st.chat_message("assistant"):
401
- with st.spinner("Thinking..."):
402
- # Search for relevant context
403
- similar_chunks = search_similar_chunks(
404
- user_input,
405
- top_k=3,
406
- pdf_name=st.session_state.current_pdf_name
407
- )
408
-
409
- # Extract text from results
410
- context = "\n\n".join([match.metadata["text"] for match in similar_chunks])
411
-
412
- # Get response from Gemini
413
- response = get_gemini_response(user_input, context)
414
-
415
- st.write(response)
416
-
417
- # Add assistant message to chat history
418
- st.session_state.messages.append({"role": "assistant", "content": response})
419
- else:
420
- st.info("Please upload a PDF to start chatting")
421
-
422
- # Tab 2: Quiz
423
- with tab2:
424
- st.header("Quiz")
425
-
426
- if "quiz_data" in st.session_state and st.session_state.quiz_data:
427
- quiz_data = st.session_state.quiz_data
428
-
429
- if "quiz_answers" not in st.session_state:
430
- st.session_state.quiz_answers = {}
431
- st.session_state.quiz_submitted = False
432
-
433
- if not st.session_state.quiz_submitted:
434
- for i, question in enumerate(quiz_data):
435
- st.subheader(f"Question {i+1}")
436
- st.write(question["question"])
437
-
438
- options = question["options"]
439
- option_labels = ["A", "B", "C", "D"]
440
-
441
- # Create radio buttons for options
442
- answer = st.radio(
443
- "Select your answer:",
444
- options=option_labels[:len(options)],
445
- key=f"q{i}",
446
- index=None
447
- )
448
-
449
- # Display options
450
- for j, option in enumerate(options):
451
- st.write(f"{option_labels[j]}: {option}")
452
-
453
- st.session_state.quiz_answers[i] = answer
454
- st.divider()
455
-
456
- if st.button("Submit Quiz"):
457
- st.session_state.quiz_submitted = True
458
- st.experimental_rerun()
459
- else:
460
- # Show results
461
- correct_count = 0
462
-
463
- for i, question in enumerate(quiz_data):
464
- st.subheader(f"Question {i+1}")
465
- st.write(question["question"])
466
-
467
- options = question["options"]
468
- option_labels = ["A", "B", "C", "D"]
469
- correct_letter = question["correct_answer"]
470
- user_answer = st.session_state.quiz_answers.get(i)
471
-
472
- # Display options with correct/incorrect indicators
473
- for j, option in enumerate(options):
474
- current_label = option_labels[j]
475
- if current_label == correct_letter:
476
- st.success(f"{current_label}: {option} ✓")
477
- if user_answer == current_label:
478
- correct_count += 1
479
- elif user_answer == current_label:
480
- st.error(f"{current_label}: {option} ✗")
481
- else:
482
- st.write(f"{current_label}: {option}")
483
-
484
- # Show explanation
485
- st.info(f"Explanation: {question['explanation']}")
486
- st.divider()
487
-
488
- st.subheader(f"Your Score: {correct_count}/{len(quiz_data)}")
489
-
490
- if st.button("Retake Quiz"):
491
- st.session_state.quiz_submitted = False
492
- st.session_state.quiz_answers = {}
493
- st.experimental_rerun()
494
- else:
495
- st.info("Generate a quiz from the sidebar to see it here")
496
-
497
- # Tab 3: Assignment
498
- with tab3:
499
- st.header("Assignment")
500
-
501
- if "assignment_data" in st.session_state and st.session_state.assignment_data:
502
- assignment_data = st.session_state.assignment_data
503
-
504
- for i, question in enumerate(assignment_data):
505
- with st.expander(f"Question {i+1}", expanded=True):
506
- st.write(question["question"])
507
-
508
- if "hints" in question and question["hints"]:
509
- st.subheader("Hints")
510
- for hint in question["hints"]:
511
- st.write(f"- {hint}")
512
-
513
- # Input area for answers
514
- st.text_area("Your Answer:", key=f"assignment_q{i}", height=150)
515
-
516
- # Reveal key points button
517
- if st.button("Show Key Points", key=f"key_points_btn_{i}"):
518
- st.subheader("Key Points to Include")
519
- for point in question["key_points"]:
520
- st.write(f"- {point}")
521
- else:
522
- st.info("Generate an assignment from the sidebar to see it here")
523
-
524
- if __name__ == "__main__":
525
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import tempfile
4
+ import google.generativeai as genai
5
+ from pypdf import PdfReader
6
+ from pinecone import Pinecone
7
+ import uuid
8
+ import time
9
+ import json
10
+ from dotenv import load_dotenv
11
+ from langchain_community.embeddings import HuggingFaceEmbeddings
12
+
13
+ # Load environment variables from .env file
14
+ load_dotenv()
15
+
16
+ # Configuration
17
+ st.set_page_config(page_title="PDF Learning Assistant", layout="wide")
18
+
19
+ # Initialize session state
20
+ if "messages" not in st.session_state:
21
+ st.session_state.messages = []
22
+ if "current_pdf_content" not in st.session_state:
23
+ st.session_state.current_pdf_content = ""
24
+ if "current_pdf_name" not in st.session_state:
25
+ st.session_state.current_pdf_name = ""
26
+ if "index_name" not in st.session_state:
27
+ st.session_state.index_name = "index1" # Using your specific index name
28
+ if "is_initialized" not in st.session_state:
29
+ st.session_state.is_initialized = False
30
+ if "index_dimensions" not in st.session_state:
31
+ st.session_state.index_dimensions = 1024 # Set this based on your Pinecone index
32
+ if "quiz_submitted" not in st.session_state:
33
+ st.session_state.quiz_submitted = False
34
+ if "quiz_answers" not in st.session_state:
35
+ st.session_state.quiz_answers = {}
36
+
37
+ # Functions for PDF processing
38
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF as one newline-joined string.

    Args:
        pdf_file: A file path or binary file-like object accepted by
            pypdf.PdfReader.

    Returns:
        str: Concatenated page text with one trailing newline per page.
    """
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # pypdf's extract_text() returns None for pages with no extractable
        # text (e.g. scanned images); guard so the concatenation below does
        # not raise TypeError.
        text += (page.extract_text() or "") + "\n"
    return text
44
+
45
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split *text* into overlapping chunks of at most *chunk_size* characters.

    Chunk boundaries are nudged back to the last period or newline inside the
    window (when that keeps the chunk at least half of *chunk_size*) so chunks
    break at natural points.

    Args:
        text: The string to split.
        chunk_size: Maximum characters per chunk.
        overlap: Characters shared between consecutive chunks.

    Returns:
        list[str]: Ordered chunks covering the whole input; empty list for
        empty input.
    """
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = min(start + chunk_size, text_length)
        if end < text_length and end - start == chunk_size:
            # Find the last period or newline to make more natural chunks,
            # but only use it if the chunk stays at least half of chunk_size.
            last_period = text.rfind('.', start, end)
            last_newline = text.rfind('\n', start, end)
            if last_period > start + chunk_size // 2:
                end = last_period + 1
            elif last_newline > start + chunk_size // 2:
                end = last_newline + 1

        chunks.append(text[start:end])
        if end >= text_length:
            break
        # Step back by `overlap` for context continuity, but always advance
        # by at least one character so pathological arguments (e.g.
        # overlap >= chunk_size) cannot cause an infinite loop.
        start = max(start + 1, end - overlap)

    return chunks
65
+
66
+ # Embeddings and Vector Store functions
67
@st.cache_resource
def get_embedding_model():
    """Return the (cached) sentence-transformer embedding model.

    all-roberta-large-v1 emits 1024-dimensional vectors, matching the
    Pinecone index dimension used by this app.
    """
    model_name = "sentence-transformers/all-roberta-large-v1"
    return HuggingFaceEmbeddings(model_name=model_name)
71
+
72
+
73
def initialize_pinecone():
    """Connect to Pinecone and verify the configured index exists.

    Reads PINECONE_API_KEY from the environment, stores the connected client
    in ``st.session_state.pinecone_client``, and (best-effort) records the
    index dimension in ``st.session_state.index_dimensions``.

    Returns:
        bool: True when the client is connected and the configured index
        exists; False otherwise (an st.error/st.info is shown).
    """
    # Get API key from environment variables
    api_key = os.getenv("PINECONE_API_KEY")

    if not api_key:
        st.error("Pinecone API key not found. Please add it to your .env file as PINECONE_API_KEY=your_api_key")
        return False

    try:
        # Initialize Pinecone with your specific configuration
        pc = Pinecone(api_key=api_key)

        # Store Pinecone client in session state
        st.session_state.pinecone_client = pc

        # Check if your index exists
        index_list = [idx.name for idx in pc.list_indexes()]
        if st.session_state.index_name not in index_list:
            st.error(f"Index '{st.session_state.index_name}' not found in your Pinecone account.")
            st.info("Available indexes: " + ", ".join(index_list))
            return False

        # Get index details to check dimensions
        try:
            index = pc.Index(st.session_state.index_name)
            index_stats = index.describe_index_stats()
            if 'dimension' in index_stats:
                st.session_state.index_dimensions = index_stats['dimension']
                st.info(f"Detected index dimension: {st.session_state.index_dimensions}")
            else:
                st.warning("Could not detect index dimensions. Using default: 1024")
        except Exception as e:
            # Dimension detection is best-effort: keep the session-state
            # default (1024) and continue — connection itself succeeded.
            st.warning(f"Could not get index details: {str(e)}")

        return True
    except Exception as e:
        st.error(f"Error initializing Pinecone: {str(e)}")
        return False
111
+
112
def get_pinecone_index():
    """Return a handle to the already-connected Pinecone index."""
    client = st.session_state.pinecone_client
    return client.Index(st.session_state.index_name)
115
+
116
def embed_chunks(chunks):
    """Embed every text chunk with the shared sentence-transformer model.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        list[list[float]]: One embedding vector per chunk, in input order.
    """
    model = get_embedding_model()
    # Embed the whole batch in one call instead of invoking the model once
    # per chunk — embed_documents accepts a list and returns one vector per
    # input, preserving order, so the result is identical but much faster.
    return model.embed_documents(list(chunks))
124
+
125
def store_embeddings(chunks, embeddings, pdf_name):
    """Upsert chunk embeddings (with text metadata) into the Pinecone index.

    Args:
        chunks: List of text chunks.
        embeddings: Parallel list of embedding vectors (same length/order).
        pdf_name: Source PDF name, stored in metadata and used as ID prefix.

    Returns:
        bool: True when every batch upserted successfully, False on the
        first failed batch (an st.error is shown).
    """
    index = get_pinecone_index()
    batch_size = 100

    for i in range(0, len(chunks), batch_size):
        i_end = min(i + batch_size, len(chunks))
        # Random UUID suffix keeps IDs unique across re-uploads of the same PDF.
        ids = [f"{pdf_name}-{uuid.uuid4()}" for _ in range(i, i_end)]
        metadata = [{"text": chunks[j], "pdf_name": pdf_name, "chunk_id": j} for j in range(i, i_end)]
        # NOTE: ids/metadata are batch-local lists (indexed with j-i), while
        # embeddings is the full list (indexed with the global j).
        vectors = [(ids[j-i], embeddings[j], metadata[j-i]) for j in range(i, i_end)]

        try:
            index.upsert(vectors=vectors)
            st.success(f"Successfully stored batch {i//batch_size + 1} of chunks to Pinecone")
        except Exception as e:
            st.error(f"Error storing embeddings: {str(e)}")
            # Display the first embedding's dimension for debugging
            # (a dimension mismatch with the index is the common cause).
            if embeddings and len(embeddings) > 0:
                st.info(f"Embedding dimension: {len(embeddings[0])}")
            return False

    st.success(f"Successfully stored all {len(chunks)} chunks to Pinecone")
    return True
147
+
148
def search_similar_chunks(query, top_k=5, pdf_name=None):
    """Return the top-k Pinecone matches for *query*, optionally per-PDF.

    Args:
        query: Natural-language question to embed and search with.
        top_k: Maximum number of matches to return.
        pdf_name: When given, restrict matches to chunks from that PDF.

    Returns:
        The Pinecone match list (each match carries metadata["text"]).
    """
    embedding = get_embedding_model().embed_query(query)
    metadata_filter = {"pdf_name": pdf_name} if pdf_name else None
    response = get_pinecone_index().query(
        vector=embedding,
        top_k=top_k,
        include_metadata=True,
        filter=metadata_filter,
    )
    return response.matches
163
+
164
+ # Gemini LLM Integration
165
@st.cache_resource
def initialize_gemini():
    """Configure the google-generativeai client from GOOGLE_API_KEY.

    Returns:
        bool: True when configuration succeeded, False otherwise
        (an st.error is shown).
    """
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        st.error("Google API key not found. Please add it to your .env file as GOOGLE_API_KEY=your_api_key")
        return False
    try:
        genai.configure(api_key=api_key)
    except Exception as e:
        st.error(f"Error initializing Google Generative AI: {str(e)}")
        return False
    return True
179
+
180
def get_gemini_response(prompt, context=None, temperature=0.7):
    """Ask gemini-2.0-flash for a response, optionally grounded in context.

    Args:
        prompt: The user question or full instruction to send.
        context: Optional retrieved text; when given, the question is wrapped
            in a grounding template that tells the model to answer from it.
        temperature: Sampling temperature passed to the model.

    Returns:
        str: The model's text response, or a fixed apology string on error
        (an st.error is also shown).
    """
    try:
        model = genai.GenerativeModel('gemini-2.0-flash')

        if context:
            full_prompt = f"""
            Context information:
            {context}

            Question: {prompt}

            Please provide a helpful, accurate response based on the context information provided.
            If the answer cannot be determined from the context, please state that clearly.
            """
        else:
            full_prompt = prompt

        response = model.generate_content(full_prompt, generation_config={"temperature": temperature})
        return response.text
    except Exception as e:
        st.error(f"Error getting response from Gemini: {str(e)}")
        return "Sorry, I couldn't generate a response at this time."
202
+
203
+ # Quiz and Assignment Generation
204
def generate_quiz(pdf_content, num_questions=5):
    """Generate a multiple-choice quiz from PDF text via Gemini.

    Args:
        pdf_content: Full PDF text; only the first 2000 characters are sent.
        num_questions: Number of questions to request.

    Returns:
        list[dict]: Question dicts with "question", "options",
        "correct_answer", and "explanation" keys; empty list when the model
        response cannot be parsed as JSON (an st.error is shown).
    """
    prompt = f"""
    Based on the following content, generate a quiz with {num_questions} multiple-choice questions.
    For each question, provide 4 options and indicate the correct answer.
    Format the response as a JSON array of question objects with the structure:
    [
        {{
            "question": "Question text",
            "options": ["Option A", "Option B", "Option C", "Option D"],
            "correct_answer": "Correct option (A, B, C, or D)",
            "explanation": "Brief explanation of why this is the correct answer"
        }},
        // more questions...
    ]

    Content: {pdf_content[:2000]}... (truncated for brevity)
    """

    # Low temperature keeps the output format predictable enough to parse.
    response = get_gemini_response(prompt, temperature=0.2)

    try:
        # Extract JSON from response if it's embedded in markdown or text
        # (models often wrap JSON in ```json ... ``` fences).
        if "```json" in response:
            json_start = response.find("```json") + 7
            json_end = response.find("```", json_start)
            json_str = response[json_start:json_end].strip()
        elif "```" in response:
            json_start = response.find("```") + 3
            json_end = response.find("```", json_start)
            json_str = response[json_start:json_end].strip()
        else:
            json_str = response

        quiz_data = json.loads(json_str)
        return quiz_data
    except Exception as e:
        st.error(f"Error parsing quiz response: {str(e)}")
        return []
242
+
243
def generate_assignment(pdf_content, assignment_type="short_answer", num_questions=3):
    """Generate an open-ended assignment from PDF text via Gemini.

    Args:
        pdf_content: Full PDF text; only the first 2000 characters are sent.
        assignment_type: One of "short_answer", "essay", or "research";
            steers the question depth in the prompt.
        num_questions: Number of questions to request.

    Returns:
        list[dict]: Question dicts with "question", "hints", and
        "key_points" keys; empty list when the model response cannot be
        parsed as JSON (an st.error is shown).
    """
    prompt = f"""
    Based on the following content, generate a {assignment_type} assignment with {num_questions} questions.
    If the assignment type is 'short_answer', create questions that require brief explanations.
    If the assignment type is 'essay', create deeper questions that require longer responses.
    If the assignment type is 'research', create questions that encourage further exploration of the topics.

    Format the response as a JSON array with the structure:
    [
        {{
            "question": "Question text",
            "hints": ["Hint 1", "Hint 2"],
            "key_points": ["Key point 1", "Key point 2", "Key point 3"]
        }},
        // more questions...
    ]

    Content: {pdf_content[:2000]}... (truncated for brevity)
    """

    response = get_gemini_response(prompt, temperature=0.3)

    try:
        # Extract JSON from response if it's embedded in markdown or text
        # (models often wrap JSON in ```json ... ``` fences).
        if "```json" in response:
            json_start = response.find("```json") + 7
            json_end = response.find("```", json_start)
            json_str = response[json_start:json_end].strip()
        elif "```" in response:
            json_start = response.find("```") + 3
            json_end = response.find("```", json_start)
            json_str = response[json_start:json_end].strip()
        else:
            json_str = response

        assignment_data = json.loads(json_str)
        return assignment_data
    except Exception as e:
        st.error(f"Error parsing assignment response: {str(e)}")
        return []
283
+
284
+ # Callback functions for quiz submission and reset
285
def submit_quiz():
    """Button callback: mark the current quiz as submitted so results render."""
    st.session_state.quiz_submitted = True
287
+
288
def reset_quiz():
    """Button callback: clear submission state and recorded answers for a retake."""
    st.session_state.quiz_submitted = False
    st.session_state.quiz_answers = {}
291
+
292
+ # Streamlit UI
293
+ def main():
294
+ st.title("📚 PDF Learning Assistant")
295
+
296
+ # Initialize services
297
+ if not st.session_state.is_initialized:
298
+ with st.spinner("Initializing services..."):
299
+ pinecone_init = initialize_pinecone()
300
+ gemini_init = initialize_gemini()
301
+
302
+ if pinecone_init and gemini_init:
303
+ st.session_state.is_initialized = True
304
+ st.success("Services initialized successfully!")
305
+
306
+ # Display Pinecone connection info
307
+ st.info(f"""
308
+ Connected to Pinecone index:
309
+ - Index name: {st.session_state.index_name}
310
+ - Dimension: {st.session_state.index_dimensions}
311
+ - Host: https://index1-mwog0w0.svc.aped-4627-b74a.pinecone.io
312
+ - Region: us-east-1
313
+ - Type: Dense
314
+ - Capacity: Serverless
315
+ """)
316
+ else:
317
+ st.error("Failed to initialize all required services. Please check your API keys in the .env file.")
318
+
319
+ # Show .env file template
320
+ st.code("""
321
+ # Create a .env file in the same directory with the following content:
322
+ PINECONE_API_KEY=your_pinecone_api_key
323
+ GOOGLE_API_KEY=your_google_api_key
324
+ """)
325
+ return
326
+
327
+ # Sidebar for PDF upload and main actions
328
+ with st.sidebar:
329
+ st.header("Upload PDF")
330
+ uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
331
+
332
+ if uploaded_file:
333
+ with st.spinner("Processing PDF..."):
334
+ # Save uploaded file to temp location
335
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
336
+ temp_file.write(uploaded_file.getvalue())
337
+ temp_path = temp_file.name
338
+
339
+ # Extract text
340
+ pdf_text = extract_text_from_pdf(temp_path)
341
+ os.unlink(temp_path) # Delete temp file
342
+
343
+ # Store in session state
344
+ st.session_state.current_pdf_content = pdf_text
345
+ st.session_state.current_pdf_name = uploaded_file.name
346
+
347
+ # Chunk and embed
348
+ chunks = chunk_text(pdf_text)
349
+ embeddings = embed_chunks(chunks)
350
+
351
+ # Store in Pinecone
352
+ success = store_embeddings(chunks, embeddings, st.session_state.current_pdf_name)
353
+
354
+ if success:
355
+ st.success(f"Successfully processed {uploaded_file.name}")
356
+ else:
357
+ st.error(f"Failed to process {uploaded_file.name}")
358
+
359
+ st.divider()
360
+ st.header("Learning Tools")
361
+
362
+ # Only enable these buttons if a PDF is loaded
363
+ if st.session_state.current_pdf_content:
364
+ # Quiz generation
365
+ quiz_questions = st.slider("Number of quiz questions", min_value=3, max_value=10, value=5)
366
+ if st.button("Generate Quiz"):
367
+ with st.spinner("Generating quiz..."):
368
+ quiz_data = generate_quiz(st.session_state.current_pdf_content, num_questions=quiz_questions)
369
+ st.session_state.quiz_data = quiz_data
370
+ # Reset quiz state when generating a new quiz
371
+ reset_quiz()
372
+
373
+ # Assignment generation
374
+ assignment_type = st.selectbox(
375
+ "Assignment Type",
376
+ ["short_answer", "essay", "research"]
377
+ )
378
+ assignment_questions = st.slider("Number of assignment questions", min_value=1, max_value=5, value=3)
379
+
380
+ if st.button("Generate Assignment"):
381
+ with st.spinner("Generating assignment..."):
382
+ assignment_data = generate_assignment(
383
+ st.session_state.current_pdf_content,
384
+ assignment_type,
385
+ num_questions=assignment_questions
386
+ )
387
+ st.session_state.assignment_data = assignment_data
388
+ else:
389
+ st.info("Please upload a PDF first to use these features")
390
+
391
+ # Main content area
392
+ tab1, tab2, tab3 = st.tabs(["Chatbot", "Quiz", "Assignment"])
393
+
394
+ # Tab 1: Chatbot
395
+ with tab1:
396
+ st.header("Chat with your PDF")
397
+
398
+ # Display chat messages
399
+ for message in st.session_state.messages:
400
+ with st.chat_message(message["role"]):
401
+ st.write(message["content"])
402
+
403
+ # Chat input
404
+ if st.session_state.current_pdf_content:
405
+ user_input = st.chat_input("Ask a question about your PDF...")
406
+
407
+ if user_input:
408
+ # Add user message to chat history
409
+ st.session_state.messages.append({"role": "user", "content": user_input})
410
+ with st.chat_message("user"):
411
+ st.write(user_input)
412
+
413
+ # Generate response
414
+ with st.chat_message("assistant"):
415
+ with st.spinner("Thinking..."):
416
+ # Search for relevant context
417
+ similar_chunks = search_similar_chunks(
418
+ user_input,
419
+ top_k=3,
420
+ pdf_name=st.session_state.current_pdf_name
421
+ )
422
+
423
+ # Extract text from results
424
+ context = "\n\n".join([match.metadata["text"] for match in similar_chunks])
425
+
426
+ # Get response from Gemini
427
+ response = get_gemini_response(user_input, context)
428
+
429
+ st.write(response)
430
+
431
+ # Add assistant message to chat history
432
+ st.session_state.messages.append({"role": "assistant", "content": response})
433
+ else:
434
+ st.info("Please upload a PDF to start chatting")
435
+
436
+ # Tab 2: Quiz
437
+ with tab2:
438
+ st.header("Quiz")
439
+
440
+ if "quiz_data" in st.session_state and st.session_state.quiz_data:
441
+ quiz_data = st.session_state.quiz_data
442
+
443
+ # Quiz display logic - static until submitted
444
+ if not st.session_state.quiz_submitted:
445
+ # Quiz form
446
+ with st.form(key="quiz_form"):
447
+ for i, question in enumerate(quiz_data):
448
+ st.subheader(f"Question {i+1}")
449
+ st.write(question["question"])
450
+
451
+ options = question["options"]
452
+ option_labels = ["A", "B", "C", "D"]
453
+
454
+ # Create radio buttons for options
455
+ answer = st.radio(
456
+ "Select your answer:",
457
+ options=option_labels[:len(options)],
458
+ key=f"q{i}",
459
+ index=None
460
+ )
461
+
462
+ # Display options
463
+ for j, option in enumerate(options):
464
+ st.write(f"{option_labels[j]}: {option}")
465
+
466
+ # Store answer in session state
467
+ if answer:
468
+ st.session_state.quiz_answers[i] = answer
469
+
470
+ st.divider()
471
+
472
+ # Submit button inside the form
473
+ submit_button = st.form_submit_button("Submit Quiz")
474
+ if submit_button:
475
+ st.session_state.quiz_submitted = True
476
+ else:
477
+ # Show results after submission
478
+ correct_count = 0
479
+
480
+ for i, question in enumerate(quiz_data):
481
+ st.subheader(f"Question {i+1}")
482
+ st.write(question["question"])
483
+
484
+ options = question["options"]
485
+ option_labels = ["A", "B", "C", "D"]
486
+ correct_letter = question["correct_answer"]
487
+ user_answer = st.session_state.quiz_answers.get(i)
488
+
489
+ # Display options with correct/incorrect indicators
490
+ for j, option in enumerate(options):
491
+ current_label = option_labels[j]
492
+ if current_label == correct_letter:
493
+ st.success(f"{current_label}: {option} ✓")
494
+ if user_answer == current_label:
495
+ correct_count += 1
496
+ elif user_answer == current_label:
497
+ st.error(f"{current_label}: {option} ✗")
498
+ else:
499
+ st.write(f"{current_label}: {option}")
500
+
501
+ # Show explanation
502
+ st.info(f"Explanation: {question['explanation']}")
503
+ st.divider()
504
+
505
+ st.subheader(f"Your Score: {correct_count}/{len(quiz_data)}")
506
+
507
+ if st.button("Retake Quiz"):
508
+ reset_quiz()
509
+ else:
510
+ st.info("Generate a quiz from the sidebar to see it here")
511
+
512
+ # Tab 3: Assignment
513
+ # Tab 3: Assignment
514
+ with tab3:
515
+ st.header("Assignment")
516
+
517
+ if "assignment_data" in st.session_state and st.session_state.assignment_data:
518
+ assignment_data = st.session_state.assignment_data
519
+
520
+ # Create a form for the assignment
521
+ with st.form(key="assignment_form"):
522
+ for i, question in enumerate(assignment_data):
523
+ st.subheader(f"Question {i+1}")
524
+ st.write(question["question"])
525
+
526
+ # Use a checkbox to toggle hints instead of a nested expander
527
+ if "hints" in question and question["hints"]:
528
+ show_hints = st.checkbox(f"Show hints for Question {i+1}", key=f"hint_checkbox_{i}")
529
+ if show_hints:
530
+ for hint in question["hints"]:
531
+ st.write(f"- {hint}")
532
+
533
+ # Input area for the answer
534
+ st.text_area("Your Answer:", key=f"assignment_q{i}", height=150)
535
+ st.divider()
536
+
537
+ # Add the submit button as a direct child of the form
538
+ submit_assignment = st.form_submit_button("Submit Assignment")
539
+
540
+ # Process form submission outside the form block
541
+ if submit_assignment:
542
+ st.success("Assignment submitted! Here are the key points for each question:")
543
+ for i, question in enumerate(assignment_data):
544
+ with st.expander(f"Key Points for Question {i+1}", expanded=True):
545
+ for point in question["key_points"]:
546
+ st.write(f"- {point}")
547
+ else:
548
+ st.info("Generate an assignment from the sidebar to see it here")
549
+
550

# Script entry point: run the Streamlit app only when this file is executed
# directly (e.g. `streamlit run app.py`), not when imported as a module.
if __name__ == "__main__":
    main()