ahm14 committed on
Commit
efbebca
·
verified ·
1 Parent(s): ca26c78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +263 -263
app.py CHANGED
@@ -1,263 +1,263 @@
1
- import streamlit as st
2
- from langchain_groq import ChatGroq
3
- from langchain_core.output_parsers import StrOutputParser
4
- from langchain_core.prompts import ChatPromptTemplate
5
- from dotenv import load_dotenv
6
- import os
7
- import pytesseract
8
- from PIL import Image
9
- import pdfplumber
10
- import docx
11
- from io import BytesIO
12
- from sentence_transformers import SentenceTransformer
13
- from pinecone import Pinecone, ServerlessSpec
14
- import logging
15
-
16
- # Load environment variables
17
- load_dotenv()
18
-
19
- # Initialize logging
20
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
21
-
22
- # Initialize LLM
23
- llm = ChatGroq(temperature=0.5, groq_api_key="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", model_name="llama3-8b-8192")
24
-
25
- # Initialize Pinecone for vector storage
26
- PINECONE_API_KEY = "pcsk_6PtxDh_6tortuWyNhXdmVrAjx1ZSv8bQRcbgbE7j3JtwwcpMCkFfdsp6VC925WxmqpNYQC"
27
- pc = Pinecone(api_key=PINECONE_API_KEY)
28
-
29
- cloud = os.getenv('PINECONE_CLOUD', 'aws')
30
- region = os.getenv('PINECONE_REGION', 'us-east-1')
31
-
32
- spec = ServerlessSpec(cloud=cloud, region=region)
33
-
34
- index_name = "syllabus-index"
35
- if index_name not in pc.list_indexes().names():
36
- pc.create_index(
37
- name=index_name,
38
- dimension=384,
39
- spec=spec
40
- )
41
-
42
- index = pc.Index(index_name)
43
-
44
- # Initialize embedding model
45
- embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
46
-
47
- # OCR Configuration for Pytesseract
48
- pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Adjust to your system's path
49
-
50
- # Function to extract text, images, tables, and formulas from PDF
51
- def extract_pdf_data(pdf_path):
52
- data = {"text": "", "tables": [], "images": []}
53
- try:
54
- with pdfplumber.open(pdf_path) as pdf:
55
- for page in pdf.pages:
56
- # Extract Text
57
- data["text"] += page.extract_text() or ""
58
- # Extract Tables
59
- tables = page.extract_tables()
60
- for table in tables:
61
- data["tables"].append(table)
62
- # Extract Images
63
- for image in page.images:
64
- base_image = pdf.extract_image(image["object_number"])
65
- image_obj = Image.open(BytesIO(base_image["image"]))
66
- data["images"].append(image_obj)
67
- except Exception as e:
68
- logging.error(f"Error processing PDF: {e}")
69
- return data
70
-
71
- # Function to extract text from DOCX files
72
- def extract_docx_data(docx_file):
73
- doc = docx.Document(docx_file)
74
- text = ""
75
- for para in doc.paragraphs:
76
- text += para.text + "\n"
77
- return text
78
-
79
- # Function to extract text from plain text files
80
- def extract_text_file_data(text_file):
81
- return text_file.read().decode('utf-8')
82
-
83
- # Function to extract text from images using OCR
84
- def extract_text_from_images(images):
85
- ocr_text = ""
86
- for image in images:
87
- ocr_text += pytesseract.image_to_string(image) + "\n"
88
- return ocr_text
89
-
90
- # Function to process extracted content (PDF, DOCX, etc.)
91
- def process_content(file_data, file_type="pdf"):
92
- text = ""
93
- images = []
94
- if file_type == "pdf":
95
- pdf_data = extract_pdf_data(file_data)
96
- text = process_pdf_content(pdf_data)
97
- images = pdf_data["images"]
98
- elif file_type == "docx":
99
- text = extract_docx_data(file_data)
100
- elif file_type == "txt":
101
- text = extract_text_file_data(file_data)
102
-
103
- ocr_text = extract_text_from_images(images)
104
- return text + "\n" + ocr_text
105
-
106
- # Function to process PDF content
107
- def process_pdf_content(pdf_data):
108
- # Process OCR text from images
109
- ocr_text = extract_text_from_images(pdf_data["images"])
110
- combined_text = pdf_data["text"] + ocr_text
111
-
112
- # Process tables into readable text
113
- table_text = ""
114
- for table in pdf_data["tables"]:
115
- table_rows = [" | ".join(row) for row in table]
116
- table_text += "\n".join(table_rows) + "\n"
117
-
118
- return combined_text + "\n" + table_text
119
-
120
- # Function to add syllabus to vector database
121
- def add_syllabus_to_index(syllabus_text):
122
- sentences = syllabus_text.split(". ")
123
- embeddings = embedder.encode(sentences, batch_size=32, show_progress_bar=True)
124
- for i, sentence in enumerate(sentences):
125
- index.upsert([(f"sentence-{i}", embeddings[i].tolist(), {"text": sentence})])
126
-
127
- # Function to retrieve relevant syllabus content
128
- def retrieve_relevant_content(query):
129
- try:
130
- query_embedding = embedder.encode([query])
131
- results = index.query(vector=query_embedding.tolist(), top_k=5, include_metadata=True)
132
- relevant_content = "\n".join([match["metadata"]["text"] for match in results["matches"]])
133
- return relevant_content
134
- except Exception as e:
135
- logging.error(f"Error retrieving content: {e}")
136
- return ""
137
-
138
- # Function to generate questions
139
- def generate_questions(question_type, subject_name, syllabus_context, num_questions, difficulty_level):
140
- prompt_template = f"""
141
- Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
142
-
143
- Subject: {subject_name}
144
- Syllabus Content: {syllabus_context}
145
-
146
- Difficulty Levels:
147
- - Remember: {difficulty_level.get('Remember', 0)}
148
- - Understand: {difficulty_level.get('Understand', 0)}
149
- - Apply: {difficulty_level.get('Apply', 0)}
150
- - Analyze: {difficulty_level.get('Analyze', 0)}
151
- - Evaluate: {difficulty_level.get('Evaluate', 0)}
152
- - Create: {difficulty_level.get('Create', 0)}
153
-
154
- Format questions as follows:
155
- Q1. ________________
156
-
157
- Q2. ________________
158
-
159
- ...
160
- """
161
- chain = (ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser())
162
- try:
163
- return chain.invoke({})
164
- except Exception as e:
165
- logging.error(f"Error generating {question_type} questions: {e}")
166
- return ""
167
-
168
- # Function to generate answers
169
- def generate_answers(questions, syllabus_context):
170
- prompt = f"""
171
- Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.
172
-
173
- Syllabus Content: {syllabus_context}
174
-
175
- Questions:
176
- {questions}
177
-
178
- Format answers as follows:
179
- Answer 1: ________________
180
- Answer 2: ________________
181
- ...
182
- """
183
- chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
184
- try:
185
- return chain.invoke({})
186
- except Exception as e:
187
- logging.error(f"Error generating answers: {e}")
188
- return ""
189
-
190
- # Streamlit app
191
- st.title("Bloom Taxonomy Based Exam Paper Developer")
192
-
193
- # Sidebar inputs
194
- instructor_name = st.sidebar.text_input("Instructor")
195
- class_name = st.sidebar.text_input("Class")
196
- institution_name = st.sidebar.text_input("Institution")
197
- subject_name = st.sidebar.text_input("Subject")
198
-
199
- # Syllabus Upload
200
- uploaded_file = st.sidebar.file_uploader("Upload Syllabus (PDF, DOCX, TXT, Image)", type=["pdf", "docx", "txt", "png", "jpg"])
201
- syllabus_text = None
202
- if uploaded_file:
203
- file_type = uploaded_file.type.split("/")[1]
204
- st.sidebar.markdown("✅ Syllabus uploaded")
205
- syllabus_text = process_content(uploaded_file, file_type)
206
- add_syllabus_to_index(syllabus_text)
207
-
208
- # Preview of Syllabus
209
- if syllabus_text:
210
- st.subheader("Syllabus Preview:")
211
- st.text_area("Extracted Content", syllabus_text[:1000], height=300)
212
-
213
- # Question Type Selection
214
- question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
215
- difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
216
- difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
217
- num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)
218
-
219
- # Instructor Feedback Option
220
- feedback = st.sidebar.text_area("Instructor Feedback (Optional)")
221
-
222
- # Generate Questions
223
- if st.sidebar.button("Generate Questions"):
224
- if syllabus_text:
225
- with st.spinner(f"Generating {question_type}..."):
226
- syllabus_context = retrieve_relevant_content(f"Generate {question_type} based on syllabus")
227
- st.session_state.generated_questions = generate_questions(question_type, subject_name, syllabus_context, num_questions, difficulty)
228
- st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
229
- else:
230
- st.error("Please upload a syllabus before generating questions.")
231
-
232
- # Generate Answers
233
- if st.sidebar.button("Generate Answers for Questions"):
234
- if "generated_questions" in st.session_state and st.session_state.generated_questions:
235
- with st.spinner("Generating answers..."):
236
- syllabus_context = retrieve_relevant_content("Generate answers from syllabus")
237
- st.session_state.generated_answers = generate_answers(st.session_state.generated_questions, syllabus_context)
238
- st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
239
- else:
240
- st.error("Generate questions first before generating answers.")
241
-
242
- # Download Options
243
- if "generated_questions" in st.session_state and st.session_state.generated_questions:
244
- st.sidebar.download_button(
245
- label="Download Questions",
246
- data=st.session_state.generated_questions,
247
- file_name=f"{subject_name}_questions.txt",
248
- mime="text/plain",
249
- )
250
-
251
- if "generated_answers" in st.session_state and st.session_state.generated_answers:
252
- st.sidebar.download_button(
253
- label="Download Answers",
254
- data=st.session_state.generated_answers,
255
- file_name=f"{subject_name}_answers.txt",
256
- mime="text/plain",
257
- )
258
-
259
- # Application Footer
260
- st.markdown("""
261
- ---
262
- **Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
263
- """)
 
1
import streamlit as st
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import os
import pytesseract
from PIL import Image
import pdfplumber
import docx
from io import BytesIO
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
import logging

# Load environment variables from a local .env file (if present).
load_dotenv()

# Configure application-wide logging.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# SECURITY: API keys were previously hard-coded in this file. Keys committed to
# a public repository must be treated as compromised and rotated. Read them
# from the environment (populated by load_dotenv above) instead.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Initialize the Groq-hosted LLM used for question/answer generation.
llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")

# Initialize Pinecone for vector storage.
pc = Pinecone(api_key=PINECONE_API_KEY)

cloud = os.getenv('PINECONE_CLOUD', 'aws')
region = os.getenv('PINECONE_REGION', 'us-east-1')
spec = ServerlessSpec(cloud=cloud, region=region)

# Create the index on first run; dimension 384 matches the output size of
# the all-MiniLM-L6-v2 embedding model used below.
index_name = "syllabus-index"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        spec=spec,
    )

index = pc.Index(index_name)

# Sentence embedding model (384-dimensional vectors).
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Path to the tesseract binary used by pytesseract for OCR.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Adjust to your system's path
50
# Function to extract text, tables, and embedded images from a PDF
def extract_pdf_data(pdf_path):
    """Extract text, tables, and embedded images from a PDF.

    Returns a dict with keys "text" (str), "tables" (list of row-lists as
    produced by pdfplumber), and "images" (list of PIL.Image objects).
    On any error, whatever was collected so far is returned.
    """
    data = {"text": "", "tables": [], "images": []}
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract text (extract_text() returns None for image-only pages).
                data["text"] += page.extract_text() or ""
                # Extract tables.
                data["tables"].extend(page.extract_tables())
                # Extract embedded images. NOTE(review): pdfplumber's PDF object
                # has no extract_image() method (that API belongs to PyMuPDF);
                # the previous code raised AttributeError on the first image and
                # aborted extraction of all remaining pages. Decode each image's
                # raw content stream instead, and isolate failures per image so
                # one undecodable stream does not abort the whole document.
                for image in page.images:
                    try:
                        raw = image["stream"].get_data()
                        data["images"].append(Image.open(BytesIO(raw)))
                    except Exception as img_err:
                        logging.warning(f"Skipping unreadable PDF image: {img_err}")
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    return data
70
+
71
# Function to extract text from DOCX files
def extract_docx_data(docx_file):
    """Return the text of every paragraph in *docx_file*, one paragraph per line."""
    document = docx.Document(docx_file)
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)
78
+
79
# Function to extract text from plain text files
def extract_text_file_data(text_file):
    """Decode an uploaded plain-text file as UTF-8.

    Undecodable bytes are replaced with U+FFFD rather than raising
    UnicodeDecodeError, so a non-UTF-8 upload no longer crashes the app.
    """
    return text_file.read().decode('utf-8', errors='replace')
82
+
83
# Function to extract text from images using OCR
def extract_text_from_images(images):
    """Run OCR over each PIL image and concatenate the results, one newline-terminated block per image."""
    return "".join(pytesseract.image_to_string(img) + "\n" for img in images)
89
+
90
# Function to process extracted content (PDF, DOCX, TXT, or image uploads)
def process_content(file_data, file_type="pdf"):
    """Dispatch extraction by file type and return a single text blob.

    file_type: "pdf", "docx", "txt", or an image type ("png"/"jpg"/"jpeg").
    Two fixes versus the previous version:
    - image uploads were accepted by the uploader but silently ignored here;
      they are now routed through OCR;
    - PDF images were OCR'd twice (process_pdf_content already OCRs them),
      duplicating that text in the output.
    """
    text = ""
    images = []
    if file_type == "pdf":
        pdf_data = extract_pdf_data(file_data)
        # process_pdf_content already includes OCR of the embedded images;
        # do not OCR them a second time here.
        text = process_pdf_content(pdf_data)
    elif file_type == "docx":
        text = extract_docx_data(file_data)
    elif file_type == "txt":
        text = extract_text_file_data(file_data)
    elif file_type in ("png", "jpg", "jpeg"):
        # Direct image upload: OCR the image itself.
        images = [Image.open(file_data)]

    ocr_text = extract_text_from_images(images)
    return text + "\n" + ocr_text
105
+
106
# Function to process PDF content
def process_pdf_content(pdf_data):
    """Flatten extract_pdf_data() output into one text blob.

    Combines page text, OCR text from embedded images, and tables rendered
    as pipe-separated rows.
    """
    # OCR text from embedded images.
    ocr_text = extract_text_from_images(pdf_data["images"])
    combined_text = pdf_data["text"] + ocr_text

    # Render tables as readable text. pdfplumber represents empty cells as
    # None, which previously crashed str.join — coerce every cell to str.
    table_text = ""
    for table in pdf_data["tables"]:
        table_rows = [
            " | ".join("" if cell is None else str(cell) for cell in row)
            for row in table
        ]
        table_text += "\n".join(table_rows) + "\n"

    return combined_text + "\n" + table_text
119
+
120
# Function to add syllabus to vector database
def add_syllabus_to_index(syllabus_text):
    """Embed syllabus sentences and upsert them into the Pinecone index.

    Sentences are embedded once and upserted in batches instead of one
    network round-trip per sentence; empty fragments are skipped.
    """
    sentences = [s.strip() for s in syllabus_text.split(". ") if s.strip()]
    if not sentences:
        return
    embeddings = embedder.encode(sentences, batch_size=32, show_progress_bar=True)
    vectors = [
        (f"sentence-{i}", embedding.tolist(), {"text": sentence})
        for i, (sentence, embedding) in enumerate(zip(sentences, embeddings))
    ]
    # Batch upserts (100 vectors per request is well within Pinecone limits).
    for start in range(0, len(vectors), 100):
        index.upsert(vectors[start:start + 100])
126
+
127
# Function to retrieve relevant syllabus content
def retrieve_relevant_content(query):
    """Return the top-5 most similar indexed syllabus sentences, newline-joined.

    Returns "" on any failure so callers degrade gracefully.
    """
    try:
        # encode([query]) returns a 2-D array; Pinecone expects a flat vector,
        # so take row 0. The previous code sent the nested list, which the
        # query API rejects (so every retrieval silently returned "").
        query_embedding = embedder.encode([query])[0]
        results = index.query(vector=query_embedding.tolist(), top_k=5, include_metadata=True)
        return "\n".join(match["metadata"]["text"] for match in results["matches"])
    except Exception as e:
        logging.error(f"Error retrieving content: {e}")
        return ""
137
+
138
# Function to generate questions
def generate_questions(question_type, subject_name, syllabus_context, num_questions, difficulty_level):
    """Generate *num_questions* exam questions of *question_type* from the syllabus.

    difficulty_level maps Bloom's-taxonomy level names ("Remember", ... "Create")
    to requested counts. Returns the LLM output string, or "" on failure.

    All dynamic values are passed as template variables rather than being
    f-string-interpolated into the template text: literal braces inside the
    syllabus content would otherwise be parsed by ChatPromptTemplate as
    placeholders and raise a KeyError at invoke time.
    """
    prompt_template = """
Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.

Subject: {subject_name}
Syllabus Content: {syllabus_context}

Difficulty Levels:
- Remember: {remember}
- Understand: {understand}
- Apply: {apply}
- Analyze: {analyze}
- Evaluate: {evaluate}
- Create: {create}

Format questions as follows:
Q1. ________________

Q2. ________________

...
"""
    try:
        chain = ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser()
        return chain.invoke({
            "num_questions": num_questions,
            "question_type": question_type,
            "subject_name": subject_name,
            "syllabus_context": syllabus_context,
            "remember": difficulty_level.get('Remember', 0),
            "understand": difficulty_level.get('Understand', 0),
            "apply": difficulty_level.get('Apply', 0),
            "analyze": difficulty_level.get('Analyze', 0),
            "evaluate": difficulty_level.get('Evaluate', 0),
            "create": difficulty_level.get('Create', 0),
        })
    except Exception as e:
        logging.error(f"Error generating {question_type} questions: {e}")
        return ""
167
+
168
# Function to generate answers
def generate_answers(questions, syllabus_context):
    """Generate detailed answers for previously generated questions.

    Answers are grounded in *syllabus_context*. Returns the LLM output
    string, or "" on failure. Questions and syllabus text are passed as
    template variables so literal braces in either cannot be misparsed by
    ChatPromptTemplate as prompt placeholders (the previous f-string
    interpolation raised KeyError on such input).
    """
    prompt = """
Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.

Syllabus Content: {syllabus_context}

Questions:
{questions}

Format answers as follows:
Answer 1: ________________
Answer 2: ________________
...
"""
    try:
        chain = ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser()
        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
    except Exception as e:
        logging.error(f"Error generating answers: {e}")
        return ""
189
+
190
# ---------------- Streamlit app ----------------
st.title("Bloom's Taxonomy Based Exam Paper Developer")

# Sidebar inputs for paper metadata.
instructor_name = st.sidebar.text_input("Instructor")
class_name = st.sidebar.text_input("Class")
institution_name = st.sidebar.text_input("Institution")
subject_name = st.sidebar.text_input("Subject")

# Syllabus upload.
uploaded_file = st.sidebar.file_uploader("Upload Syllabus (PDF, DOCX, TXT, Image)", type=["pdf", "docx", "txt", "png", "jpg"])
syllabus_text = None
if uploaded_file:
    # Derive the type from the filename extension. The previous code used the
    # MIME subtype (uploaded_file.type.split("/")[1]), which for DOCX is
    # "vnd.openxmlformats-officedocument.wordprocessingml.document" and for
    # JPG is "jpeg" — so DOCX uploads were silently ignored.
    file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()
    st.sidebar.markdown("✅ Syllabus uploaded")
    syllabus_text = process_content(uploaded_file, file_type)
    add_syllabus_to_index(syllabus_text)

# Preview of the extracted syllabus text (first 1000 characters).
if syllabus_text:
    st.subheader("Syllabus Preview:")
    st.text_area("Extracted Content", syllabus_text[:1000], height=300)

# Question type and Bloom's-level difficulty selection.
question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)

# Instructor feedback (collected but currently unused downstream).
feedback = st.sidebar.text_area("Instructor Feedback (Optional)")

# Generate questions from retrieved syllabus context.
if st.sidebar.button("Generate Questions"):
    if syllabus_text:
        with st.spinner(f"Generating {question_type}..."):
            syllabus_context = retrieve_relevant_content(f"Generate {question_type} based on syllabus")
            st.session_state.generated_questions = generate_questions(question_type, subject_name, syllabus_context, num_questions, difficulty)
            st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
    else:
        st.error("Please upload a syllabus before generating questions.")

# Generate answers for previously generated questions.
if st.sidebar.button("Generate Answers for Questions"):
    if "generated_questions" in st.session_state and st.session_state.generated_questions:
        with st.spinner("Generating answers..."):
            syllabus_context = retrieve_relevant_content("Generate answers from syllabus")
            st.session_state.generated_answers = generate_answers(st.session_state.generated_questions, syllabus_context)
            st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
    else:
        st.error("Generate questions first before generating answers.")

# Download buttons appear only once content exists in session state.
if "generated_questions" in st.session_state and st.session_state.generated_questions:
    st.sidebar.download_button(
        label="Download Questions",
        data=st.session_state.generated_questions,
        file_name=f"{subject_name}_questions.txt",
        mime="text/plain",
    )

if "generated_answers" in st.session_state and st.session_state.generated_answers:
    st.sidebar.download_button(
        label="Download Answers",
        data=st.session_state.generated_answers,
        file_name=f"{subject_name}_answers.txt",
        mime="text/plain",
    )

# Application footer.
st.markdown("""
---
**Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
""")