ahm14 committed on
Commit
4973d3f
·
verified ·
1 Parent(s): 2e7fff8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +268 -0
app.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain_groq import ChatGroq
3
+ from langchain_core.output_parsers import StrOutputParser
4
+ from langchain_core.prompts import ChatPromptTemplate
5
+ from dotenv import load_dotenv
6
+ import pytesseract
7
+ from PIL import Image
8
+ import pdfplumber
9
+ import docx
10
+ from io import BytesIO
11
+ import logging
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ from streamlit.runtime.caching import cache_data
14
+
15
import os

# Load environment variables (e.g. GROQ_API_KEY, TESSERACT_CMD) from a local
# .env file when one is present.
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Initialize LLM.
# SECURITY: the API key is read from the environment and must never be
# committed to source control. The key that used to be hard-coded on this line
# is public now and should be revoked in the Groq console.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail with a readable message instead of an opaque client error later on.
    st.error("GROQ_API_KEY is not set. Add it to your environment or .env file and reload the app.")
    st.stop()

llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")

# OCR Configuration for Pytesseract.
# TESSERACT_CMD overrides the binary location; the default matches the
# previously hard-coded Linux path.
pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
26
+
27
# Enhanced OCR with configurable language option
def extract_text_from_images(images, lang="eng"):
    """Run Tesseract OCR over each image and return the combined text.

    Args:
        images: Iterable of images to pass to ``pytesseract.image_to_string``
            (the callers in this file pass PIL images).
        lang: Tesseract language code forwarded to pytesseract, e.g. ``"eng"``.

    Returns:
        The stripped text of every image that was read successfully, joined
        with newlines. Images that fail OCR are logged and skipped, so the
        result is ``""`` when nothing could be read.
    """
    texts = []
    for image in images:
        try:
            texts.append(pytesseract.image_to_string(image, lang=lang).strip())
        except Exception as e:
            # Best-effort: one unreadable image must not discard the others.
            logging.error(f"Error in OCR: {e}")
    return "\n".join(texts).strip()
36
+
37
# Helper: decode the images embedded in a single pdfplumber page
def _load_page_images(page):
    """Return the embedded images of *page* that Pillow is able to decode.

    pdfplumber lists embedded images in ``page.images``; each entry carries the
    underlying pdfminer stream under the ``"stream"`` key. Only streams whose
    bytes form a self-contained image file (typically JPEG) can be opened by
    Pillow; anything else is logged and skipped.
    """
    images = []
    for image in page.images:
        try:
            image_obj = Image.open(BytesIO(image["stream"].get_data()))
            # Force decoding now so a corrupt image fails here, not during OCR.
            image_obj.load()
            images.append(image_obj)
        except Exception as e:
            logging.warning(f"Skipping undecodable PDF image: {e}")
    return images


# Function to extract text, tables and embedded images from a PDF
@cache_data
def extract_pdf_data(pdf_path):
    """Extract text, tables and embedded images from a PDF.

    Args:
        pdf_path: Path or binary file-like object accepted by
            ``pdfplumber.open`` (the app passes the uploaded file).

    Returns:
        A dict with the keys ``"text"`` (page texts joined by newlines),
        ``"tables"`` (every table returned by ``page.extract_tables()``) and
        ``"images"`` (decoded PIL images). If processing fails part-way, the
        error is logged and whatever was collected so far is returned.
    """
    data = {"text": "", "tables": [], "images": []}
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Pages are handled sequentially: they all read from one shared
            # file handle and the work is CPU-bound, so a thread pool would add
            # risk without making extraction faster.
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
                data["tables"].extend(page.extract_tables())
                data["images"].extend(_load_page_images(page))
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    # Separate pages with a newline so the last word of one page is not fused
    # with the first word of the next.
    data["text"] = "\n".join(text for text in page_texts if text)
    return data
64
+
65
# Function to extract text from DOCX files
@cache_data
def extract_docx_data(docx_file):
    """Return the paragraph text of a DOCX document.

    Args:
        docx_file: Path or file-like object passed to ``docx.Document``.

    Returns:
        The non-empty paragraphs, each stripped and joined with newlines, or
        ``""`` if the document cannot be read (the error is logged).
    """
    try:
        doc = docx.Document(docx_file)
        paragraphs = (para.text.strip() for para in doc.paragraphs)
        return "\n".join(text for text in paragraphs if text)
    except Exception as e:
        logging.error(f"Error extracting DOCX content: {e}")
        return ""
75
+
76
# Function to extract text from plain text files
@cache_data
def extract_text_file_data(text_file):
    """Return the decoded, stripped content of a UTF-8 text file.

    Args:
        text_file: Binary file-like object whose ``read()`` returns bytes.

    Returns:
        The file content as a string, or ``""`` if it cannot be read or
        decoded (the error is logged).
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading
        # byte-order mark, which str.strip() would otherwise leave in the text.
        return text_file.read().decode("utf-8-sig").strip()
    except Exception as e:
        logging.error(f"Error extracting TXT content: {e}")
        return ""
84
+
85
# Function to process extracted content (PDF, DOCX, etc.)
def process_content(file_data, file_type, lang="eng"):
    """Extract all text from an uploaded file, running OCR where needed.

    Args:
        file_data: The uploaded file (path or file-like object).
        file_type: Lower-case type tag: ``"pdf"``, ``"docx"``, ``"txt"``,
            ``"png"``, ``"jpg"`` or ``"jpeg"``. Any other value yields ``""``.
        lang: Tesseract language code used for OCR.

    Returns:
        The extracted text followed by any OCR text, separated by a newline
        and stripped of surrounding whitespace.
    """
    if file_type == "pdf":
        # process_pdf_content already OCRs the embedded images; running OCR on
        # them again here would duplicate that text in the result.
        return process_pdf_content(extract_pdf_data(file_data), lang)

    text = ""
    images = []
    if file_type == "docx":
        text = extract_docx_data(file_data)
    elif file_type == "txt":
        text = extract_text_file_data(file_data)
    elif file_type in ("png", "jpg", "jpeg"):
        try:
            images.append(Image.open(file_data))
        except Exception as e:
            # Same best-effort policy as the other extractors: log, return "".
            logging.error(f"Error opening image: {e}")

    ocr_text = extract_text_from_images(images, lang) if images else ""
    return "\n".join(part for part in (text, ocr_text) if part).strip()


# Function to process PDF content
def process_pdf_content(pdf_data, lang="eng"):
    """Flatten extracted PDF data into one text blob.

    Args:
        pdf_data: Dict with the keys ``"text"``, ``"tables"`` and ``"images"``
            as produced by ``extract_pdf_data``.
        lang: Tesseract language code used to OCR the embedded images.

    Returns:
        Page text, OCR text and table rows (cells joined by ``" | "``), each
        section separated by a newline, stripped of surrounding whitespace.
    """
    images = pdf_data["images"]
    ocr_text = extract_text_from_images(images, lang) if images else ""

    # Empty/None cells become empty strings so the column positions survive.
    table_lines = [
        " | ".join(str(cell) if cell else "" for cell in row)
        for table in pdf_data["tables"]
        for row in table
    ]

    sections = (pdf_data["text"], ocr_text, "\n".join(table_lines))
    return "\n".join(section for section in sections if section).strip()
115
+
116
# Bloom's taxonomy levels, in the order they are listed in the prompt.
_BLOOM_LEVELS = ("Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create")

# Static prompt template. Values are supplied when the chain is invoked, so
# braces inside user-provided text are never parsed as template variables.
_QUESTION_PROMPT_TEMPLATE = """
Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
Subject: {subject_name}
Instructor: {instructor}
Class: {class_name}
Institution: {institution}
Syllabus Content: {syllabus_context}
Difficulty Levels:
- Remember: {remember}
- Understand: {understand}
- Apply: {apply}
- Analyze: {analyze}
- Evaluate: {evaluate}
- Create: {create}
Format questions as follows:
Q1. ________________
Q2. ________________
...
"""


# Function to generate questions
def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
    """Ask the LLM for exam questions derived from the syllabus.

    Args:
        question_type: Kind of question to generate, e.g. ``"MCQs"``.
        subject_name: Subject shown in the prompt.
        instructor: Instructor name shown in the prompt.
        class_name: Class name shown in the prompt.
        institution: Institution name shown in the prompt.
        syllabus_context: Extracted syllabus text the questions must be based on.
        num_questions: Number of questions to request.
        difficulty_level: Mapping of Bloom level name to its slider value;
            missing levels default to 0.

    Returns:
        The model's response text, or ``""`` if the request fails (the error
        is logged).
    """
    levels = difficulty_level or {}
    inputs = {
        "num_questions": num_questions,
        "question_type": question_type,
        "subject_name": subject_name,
        "instructor": instructor,
        "class_name": class_name,
        "institution": institution,
        "syllabus_context": syllabus_context,
    }
    inputs.update({level.lower(): levels.get(level, 0) for level in _BLOOM_LEVELS})

    chain = ChatPromptTemplate.from_template(_QUESTION_PROMPT_TEMPLATE) | llm | StrOutputParser()
    try:
        return chain.invoke(inputs)
    except Exception as e:
        logging.error(f"Error generating {question_type} questions: {e}")
        return ""
143
+
144
# Static prompt template. The syllabus and the questions are supplied when the
# chain is invoked, so braces inside them are never parsed as template
# variables.
_ANSWER_PROMPT_TEMPLATE = """
Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.
Syllabus Content: {syllabus_context}
Questions:
{questions}
Format answers as follows:
Answer 1: ________________
Answer 2: ________________
...
"""


# Function to generate answers
def generate_answers(questions, syllabus_context):
    """Ask the LLM to answer previously generated questions.

    Args:
        questions: The generated questions, as one block of text.
        syllabus_context: Extracted syllabus text the answers must be based on.

    Returns:
        The model's response text, or ``""`` if the request fails (the error
        is logged).
    """
    chain = ChatPromptTemplate.from_template(_ANSWER_PROMPT_TEMPLATE) | llm | StrOutputParser()
    try:
        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
    except Exception as e:
        logging.error(f"Error generating answers: {e}")
        return ""
162
+
163
# Streamlit app
st.title("Bloom's Taxonomy Based Exam Paper Developer")

# Sidebar Clear Data Button
if st.sidebar.button("Clear All Data"):
    st.session_state.clear()
    st.success("All data has been cleared. You can now upload a new syllabus.")

# File Upload with Image Support
SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt", "png", "jpg", "jpeg"]
uploaded_file = st.sidebar.file_uploader(
    "Upload Syllabus (PDF, DOCX, TXT, Image)",
    type=SUPPORTED_FILE_TYPES,
)

# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")

# Language Option for OCR
ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])

if uploaded_file:
    # Clear session state when a different file is uploaded
    previous_filename = st.session_state.get("uploaded_filename")
    if previous_filename is not None and previous_filename != uploaded_file.name:
        st.session_state.clear()
        st.success("Previous data cleared. Processing new file...")

    st.session_state.uploaded_filename = uploaded_file.name

    # Derive the type from the file extension. The MIME subtype is "plain" for
    # .txt and "vnd.openxmlformats-officedocument.wordprocessingml.document"
    # for .docx, so it never matches the supported type tags for those files.
    file_type = uploaded_file.name.rpartition(".")[2].lower()

    # Validate file type
    if file_type not in SUPPORTED_FILE_TYPES:
        st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
    else:
        # The script reruns on every widget interaction; only re-extract (and
        # re-OCR) when the file or the OCR language actually changed.
        source_key = (uploaded_file.name, uploaded_file.size, ocr_lang)
        if st.session_state.get("syllabus_source") != source_key:
            st.session_state.syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
            st.session_state.syllabus_source = source_key

# Preview of Syllabus (first 1000 characters of the extracted text)
if "syllabus_text" in st.session_state:
    st.subheader("Syllabus Preview:")
    st.text_area("Extracted Content", st.session_state.syllabus_text[:1000], height=300)
else:
    st.warning("Please upload a syllabus to begin.")
208
+
209
# Question Type Selection
question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
# One 0-5 slider per Bloom level; the values are forwarded to the prompt.
difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)

if st.sidebar.button("Generate Questions"):
    if "syllabus_text" in st.session_state:
        with st.spinner(f"Generating {question_type}..."):
            questions = generate_questions(
                question_type,
                subject_name,
                instructor_name,
                class_name,
                institution_name,
                st.session_state.syllabus_text,
                num_questions,
                difficulty,
            )
        if questions:
            st.session_state.generated_questions = questions
            st.session_state.generated_question_type = question_type
            # Answers produced for the previous question set are now stale.
            st.session_state.pop("generated_answers", None)
        else:
            # generate_questions returns "" on failure; do not store that as a result.
            st.error("Question generation failed. Check the logs and try again.")
    else:
        st.error("Please upload a syllabus before generating questions.")

# Button to generate answers for questions
if st.sidebar.button("Generate Answers for Questions"):
    if "generated_questions" in st.session_state:
        with st.spinner("Generating answers..."):
            answers = generate_answers(
                st.session_state.generated_questions,
                st.session_state.get("syllabus_text", ""),
            )
        if answers:
            st.session_state.generated_answers = answers
        else:
            st.error("Answer generation failed. Check the logs and try again.")
    else:
        st.error("Generate questions first before generating answers.")

# Render stored results on every run. A button is only True during the rerun
# its click triggers, so output drawn inside the branches above would vanish
# on the next interaction (including a click on a download button).
if "generated_questions" in st.session_state:
    shown_type = st.session_state.get("generated_question_type", question_type)
    st.text_area(f"Generated {shown_type}", value=st.session_state.generated_questions, height=400)

if "generated_answers" in st.session_state:
    st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)

# Download buttons for questions and answers
if "generated_questions" in st.session_state:
    st.sidebar.download_button(
        label="Download Questions",
        data=st.session_state.generated_questions,
        file_name=f"{subject_name}_questions.txt",
        mime="text/plain",
    )

if "generated_answers" in st.session_state:
    st.sidebar.download_button(
        label="Download Answers",
        data=st.session_state.generated_answers,
        file_name=f"{subject_name}_answers.txt",
        mime="text/plain",
    )
261
+
262
# Footer with branding and information. The credits name the components this
# app actually uses (LangChain for prompting, Groq for the LLM, Streamlit for
# the UI).
st.markdown("""
---
**Advanced Test Paper Generator** - powered by LangChain, Groq, and Streamlit.

Built with ♥ to make exam preparation seamless.
""")