ahm14 commited on
Commit
d947799
·
verified ·
1 Parent(s): e36e1ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -194
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import streamlit as st
2
  from langchain_groq import ChatGroq
3
  from langchain_core.output_parsers import StrOutputParser
@@ -11,6 +12,8 @@ from io import BytesIO
11
  import logging
12
  from concurrent.futures import ThreadPoolExecutor
13
  from streamlit.runtime.caching import cache_data
 
 
14
 
15
  # Load environment variables
16
  load_dotenv()
@@ -21,10 +24,10 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
21
  # Initialize LLM
22
  llm = ChatGroq(temperature=0.5, groq_api_key="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", model_name="llama3-8b-8192")
23
 
24
- # OCR Configuration for Pytesseract
25
- pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # Adjust to your system's path
26
 
27
- # Enhanced OCR with configurable language option
28
  def extract_text_from_images(images, lang="eng"):
29
  ocr_text = ""
30
  for image in images:
@@ -34,35 +37,23 @@ def extract_text_from_images(images, lang="eng"):
34
  logging.error(f"Error in OCR: {e}")
35
  return ocr_text.strip()
36
 
37
- # Function to extract data from PDF with parallelization
38
  @cache_data
39
- def extract_pdf_data(pdf_path):
40
- data = {"text": "", "tables": [], "images": []}
41
  try:
42
- with pdfplumber.open(pdf_path) as pdf:
43
- def process_page(page):
44
- page_data = {"text": page.extract_text() or "", "tables": [], "images": []}
45
- tables = page.extract_tables()
46
- for table in tables:
47
- page_data["tables"].append(table)
48
- for image in page.images:
49
- base_image = pdf.extract_image(image["object_number"])
50
- image_obj = Image.open(BytesIO(base_image["image"]))
51
- page_data["images"].append(image_obj)
52
- return page_data
53
-
54
- with ThreadPoolExecutor() as executor:
55
- pages_data = list(executor.map(process_page, pdf.pages))
56
-
57
- for page_data in pages_data:
58
- data["text"] += page_data["text"]
59
- data["tables"].extend(page_data["tables"])
60
- data["images"].extend(page_data["images"])
61
  except Exception as e:
62
  logging.error(f"Error processing PDF: {e}")
63
  return data
64
 
65
- # Function to extract text from DOCX files
66
  @cache_data
67
  def extract_docx_data(docx_file):
68
  try:
@@ -70,65 +61,49 @@ def extract_docx_data(docx_file):
70
  text = "\n".join([para.text.strip() for para in doc.paragraphs if para.text.strip()])
71
  return text
72
  except Exception as e:
73
- logging.error(f"Error extracting DOCX content: {e}")
74
  return ""
75
 
76
- # Function to extract text from plain text files
77
  @cache_data
78
- def extract_text_file_data(text_file):
79
  try:
80
- return text_file.read().decode("utf-8").strip()
81
  except Exception as e:
82
- logging.error(f"Error extracting TXT content: {e}")
83
  return ""
84
 
85
- # Function to process extracted content (PDF, DOCX, etc.)
86
- def process_content(file_data, file_type, lang="eng"):
87
- text = ""
88
  images = []
89
- if file_type == "pdf":
90
- pdf_data = extract_pdf_data(file_data)
91
- text = process_pdf_content(pdf_data)
92
- images = pdf_data["images"]
93
- elif file_type == "docx":
94
- text = extract_docx_data(file_data)
95
- elif file_type == "txt":
96
- text = extract_text_file_data(file_data)
97
- elif file_type in ["png", "jpg", "jpeg"]:
98
- image = Image.open(file_data)
99
- images.append(image)
100
-
101
  ocr_text = extract_text_from_images(images, lang)
102
- return text + "\n" + ocr_text
103
 
104
- # Function to process PDF content
105
- def process_pdf_content(pdf_data):
106
- ocr_text = extract_text_from_images(pdf_data["images"])
107
- combined_text = pdf_data["text"] + ocr_text
108
-
109
- table_text = ""
110
- for table in pdf_data["tables"]:
111
- table_rows = [" | ".join(str(cell) if cell else "" for cell in row) for row in table]
112
- table_text += "\n".join(table_rows) + "\n"
113
-
114
- return (combined_text + "\n" + table_text).strip()
115
-
116
- # Function to generate questions
117
- def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
118
  prompt_template = f"""
119
- Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
120
- Subject: {subject_name}
121
- Instructor: {instructor}
122
- Class: {class_name}
123
- Institution: {institution}
124
- Syllabus Content: {syllabus_context}
125
  Difficulty Levels:
126
- - Remember: {difficulty_level.get('Remember', 0)}
127
- - Understand: {difficulty_level.get('Understand', 0)}
128
- - Apply: {difficulty_level.get('Apply', 0)}
129
- - Analyze: {difficulty_level.get('Analyze', 0)}
130
- - Evaluate: {difficulty_level.get('Evaluate', 0)}
131
- - Create: {difficulty_level.get('Create', 0)}
132
  Format questions as follows:
133
  Q1. ________________
134
  Q2. ________________
@@ -138,131 +113,87 @@ def generate_questions(question_type, subject_name, instructor, class_name, inst
138
  try:
139
  return chain.invoke({})
140
  except Exception as e:
141
- logging.error(f"Error generating {question_type} questions: {e}")
142
  return ""
143
 
144
- # Function to generate answers
145
- def generate_answers(questions, syllabus_context):
146
- prompt = f"""
147
- Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.
148
- Syllabus Content: {syllabus_context}
149
- Questions:
150
- {questions}
151
- Format answers as follows:
152
- Answer 1: ________________
153
- Answer 2: ________________
154
- ...
155
- """
156
- chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
157
  try:
158
- return chain.invoke({})
 
 
 
159
  except Exception as e:
160
- logging.error(f"Error generating answers: {e}")
161
- return ""
162
-
163
- # Streamlit app
164
- st.title("Bloom's Taxonomy Based Exam Paper Developer")
165
-
166
- # Sidebar Clear Data Button
167
- if st.sidebar.button("Clear All Data"):
168
- st.session_state.clear()
169
- st.success("All data has been cleared. You can now upload a new syllabus.")
170
-
171
- # File Upload with Image Support
172
- uploaded_file = st.sidebar.file_uploader(
173
- "Upload Syllabus (PDF, DOCX, TXT, Image)",
174
- type=["pdf", "docx", "txt", "png", "jpg", "jpeg"]
175
- )
176
-
177
- # Sidebar Inputs for Subject Name, Instructor, Class, and Institution
178
- subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
179
- instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
180
- class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
181
- institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
182
-
183
- # Language Option for OCR
184
- ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
185
-
186
- if uploaded_file:
187
- # Clear session state when a new file is uploaded
188
- if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
189
- st.session_state.clear()
190
- st.success("Previous data cleared. Processing new file...")
191
-
192
- st.session_state.uploaded_filename = uploaded_file.name
193
- file_type = uploaded_file.type.split("/")[-1]
194
-
195
- # Validate file type
196
- if file_type not in ["pdf", "docx", "txt", "png", "jpg", "jpeg"]:
197
- st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
198
- else:
199
- syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
200
- st.session_state.syllabus_text = syllabus_text
201
-
202
- # Preview of Syllabus
203
- if "syllabus_text" in st.session_state:
204
- st.subheader("Syllabus Preview:")
205
- st.text_area("Extracted Content", st.session_state.syllabus_text[:1000], height=300)
206
- else:
207
- st.warning("Please upload a syllabus to begin.")
208
-
209
- # Question Type Selection
210
- question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
211
- difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
212
- difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
213
- num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)
214
-
215
- if st.sidebar.button("Generate Questions"):
216
  if "syllabus_text" in st.session_state:
217
- with st.spinner(f"Generating {question_type}..."):
218
- syllabus_context = st.session_state.syllabus_text
219
- st.session_state.generated_questions = generate_questions(
220
- question_type,
221
- subject_name,
222
- instructor_name,
223
- class_name,
224
- institution_name,
225
- syllabus_context,
226
- num_questions,
227
- difficulty,
228
- )
229
- st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
230
  else:
231
- st.error("Please upload a syllabus before generating questions.")
232
-
233
- # Button to generate answers for questions
234
- if st.sidebar.button("Generate Answers for Questions"):
235
- if "generated_questions" in st.session_state:
236
- with st.spinner("Generating answers..."):
237
- syllabus_context = st.session_state.syllabus_text
238
- st.session_state.generated_answers = generate_answers(
239
- st.session_state.generated_questions, syllabus_context
240
- )
241
- st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
 
 
 
 
 
 
 
 
 
 
 
242
  else:
243
- st.error("Generate questions first before generating answers.")
244
-
245
- # Download buttons for questions and answers
246
- if "generated_questions" in st.session_state:
247
- st.sidebar.download_button(
248
- label="Download Questions",
249
- data=st.session_state.generated_questions,
250
- file_name=f"{subject_name}_questions.txt",
251
- mime="text/plain",
252
- )
253
-
254
- if "generated_answers" in st.session_state:
255
- st.sidebar.download_button(
256
- label="Download Answers",
257
- data=st.session_state.generated_answers,
258
- file_name=f"{subject_name}_answers.txt",
259
- mime="text/plain",
260
- )
261
-
262
- # Enhanced footer for branding and information
263
- st.markdown("""
264
- ---
265
- **Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
266
-
267
- Built with ♥ to make exam preparation seamless.
268
- """)
 
1
+
2
  import streamlit as st
3
  from langchain_groq import ChatGroq
4
  from langchain_core.output_parsers import StrOutputParser
 
12
  import logging
13
  from concurrent.futures import ThreadPoolExecutor
14
  from streamlit.runtime.caching import cache_data
15
+ import requests
16
+ from bs4 import BeautifulSoup
17
 
18
  # Load environment variables
19
  load_dotenv()
 
24
# Initialize LLM
# SECURITY FIX: the Groq API key was hard-coded in the committed source.
# Read it from the environment instead — load_dotenv() above already pulls
# a local .env file into os.environ. (The leaked key must also be revoked.)
import os

llm = ChatGroq(
    temperature=0.5,
    groq_api_key=os.getenv("GROQ_API_KEY"),
    model_name="llama3-8b-8192",
)
26
 
27
+ # OCR Configuration
28
+ pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # Adjust based on your system's path
29
 
30
+ # OCR Extraction from Images
31
  def extract_text_from_images(images, lang="eng"):
32
  ocr_text = ""
33
  for image in images:
 
37
  logging.error(f"Error in OCR: {e}")
38
  return ocr_text.strip()
39
 
40
# Extract content from PDFs
@cache_data
def extract_pdf_data(pdf_file):
    """Extract page text and embedded images from a PDF.

    Args:
        pdf_file: A path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        dict: ``{"text": str, "images": list[PIL.Image.Image]}``. On a
        top-level failure both fields are returned empty (and the error
        is logged) rather than raising.
    """
    data = {"text": "", "images": []}
    try:
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                # extract_text() returns None for image-only pages.
                data["text"] += page.extract_text() or ""
                for img in page.images:
                    # BUG FIX: pdfplumber has no `pdf.extract_image()` API
                    # (that is PyMuPDF's); the old call raised AttributeError
                    # on the first image and aborted the whole extraction.
                    # Decode the raw image stream from the page metadata
                    # instead, and skip images that cannot be decoded so
                    # text extraction still succeeds.
                    try:
                        image = Image.open(BytesIO(img["stream"].get_data()))
                        data["images"].append(image)
                    except Exception as img_err:
                        logging.warning(f"Skipping undecodable PDF image: {img_err}")
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    return data
55
 
56
+ # Extract content from DOCX
57
  @cache_data
58
  def extract_docx_data(docx_file):
59
  try:
 
61
  text = "\n".join([para.text.strip() for para in doc.paragraphs if para.text.strip()])
62
  return text
63
  except Exception as e:
64
+ logging.error(f"Error processing DOCX: {e}")
65
  return ""
66
 
67
# Extract plain text files
@cache_data
def extract_txt_data(txt_file):
    """Read an uploaded plain-text file and return its UTF-8 content, stripped.

    Returns "" (and logs the error) if the file cannot be read or decoded.
    """
    try:
        raw = txt_file.read()
        return raw.decode("utf-8").strip()
    except Exception as e:
        logging.error(f"Error processing TXT: {e}")
        return ""
75
 
76
# Process uploaded files
def process_files(uploaded_files, lang="eng"):
    """Dispatch each uploaded file to the matching extractor and OCR any images.

    Args:
        uploaded_files: Streamlit UploadedFile objects.
        lang: Tesseract language code passed to the OCR step.

    Returns:
        str: all extracted document text, a newline, then the OCR text of
        every collected image (standalone images and images found in PDFs).
    """
    text_parts = []
    images = []
    text_extractors = {"docx": extract_docx_data, "txt": extract_txt_data}
    for file in uploaded_files:
        # MIME subtype, e.g. "application/pdf" -> "pdf".
        file_type = file.type.split("/")[-1]
        if file_type == "pdf":
            pdf_data = extract_pdf_data(file)
            text_parts.append(pdf_data["text"])
            images.extend(pdf_data["images"])
        elif file_type in text_extractors:
            text_parts.append(text_extractors[file_type](file))
        elif file_type in ["png", "jpg", "jpeg"]:
            images.append(Image.open(file))
    ocr_text = extract_text_from_images(images, lang)
    return "".join(text_parts) + "\n" + ocr_text
94
 
95
+ # Generate questions
96
+ def generate_questions(question_type, syllabus_text, num_questions, difficulty):
 
 
 
 
 
 
 
 
 
 
 
 
97
  prompt_template = f"""
98
+ Generate {num_questions} {question_type} questions from the syllabus content provided below.
99
+ Syllabus Content: {syllabus_text}
 
 
 
 
100
  Difficulty Levels:
101
+ - Remember: {difficulty.get('Remember', 0)}
102
+ - Understand: {difficulty.get('Understand', 0)}
103
+ - Apply: {difficulty.get('Apply', 0)}
104
+ - Analyze: {difficulty.get('Analyze', 0)}
105
+ - Evaluate: {difficulty.get('Evaluate', 0)}
106
+ - Create: {difficulty.get('Create', 0)}
107
  Format questions as follows:
108
  Q1. ________________
109
  Q2. ________________
 
113
  try:
114
  return chain.invoke({})
115
  except Exception as e:
116
+ logging.error(f"Error generating questions: {e}")
117
  return ""
118
 
119
# Internet search for answers
def search_answers_online(question):
    """Best-effort web lookup for *question* by scraping Google results.

    Args:
        question: Free-form question text; URL-encoded automatically.

    Returns:
        str: up to three result snippets joined by newlines, or
        "No online answer found." when the request or parsing fails.

    NOTE(review): scraping Google's HTML is brittle (the "BNeawe" class
    name changes without notice) and may violate Google's ToS; a real
    search API would be more reliable.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # BUG FIX: the query is now passed via `params` so requests
        # URL-encodes it (raw questions containing spaces/&/# produced
        # malformed URLs), and a timeout stops the Streamlit app from
        # hanging forever on a stalled connection.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": question},
            headers=headers,
            timeout=10,
        )
        soup = BeautifulSoup(response.text, "html.parser")
        snippets = soup.find_all("div", class_="BNeawe")
        return "\n".join([snippet.get_text() for snippet in snippets[:3]])
    except Exception as e:
        logging.error(f"Error fetching online answers: {e}")
        return "No online answer found."
131
+
132
# Generate answers
def generate_answers(questions, syllabus_text):
    """Generate a detailed answer for every non-blank question line.

    Each answer is produced by the LLM from the syllabus content; if the
    LLM call fails for a question, an online search is used as a fallback
    for that question only.

    Args:
        questions: Newline-separated question text (LLM output).
        syllabus_text: The extracted syllabus content to answer from.

    Returns:
        str: "Answer N: ..." lines joined by newlines, numbered
        consecutively from 1.
    """
    # BUG FIX: literal '{'/'}' in user content was parsed as template
    # variables by ChatPromptTemplate.from_template, making invoke({})
    # raise for every question; escape braces before interpolation.
    safe_syllabus = syllabus_text.replace("{", "{{").replace("}", "}}")
    answers = {}
    count = 0
    for line in questions.split("\n"):
        question = line.strip()
        if not question:
            continue
        # BUG FIX: numbering previously used the raw line index, so blank
        # lines in the LLM output produced gaps (Answer 1, Answer 3, ...);
        # a dedicated counter keeps the numbering consecutive.
        count += 1
        safe_question = question.replace("{", "{{").replace("}", "}}")
        prompt = f"""
        Based on the provided syllabus content, generate a detailed answer for the following question:
        Syllabus Content: {safe_syllabus}
        Question: {safe_question}
        """
        chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
        try:
            answers[f"Answer {count}"] = chain.invoke({})
        except Exception:
            # Fall back to online search if LLM fails
            answers[f"Answer {count}"] = search_answers_online(question)
    return "\n".join([f"{k}: {v}" for k, v in answers.items()])
149
+
150
# Streamlit UI
st.title("AI-Powered Exam Generator")

# Tabs for navigation
tab1, tab2, tab3, tab4 = st.tabs(["📁 Upload Files", "📄 Preview Content", "📝 Generate Questions", "💡 Generate Answers"])

# Upload files: extract text/images and stash the result in session state
# so the other tabs can use it across reruns.
with tab1:
    st.header("Upload Files")
    uploaded_files = st.file_uploader(
        "Upload your syllabus (PDF, DOCX, TXT, Images)",
        type=["pdf", "docx", "txt", "png", "jpg", "jpeg"],
        accept_multiple_files=True
    )
    ocr_lang = st.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
    if uploaded_files:
        syllabus_text = process_files(uploaded_files, lang=ocr_lang)
        st.session_state["syllabus_text"] = syllabus_text
        st.success("Files processed successfully!")

# Preview content
with tab2:
    st.header("Preview Syllabus Content")
    if "syllabus_text" in st.session_state:
        st.text_area("Extracted Content", st.session_state["syllabus_text"], height=300)
    else:
        st.warning("No content available. Upload files first.")

# Generate questions
with tab3:
    st.header("Generate Questions")
    question_type = st.selectbox("Select Question Type", ["MCQs", "Short Questions", "Long Questions"])
    num_questions = st.slider("Number of Questions", 1, 20, 5)
    difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
    difficulty = {level: st.slider(level, 0, 5, 1) for level in difficulty_levels}
    if st.button("Generate Questions"):
        # BUG FIX: questions were previously generated even when no
        # syllabus had been uploaded, silently prompting the LLM with an
        # empty string; guard and warn the user instead.
        if "syllabus_text" in st.session_state:
            questions = generate_questions(question_type, st.session_state["syllabus_text"], num_questions, difficulty)
            st.session_state["questions"] = questions
            st.text_area("Generated Questions", questions, height=300)
        else:
            st.warning("No content available. Upload files first.")

# Generate answers
with tab4:
    st.header("Generate Answers")
    if "questions" in st.session_state:
        if st.button("Generate Answers"):
            answers = generate_answers(st.session_state["questions"], st.session_state.get("syllabus_text", ""))
            st.session_state["answers"] = answers
            st.text_area("Generated Answers", answers, height=300)
    else:
        st.warning("No questions available. Generate questions first.")