ahm14 committed on
Commit
7337e1e
·
verified ·
1 Parent(s): efbebca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -93
app.py CHANGED
@@ -3,49 +3,24 @@ from langchain_groq import ChatGroq
3
  from langchain_core.output_parsers import StrOutputParser
4
  from langchain_core.prompts import ChatPromptTemplate
5
  from dotenv import load_dotenv
6
- import os
7
  import pytesseract
8
  from PIL import Image
9
  import pdfplumber
10
  import docx
11
  from io import BytesIO
12
- from sentence_transformers import SentenceTransformer
13
- from pinecone import Pinecone, ServerlessSpec
14
  import logging
15
 
16
  # Load environment variables
17
  load_dotenv()
18
 
19
  # Initialize logging
20
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
21
 
22
  # Initialize LLM
23
  llm = ChatGroq(temperature=0.5, groq_api_key="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", model_name="llama3-8b-8192")
24
 
25
- # Initialize Pinecone for vector storage
26
- PINECONE_API_KEY = "pcsk_6PtxDh_6tortuWyNhXdmVrAjx1ZSv8bQRcbgbE7j3JtwwcpMCkFfdsp6VC925WxmqpNYQC"
27
- pc = Pinecone(api_key=PINECONE_API_KEY)
28
-
29
- cloud = os.getenv('PINECONE_CLOUD', 'aws')
30
- region = os.getenv('PINECONE_REGION', 'us-east-1')
31
-
32
- spec = ServerlessSpec(cloud=cloud, region=region)
33
-
34
- index_name = "syllabus-index"
35
- if index_name not in pc.list_indexes().names():
36
- pc.create_index(
37
- name=index_name,
38
- dimension=384,
39
- spec=spec
40
- )
41
-
42
- index = pc.Index(index_name)
43
-
44
- # Initialize embedding model
45
- embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
46
-
47
  # OCR Configuration for Pytesseract
48
- pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Adjust to your system's path
49
 
50
  # Function to extract text, images, tables, and formulas from PDF
51
  def extract_pdf_data(pdf_path):
@@ -53,13 +28,10 @@ def extract_pdf_data(pdf_path):
53
  try:
54
  with pdfplumber.open(pdf_path) as pdf:
55
  for page in pdf.pages:
56
- # Extract Text
57
  data["text"] += page.extract_text() or ""
58
- # Extract Tables
59
  tables = page.extract_tables()
60
  for table in tables:
61
  data["tables"].append(table)
62
- # Extract Images
63
  for image in page.images:
64
  base_image = pdf.extract_image(image["object_number"])
65
  image_obj = Image.open(BytesIO(base_image["image"]))
@@ -70,25 +42,34 @@ def extract_pdf_data(pdf_path):
70
 
71
  # Function to extract text from DOCX files
72
  def extract_docx_data(docx_file):
73
- doc = docx.Document(docx_file)
74
- text = ""
75
- for para in doc.paragraphs:
76
- text += para.text + "\n"
77
- return text
 
 
78
 
79
  # Function to extract text from plain text files
80
  def extract_text_file_data(text_file):
81
- return text_file.read().decode('utf-8')
 
 
 
 
82
 
83
  # Function to extract text from images using OCR
84
  def extract_text_from_images(images):
85
  ocr_text = ""
86
  for image in images:
87
- ocr_text += pytesseract.image_to_string(image) + "\n"
88
- return ocr_text
 
 
 
89
 
90
  # Function to process extracted content (PDF, DOCX, etc.)
91
- def process_content(file_data, file_type="pdf"):
92
  text = ""
93
  images = []
94
  if file_type == "pdf":
@@ -99,48 +80,34 @@ def process_content(file_data, file_type="pdf"):
99
  text = extract_docx_data(file_data)
100
  elif file_type == "txt":
101
  text = extract_text_file_data(file_data)
 
 
 
102
 
103
  ocr_text = extract_text_from_images(images)
104
  return text + "\n" + ocr_text
105
 
106
  # Function to process PDF content
107
  def process_pdf_content(pdf_data):
108
- # Process OCR text from images
109
  ocr_text = extract_text_from_images(pdf_data["images"])
110
  combined_text = pdf_data["text"] + ocr_text
111
 
112
- # Process tables into readable text
113
  table_text = ""
114
  for table in pdf_data["tables"]:
115
- table_rows = [" | ".join(row) for row in table]
116
  table_text += "\n".join(table_rows) + "\n"
117
 
118
- return combined_text + "\n" + table_text
119
-
120
- # Function to add syllabus to vector database
121
- def add_syllabus_to_index(syllabus_text):
122
- sentences = syllabus_text.split(". ")
123
- embeddings = embedder.encode(sentences, batch_size=32, show_progress_bar=True)
124
- for i, sentence in enumerate(sentences):
125
- index.upsert([(f"sentence-{i}", embeddings[i].tolist(), {"text": sentence})])
126
-
127
- # Function to retrieve relevant syllabus content
128
- def retrieve_relevant_content(query):
129
- try:
130
- query_embedding = embedder.encode([query])
131
- results = index.query(vector=query_embedding.tolist(), top_k=5, include_metadata=True)
132
- relevant_content = "\n".join([match["metadata"]["text"] for match in results["matches"]])
133
- return relevant_content
134
- except Exception as e:
135
- logging.error(f"Error retrieving content: {e}")
136
- return ""
137
 
138
  # Function to generate questions
139
- def generate_questions(question_type, subject_name, syllabus_context, num_questions, difficulty_level):
140
  prompt_template = f"""
141
  Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
142
 
143
  Subject: {subject_name}
 
 
 
144
  Syllabus Content: {syllabus_context}
145
 
146
  Difficulty Levels:
@@ -190,25 +157,45 @@ def generate_answers(questions, syllabus_context):
190
  # Streamlit app
191
  st.title("Bloom's Taxonomy Based Exam Paper Developer")
192
 
193
- # Sidebar inputs
194
- instructor_name = st.sidebar.text_input("Instructor")
195
- class_name = st.sidebar.text_input("Class")
196
- institution_name = st.sidebar.text_input("Institution")
197
- subject_name = st.sidebar.text_input("Subject")
 
 
 
 
 
 
 
 
 
 
 
198
 
199
- # Syllabus Upload
200
- uploaded_file = st.sidebar.file_uploader("Upload Syllabus (PDF, DOCX, TXT, Image)", type=["pdf", "docx", "txt", "png", "jpg"])
201
- syllabus_text = None
202
  if uploaded_file:
203
- file_type = uploaded_file.type.split("/")[1]
204
- st.sidebar.markdown("✅ Syllabus uploaded")
205
- syllabus_text = process_content(uploaded_file, file_type)
206
- add_syllabus_to_index(syllabus_text)
 
 
 
 
 
 
 
 
 
 
207
 
208
  # Preview of Syllabus
209
- if syllabus_text:
210
  st.subheader("Syllabus Preview:")
211
- st.text_area("Extracted Content", syllabus_text[:1000], height=300)
 
 
212
 
213
  # Question Type Selection
214
  question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
@@ -216,31 +203,25 @@ difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "
216
  difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
217
  num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)
218
 
219
- # Instructor Feedback Option
220
- feedback = st.sidebar.text_area("Instructor Feedback (Optional)")
221
-
222
- # Generate Questions
223
  if st.sidebar.button("Generate Questions"):
224
- if syllabus_text:
225
  with st.spinner(f"Generating {question_type}..."):
226
- syllabus_context = retrieve_relevant_content(f"Generate {question_type} based on syllabus")
227
- st.session_state.generated_questions = generate_questions(question_type, subject_name, syllabus_context, num_questions, difficulty)
228
  st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
229
  else:
230
  st.error("Please upload a syllabus before generating questions.")
231
 
232
- # Generate Answers
233
  if st.sidebar.button("Generate Answers for Questions"):
234
- if "generated_questions" in st.session_state and st.session_state.generated_questions:
235
  with st.spinner("Generating answers..."):
236
- syllabus_context = retrieve_relevant_content("Generate answers from syllabus")
237
  st.session_state.generated_answers = generate_answers(st.session_state.generated_questions, syllabus_context)
238
  st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
239
  else:
240
  st.error("Generate questions first before generating answers.")
241
 
242
- # Download Options
243
- if "generated_questions" in st.session_state and st.session_state.generated_questions:
244
  st.sidebar.download_button(
245
  label="Download Questions",
246
  data=st.session_state.generated_questions,
@@ -248,7 +229,7 @@ if "generated_questions" in st.session_state and st.session_state.generated_ques
248
  mime="text/plain",
249
  )
250
 
251
- if "generated_answers" in st.session_state and st.session_state.generated_answers:
252
  st.sidebar.download_button(
253
  label="Download Answers",
254
  data=st.session_state.generated_answers,
@@ -256,8 +237,7 @@ if "generated_answers" in st.session_state and st.session_state.generated_answer
256
  mime="text/plain",
257
  )
258
 
259
- # Application Footer
260
- st.markdown("""
261
- ---
262
- **Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
263
  """)
 
3
  from langchain_core.output_parsers import StrOutputParser
4
  from langchain_core.prompts import ChatPromptTemplate
5
  from dotenv import load_dotenv
 
6
  import pytesseract
7
  from PIL import Image
8
  import pdfplumber
9
  import docx
10
  from io import BytesIO
 
 
11
  import logging
12
 
13
  # Load environment variables
14
  load_dotenv()
15
 
16
  # Initialize logging
17
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
18
 
19
  # Initialize LLM
20
  llm = ChatGroq(temperature=0.5, groq_api_key="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", model_name="llama3-8b-8192")
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # OCR Configuration for Pytesseract
23
+ pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # Adjust to your system's path
24
 
25
  # Function to extract text, images, tables, and formulas from PDF
26
  def extract_pdf_data(pdf_path):
 
28
  try:
29
  with pdfplumber.open(pdf_path) as pdf:
30
  for page in pdf.pages:
 
31
  data["text"] += page.extract_text() or ""
 
32
  tables = page.extract_tables()
33
  for table in tables:
34
  data["tables"].append(table)
 
35
  for image in page.images:
36
  base_image = pdf.extract_image(image["object_number"])
37
  image_obj = Image.open(BytesIO(base_image["image"]))
 
42
 
43
  # Function to extract text from DOCX files
44
  def extract_docx_data(docx_file):
45
+ try:
46
+ doc = docx.Document(docx_file)
47
+ text = "\n".join([para.text.strip() for para in doc.paragraphs if para.text.strip()])
48
+ return text
49
+ except Exception as e:
50
+ logging.error(f"Error extracting DOCX content: {e}")
51
+ return ""
52
 
53
  # Function to extract text from plain text files
54
  def extract_text_file_data(text_file):
55
+ try:
56
+ return text_file.read().decode("utf-8").strip()
57
+ except Exception as e:
58
+ logging.error(f"Error extracting TXT content: {e}")
59
+ return ""
60
 
61
  # Function to extract text from images using OCR
62
  def extract_text_from_images(images):
63
  ocr_text = ""
64
  for image in images:
65
+ try:
66
+ ocr_text += pytesseract.image_to_string(image).strip() + "\n"
67
+ except Exception as e:
68
+ logging.error(f"Error in OCR: {e}")
69
+ return ocr_text.strip()
70
 
71
  # Function to process extracted content (PDF, DOCX, etc.)
72
+ def process_content(file_data, file_type):
73
  text = ""
74
  images = []
75
  if file_type == "pdf":
 
80
  text = extract_docx_data(file_data)
81
  elif file_type == "txt":
82
  text = extract_text_file_data(file_data)
83
+ elif file_type in ["png", "jpg", "jpeg"]:
84
+ image = Image.open(file_data)
85
+ images.append(image)
86
 
87
  ocr_text = extract_text_from_images(images)
88
  return text + "\n" + ocr_text
89
 
90
  # Function to process PDF content
91
  def process_pdf_content(pdf_data):
 
92
  ocr_text = extract_text_from_images(pdf_data["images"])
93
  combined_text = pdf_data["text"] + ocr_text
94
 
 
95
  table_text = ""
96
  for table in pdf_data["tables"]:
97
+ table_rows = [" | ".join(str(cell) if cell else "" for cell in row) for row in table]
98
  table_text += "\n".join(table_rows) + "\n"
99
 
100
+ return (combined_text + "\n" + table_text).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  # Function to generate questions
103
+ def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
104
  prompt_template = f"""
105
  Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
106
 
107
  Subject: {subject_name}
108
+ Instructor: {instructor}
109
+ Class: {class_name}
110
+ Institution: {institution}
111
  Syllabus Content: {syllabus_context}
112
 
113
  Difficulty Levels:
 
157
  # Streamlit app
158
  st.title("Bloom's Taxonomy Based Exam Paper Developer")
159
 
160
+ # Sidebar Clear Data Button
161
+ if st.sidebar.button("Clear All Data"):
162
+ st.session_state.clear()
163
+ st.success("All data has been cleared. You can now upload a new syllabus.")
164
+
165
+ # Syllabus Upload with Automatic Clearing
166
+ uploaded_file = st.sidebar.file_uploader(
167
+ "Upload Syllabus (PDF, DOCX, TXT, Image)",
168
+ type=["pdf", "docx", "txt", "png", "jpg"]
169
+ )
170
+
171
+ # Sidebar Inputs for Subject Name, Instructor, Class, and Institution
172
+ subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
173
+ instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
174
+ class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
175
+ institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
176
 
 
 
 
177
  if uploaded_file:
178
+ # Clear session state when a new file is uploaded
179
+ if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
180
+ st.session_state.clear()
181
+ st.success("Previous data cleared. Processing new file...")
182
+
183
+ st.session_state.uploaded_filename = uploaded_file.name
184
+ file_type = uploaded_file.type.split("/")[-1]
185
+
186
+ # Validate file type
187
+ if file_type not in ["pdf", "docx", "txt", "png", "jpg"]:
188
+ st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
189
+ else:
190
+ syllabus_text = process_content(uploaded_file, file_type)
191
+ st.session_state.syllabus_text = syllabus_text
192
 
193
  # Preview of Syllabus
194
+ if "syllabus_text" in st.session_state:
195
  st.subheader("Syllabus Preview:")
196
+ st.text_area("Extracted Content", st.session_state.syllabus_text[:1000], height=300)
197
+ else:
198
+ st.warning("Please upload a syllabus to begin.")
199
 
200
  # Question Type Selection
201
  question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
 
203
  difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
204
  num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)
205
 
 
 
 
 
206
  if st.sidebar.button("Generate Questions"):
207
+ if "syllabus_text" in st.session_state:
208
  with st.spinner(f"Generating {question_type}..."):
209
+ syllabus_context = st.session_state.syllabus_text
210
+ st.session_state.generated_questions = generate_questions(question_type, subject_name, instructor_name, class_name, institution_name, syllabus_context, num_questions, difficulty)
211
  st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
212
  else:
213
  st.error("Please upload a syllabus before generating questions.")
214
 
 
215
  if st.sidebar.button("Generate Answers for Questions"):
216
+ if "generated_questions" in st.session_state:
217
  with st.spinner("Generating answers..."):
218
+ syllabus_context = st.session_state.syllabus_text
219
  st.session_state.generated_answers = generate_answers(st.session_state.generated_questions, syllabus_context)
220
  st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
221
  else:
222
  st.error("Generate questions first before generating answers.")
223
 
224
+ if "generated_questions" in st.session_state:
 
225
  st.sidebar.download_button(
226
  label="Download Questions",
227
  data=st.session_state.generated_questions,
 
229
  mime="text/plain",
230
  )
231
 
232
+ if "generated_answers" in st.session_state:
233
  st.sidebar.download_button(
234
  label="Download Answers",
235
  data=st.session_state.generated_answers,
 
237
  mime="text/plain",
238
  )
239
 
240
+ st.markdown("""
241
+ ---
242
+ **Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
 
243
  """)