ahm14 committed on
Commit
644ed04
·
verified ·
1 Parent(s): c4be7cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -40
app.py CHANGED
@@ -1,15 +1,15 @@
1
-
2
  import streamlit as st
3
  from langchain_groq import ChatGroq
4
  from langchain_core.output_parsers import StrOutputParser
5
  from langchain_core.prompts import ChatPromptTemplate
6
  from dotenv import load_dotenv
7
  import pytesseract
8
- from PIL import Image
9
  import pdfplumber
10
  import docx
11
  from io import BytesIO
12
  import logging
 
13
  from concurrent.futures import ThreadPoolExecutor
14
  from streamlit.runtime.caching import cache_data
15
  import requests
@@ -27,17 +27,27 @@ llm = ChatGroq(temperature=0.5, groq_api_key="gsk_cnE3PNB19Dg4H2UNQ1zbWGdyb3FYsl
27
  # OCR Configuration
28
  pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # Adjust based on your system's path
29
 
30
- # OCR Extraction from Images
 
 
 
 
 
 
 
 
 
31
  def extract_text_from_images(images, lang="eng"):
32
  ocr_text = ""
33
  for image in images:
34
  try:
35
- ocr_text += pytesseract.image_to_string(image, lang=lang).strip() + "\n"
 
36
  except Exception as e:
37
  logging.error(f"Error in OCR: {e}")
38
  return ocr_text.strip()
39
 
40
- # Extract content from PDFs
41
  @cache_data
42
  def extract_pdf_data(pdf_file):
43
  data = {"text": "", "images": []}
@@ -53,7 +63,7 @@ def extract_pdf_data(pdf_file):
53
  logging.error(f"Error processing PDF: {e}")
54
  return data
55
 
56
- # Extract content from DOCX
57
  @cache_data
58
  def extract_docx_data(docx_file):
59
  try:
@@ -64,7 +74,7 @@ def extract_docx_data(docx_file):
64
  logging.error(f"Error processing DOCX: {e}")
65
  return ""
66
 
67
- # Extract plain text files
68
  @cache_data
69
  def extract_txt_data(txt_file):
70
  try:
@@ -73,50 +83,52 @@ def extract_txt_data(txt_file):
73
  logging.error(f"Error processing TXT: {e}")
74
  return ""
75
 
76
- # Process uploaded files
77
  def process_files(uploaded_files, lang="eng"):
78
  combined_text = ""
79
  images = []
80
- for file in uploaded_files:
 
81
  file_type = file.type.split("/")[-1]
82
  if file_type == "pdf":
83
  pdf_data = extract_pdf_data(file)
84
- combined_text += pdf_data["text"]
85
- images.extend(pdf_data["images"])
86
  elif file_type == "docx":
87
- combined_text += extract_docx_data(file)
88
  elif file_type == "txt":
89
- combined_text += extract_txt_data(file)
90
  elif file_type in ["png", "jpg", "jpeg"]:
91
- images.append(Image.open(file))
 
 
 
 
 
 
 
 
 
 
 
92
  ocr_text = extract_text_from_images(images, lang)
93
  return combined_text + "\n" + ocr_text
94
 
95
- # Generate questions
96
- def generate_questions(question_type, syllabus_text, num_questions, difficulty):
97
- prompt_template = f"""
98
- Generate {num_questions} {question_type} questions from the syllabus content provided below.
99
- Syllabus Content: {syllabus_text}
100
- Difficulty Levels:
101
- - Remember: {difficulty.get('Remember', 0)}
102
- - Understand: {difficulty.get('Understand', 0)}
103
- - Apply: {difficulty.get('Apply', 0)}
104
- - Analyze: {difficulty.get('Analyze', 0)}
105
- - Evaluate: {difficulty.get('Evaluate', 0)}
106
- - Create: {difficulty.get('Create', 0)}
107
- Format questions as follows:
108
- Q1. ________________
109
- Q2. ________________
110
- ...
111
- """
112
- chain = (ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser())
113
  try:
114
  return chain.invoke({})
115
  except Exception as e:
116
  logging.error(f"Error generating questions: {e}")
117
  return ""
118
 
119
- # Internet search for answers
120
  def search_answers_online(question):
121
  search_url = f"https://www.google.com/search?q={question}"
122
  headers = {"User-Agent": "Mozilla/5.0"}
@@ -129,7 +141,7 @@ def search_answers_online(question):
129
  logging.error(f"Error fetching online answers: {e}")
130
  return "No online answer found."
131
 
132
- # Generate answers
133
  def generate_answers(questions, syllabus_text):
134
  answers = {}
135
  for i, question in enumerate(questions.split("\n")):
@@ -172,21 +184,43 @@ with tab2:
172
  st.header("Preview Syllabus Content")
173
  if "syllabus_text" in st.session_state:
174
  st.text_area("Extracted Content", st.session_state["syllabus_text"], height=300)
 
 
 
175
  else:
176
  st.warning("No content available. Upload files first.")
177
 
178
  # Generate questions
179
  with tab3:
180
  st.header("Generate Questions")
181
- question_type = st.selectbox("Select Question Type", ["MCQs", "Short Questions", "Long Questions"])
182
- num_questions = st.slider("Number of Questions", 1, 20, 5)
183
  difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
184
  difficulty = {level: st.slider(level, 0, 5, 1) for level in difficulty_levels}
185
- if st.button("Generate Questions"):
186
- questions = generate_questions(question_type, st.session_state.get("syllabus_text", ""), num_questions, difficulty)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  st.session_state["questions"] = questions
188
  st.text_area("Generated Questions", questions, height=300)
189
 
 
 
 
190
  # Generate answers
191
  with tab4:
192
  st.header("Generate Answers")
@@ -195,5 +229,6 @@ with tab4:
195
  answers = generate_answers(st.session_state["questions"], st.session_state.get("syllabus_text", ""))
196
  st.session_state["answers"] = answers
197
  st.text_area("Generated Answers", answers, height=300)
198
- else:
199
- st.warning("No questions available. Generate questions first.")
 
 
 
1
  import streamlit as st
2
  from langchain_groq import ChatGroq
3
  from langchain_core.output_parsers import StrOutputParser
4
  from langchain_core.prompts import ChatPromptTemplate
5
  from dotenv import load_dotenv
6
  import pytesseract
7
+ from PIL import Image, ImageEnhance
8
  import pdfplumber
9
  import docx
10
  from io import BytesIO
11
  import logging
12
+ import os
13
  from concurrent.futures import ThreadPoolExecutor
14
  from streamlit.runtime.caching import cache_data
15
  import requests
 
27
  # OCR Configuration
28
  pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # Adjust based on your system's path
29
 
30
+ # Function to enhance image for OCR processing
def enhance_image_for_ocr(image):
    """Return a grayscale, higher-contrast version of ``image`` for OCR.

    The image is converted to mode ``"L"`` (grayscale) and then run through
    ``ImageEnhance.Contrast`` with a factor of 2.0 so text stands out more
    clearly before it is handed to Tesseract.
    """
    grayscale = image.convert("L")
    return ImageEnhance.Contrast(grayscale).enhance(2.0)
38
+
39
+ # Function to extract text from images using OCR
def extract_text_from_images(images, lang="eng"):
    """Run OCR over ``images`` and return the recognised text.

    Each image is pre-processed with ``enhance_image_for_ocr`` and passed to
    ``pytesseract.image_to_string`` using the Tesseract language code
    ``lang``. The stripped text of every image is followed by a newline, and
    the combined result is stripped once more before being returned. An image
    that raises during processing is logged and skipped, so one bad image does
    not discard the text of the others.
    """
    pieces = []
    for picture in images:
        try:
            prepared = enhance_image_for_ocr(picture)
            recognised = pytesseract.image_to_string(prepared, lang=lang)
            pieces.append(recognised.strip() + "\n")
        except Exception as e:
            logging.error(f"Error in OCR: {e}")
    return "".join(pieces).strip()
49
 
50
+ # Function to extract content from PDFs
51
  @cache_data
52
  def extract_pdf_data(pdf_file):
53
  data = {"text": "", "images": []}
 
63
  logging.error(f"Error processing PDF: {e}")
64
  return data
65
 
66
+ # Function to extract content from DOCX files
67
  @cache_data
68
  def extract_docx_data(docx_file):
69
  try:
 
74
  logging.error(f"Error processing DOCX: {e}")
75
  return ""
76
 
77
+ # Function to extract plain text from TXT files
78
  @cache_data
79
  def extract_txt_data(txt_file):
80
  try:
 
83
  logging.error(f"Error processing TXT: {e}")
84
  return ""
85
 
86
+ # Process uploaded files in parallel and extract text and images
# MIME subtypes that do not match the short kind names used by process_files.
_MIME_SUBTYPE_ALIASES = {
    "plain": "txt",
    "vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
}
_SUPPORTED_FILE_KINDS = frozenset({"pdf", "docx", "txt", "png", "jpg", "jpeg"})


def _detect_file_kind(file):
    """Return a short lowercase kind ("pdf", "docx", "txt", "png", ...) for an upload.

    The MIME subtype from ``file.type`` is tried first, normalised through
    ``_MIME_SUBTYPE_ALIASES``. If that is not a supported kind, the filename
    extension from ``file.name`` is used instead. When neither is recognised
    the MIME-derived value is returned so the caller can log it.
    """
    mime_type = getattr(file, "type", "") or ""
    subtype = mime_type.split("/")[-1].lower()
    kind = _MIME_SUBTYPE_ALIASES.get(subtype, subtype)
    if kind in _SUPPORTED_FILE_KINDS:
        return kind

    file_name = getattr(file, "name", "") or ""
    _, dot, extension = file_name.rpartition(".")
    if dot and extension.lower() in _SUPPORTED_FILE_KINDS:
        return extension.lower()
    return kind


def process_files(uploaded_files, lang="eng"):
    """Extract text from every uploaded file and OCR any images found.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``type``
            (a MIME type) and ``name``. ``None`` is treated as no files.
        lang: Tesseract language code forwarded to
            ``extract_text_from_images``.

    Returns:
        The concatenated text of all PDF/DOCX/TXT files, a newline, and then
        the OCR text of all images (uploaded images plus images returned by
        the PDF extractor).
    """

    def process_file(file):
        # Returns a (text, images) pair so results can be merged after the pool finishes.
        file_type = _detect_file_kind(file)
        if file_type == "pdf":
            pdf_data = extract_pdf_data(file)
            return pdf_data["text"], pdf_data["images"]
        if file_type == "docx":
            return extract_docx_data(file), []
        if file_type == "txt":
            return extract_txt_data(file), []
        if file_type in ("png", "jpg", "jpeg"):
            return "", [Image.open(file)]
        logging.error(f"Unsupported file type: {file_type}")
        return "", []

    with ThreadPoolExecutor() as executor:
        # executor.map preserves input order, so text is combined in upload order.
        results = list(executor.map(process_file, uploaded_files or []))

    texts = []
    images = []
    for text, img_list in results:
        texts.append(text)
        images.extend(img_list)

    combined_text = "".join(texts)
    ocr_text = extract_text_from_images(images, lang)
    return combined_text + "\n" + ocr_text
115
 
116
+ # Generate structured questions with MCQs, Fill-in-the-Blank, Case Studies
def generate_questions(question_type, syllabus_text, num_questions, difficulty, prompt_template):
    """Ask the LLM to generate questions from the syllabus text.

    Args:
        question_type: Label of the kind of question to generate.
        syllabus_text: Extracted syllabus content given to the model.
        num_questions: Number of questions to request.
        difficulty: Mapping of difficulty level name to its value; each key
            is available to the template as a placeholder of the same name.
        prompt_template: Template text with ``{placeholder}`` fields. It is
            user-editable, so it may reference any subset of the values.

    Returns:
        The model output as a string, or ``""`` if the template is invalid
        or the LLM call fails (the error is logged).
    """
    values = {
        **difficulty,
        "num_questions": num_questions,
        "question_type": question_type,
        "syllabus_text": syllabus_text,
    }
    try:
        # Parse the raw template once and supply the values at invoke time.
        # Pre-formatting and then re-parsing would treat any "{" or "}" inside
        # the syllabus text as a template variable and fail.
        prompt = ChatPromptTemplate.from_template(prompt_template)
        chain = prompt | llm | StrOutputParser()
        # Pass only the variables the template uses; an unknown placeholder
        # raises KeyError here and is reported through the handler below.
        inputs = {name: values[name] for name in prompt.input_variables}
        return chain.invoke(inputs)
    except Exception as e:
        logging.error(f"Error generating questions: {e}")
        return ""
130
 
131
+ # Function to search answers online
132
  def search_answers_online(question):
133
  search_url = f"https://www.google.com/search?q={question}"
134
  headers = {"User-Agent": "Mozilla/5.0"}
 
141
  logging.error(f"Error fetching online answers: {e}")
142
  return "No online answer found."
143
 
144
+ # Generate answers for questions
145
  def generate_answers(questions, syllabus_text):
146
  answers = {}
147
  for i, question in enumerate(questions.split("\n")):
 
184
  st.header("Preview Syllabus Content")
185
  if "syllabus_text" in st.session_state:
186
  st.text_area("Extracted Content", st.session_state["syllabus_text"], height=300)
187
+ if st.session_state.get("images"):
188
+ for img in st.session_state["images"]:
189
+ st.image(img, caption="Uploaded Image")
190
  else:
191
  st.warning("No content available. Upload files first.")
192
 
193
  # Generate questions
with tab3:
    st.header("Generate Questions")
    question_type = st.selectbox(
        "Select Question Type",
        ["MCQs", "Short Questions", "Long Questions", "Fill-in-the-Blank", "Case Study"],
    )
    num_questions = st.text_input("Total Number of Questions")
    difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
    difficulty = {level: st.slider(level, 0, 5, 1) for level in difficulty_levels}
    prompt_template = st.text_area(
        "Edit Prompt Template",
        """
Generate {num_questions} {question_type} questions from the syllabus content below.
Syllabus Content: {syllabus_text}
Difficulty Levels:
- Remember: {Remember}
- Understand: {Understand}
- Apply: {Apply}
- Analyze: {Analyze}
- Evaluate: {Evaluate}
- Create: {Create}
""",
        height=200,
    )

    # Always render the button and validate on click, so the user gets
    # feedback instead of a button that only appears once the field is valid.
    if st.button("Generate Questions"):
        try:
            # int() is the authority on what parses; str.isdigit() accepts
            # characters such as superscript digits that int() rejects.
            requested_count = int(num_questions)
        except ValueError:
            requested_count = 0
        if requested_count > 0:
            num_questions = requested_count
            st.session_state["questions"] = generate_questions(
                question_type,
                st.session_state.get("syllabus_text", ""),
                num_questions,
                difficulty,
                prompt_template,
            )
        else:
            st.warning("Enter a positive whole number of questions.")

    # Render from session state so the questions and the download button
    # survive reruns triggered by other widgets (including the download click).
    questions = st.session_state.get("questions", "")
    if questions:
        st.text_area("Generated Questions", questions, height=300)

        # Download questions
        st.download_button("Download Questions", questions, file_name="questions.txt")
223
+
224
  # Generate answers
225
  with tab4:
226
  st.header("Generate Answers")
 
229
  answers = generate_answers(st.session_state["questions"], st.session_state.get("syllabus_text", ""))
230
  st.session_state["answers"] = answers
231
  st.text_area("Generated Answers", answers, height=300)
232
+
233
+ # Download answers
234
+ st.download_button("Download Answers", answers, file_name="answers.txt")