Spaces:

ahm14
/

Advanced_Exam_Generator

Build error

App Files Files Community

ahm14 committed on Jan 13, 2025

Commit

644ed04

verified ·

1 Parent(s): c4be7cf

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -40

app.py CHANGED Viewed

@@ -1,15 +1,15 @@
 import streamlit as st
 from langchain_groq import ChatGroq
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from dotenv import load_dotenv
 import pytesseract
-from PIL import Image
 import pdfplumber
 import docx
 from io import BytesIO
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from streamlit.runtime.caching import cache_data
 import requests
@@ -27,17 +27,27 @@ llm = ChatGroq(temperature=0.5, groq_api_key="gsk_cnE3PNB19Dg4H2UNQ1zbWGdyb3FYsl
 # OCR Configuration
 pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust based on your system's path
# OCR extraction from images
def extract_text_from_images(images, lang="eng"):
    """Run Tesseract OCR over every image and return the combined text.

    Args:
        images: Iterable of images passed to ``pytesseract.image_to_string``.
        lang: Language code forwarded to the OCR call.

    Returns:
        The stripped text of each successfully processed image, joined by
        newlines, with surrounding whitespace removed. An image whose OCR
        call raises is logged and skipped.
    """
    recognized = []
    for picture in images:
        try:
            recognized.append(pytesseract.image_to_string(picture, lang=lang).strip())
        except Exception as exc:
            logging.error(f"Error in OCR: {exc}")
    return "\n".join(recognized).strip()
-# Extract content from PDFs
 @cache_data
 def extract_pdf_data(pdf_file):
     data = {"text": "", "images": []}
@@ -53,7 +63,7 @@ def extract_pdf_data(pdf_file):
         logging.error(f"Error processing PDF: {e}")
     return data
-# Extract content from DOCX
 @cache_data
 def extract_docx_data(docx_file):
     try:
@@ -64,7 +74,7 @@ def extract_docx_data(docx_file):
         logging.error(f"Error processing DOCX: {e}")
         return ""
-# Extract plain text files
 @cache_data
 def extract_txt_data(txt_file):
     try:
@@ -73,50 +83,52 @@ def extract_txt_data(txt_file):
         logging.error(f"Error processing TXT: {e}")
         return ""
# Process uploaded files
def process_files(uploaded_files, lang="eng"):
    """Extract text from every uploaded file and OCR any images found.

    The file kind is taken from the filename extension when it is a supported
    one, and otherwise from the MIME subtype in ``file.type``. The MIME subtype
    alone is not enough: ``text/plain`` yields ``plain`` and DOCX uploads yield
    ``vnd.openxmlformats-officedocument.wordprocessingml.document``, so a
    plain ``split("/")[-1]`` never matches ``txt`` or ``docx``.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``type``
            and, when available, ``name``.
        lang: Language code forwarded to ``extract_text_from_images``.

    Returns:
        The concatenated text of all files, a newline, then the OCR text of
        all collected images. Unsupported file kinds are ignored.
    """
    supported = {"pdf", "docx", "txt", "png", "jpg", "jpeg"}
    mime_aliases = {
        "plain": "txt",
        "vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    }
    text_parts = []
    images = []
    for file in uploaded_files:
        filename = getattr(file, "name", "") or ""
        extension = filename.rpartition(".")[2].lower() if "." in filename else ""
        subtype = (getattr(file, "type", "") or "").split("/")[-1].lower()
        # Prefer the extension; fall back to the (normalised) MIME subtype.
        file_type = extension if extension in supported else mime_aliases.get(subtype, subtype)
        if file_type == "pdf":
            pdf_data = extract_pdf_data(file)
            text_parts.append(pdf_data["text"])
            images.extend(pdf_data["images"])
        elif file_type == "docx":
            text_parts.append(extract_docx_data(file))
        elif file_type == "txt":
            text_parts.append(extract_txt_data(file))
        elif file_type in ("png", "jpg", "jpeg"):
            images.append(Image.open(file))
    ocr_text = extract_text_from_images(images, lang)
    return "".join(text_parts) + "\n" + ocr_text
# Generate questions
def generate_questions(question_type, syllabus_text, num_questions, difficulty):
    """Ask the LLM for exam questions based on the syllabus text.

    The prompt keeps its placeholders and the values are supplied through
    ``chain.invoke``. Interpolating the syllabus into the template string
    first would make ``ChatPromptTemplate.from_template`` parse any ``{`` or
    ``}`` in the syllabus as template syntax.

    Args:
        question_type: Label of the question style to request.
        syllabus_text: Source material the questions are drawn from.
        num_questions: How many questions to request.
        difficulty: Mapping of difficulty level name to a count; levels that
            are missing default to 0.

    Returns:
        The model's text output, or ``""`` if building or invoking the chain
        raises (the error is logged).
    """
    prompt_template = """
    Generate {num_questions} {question_type} questions from the syllabus content provided below.
    Syllabus Content: {syllabus_text}
    Difficulty Levels:
    - Remember: {Remember}
    - Understand: {Understand}
    - Apply: {Apply}
    - Analyze: {Analyze}
    - Evaluate: {Evaluate}
    - Create: {Create}
    Format questions as follows:
    Q1. ________________
    Q2. ________________
    ...
    """
    levels = ("Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create")
    variables = {level: difficulty.get(level, 0) for level in levels}
    variables.update(
        num_questions=num_questions,
        question_type=question_type,
        syllabus_text=syllabus_text,
    )
    try:
        chain = ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser()
        return chain.invoke(variables)
    except Exception as e:
        logging.error(f"Error generating questions: {e}")
        return ""
-# Internet search for answers
 def search_answers_online(question):
     search_url = f"https://www.google.com/search?q={question}"
     headers = {"User-Agent": "Mozilla/5.0"}
@@ -129,7 +141,7 @@ def search_answers_online(question):
         logging.error(f"Error fetching online answers: {e}")
         return "No online answer found."
-# Generate answers
 def generate_answers(questions, syllabus_text):
     answers = {}
     for i, question in enumerate(questions.split("\n")):
@@ -172,21 +184,43 @@ with tab2:
     st.header("Preview Syllabus Content")
     if "syllabus_text" in st.session_state:
         st.text_area("Extracted Content", st.session_state["syllabus_text"], height=300)
     else:
         st.warning("No content available. Upload files first.")
 # Generate questions
 with tab3:
     st.header("Generate Questions")
-    question_type = st.selectbox("Select Question Type", ["MCQs", "Short Questions", "Long Questions"])
-    num_questions = st.slider("Number of Questions", 1, 20, 5)
     difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
     difficulty = {level: st.slider(level, 0, 5, 1) for level in difficulty_levels}
-    if st.button("Generate Questions"):
-        questions = generate_questions(question_type, st.session_state.get("syllabus_text", ""), num_questions, difficulty)
         st.session_state["questions"] = questions
         st.text_area("Generated Questions", questions, height=300)
 # Generate answers
 with tab4:
     st.header("Generate Answers")
@@ -195,5 +229,6 @@ with tab4:
             answers = generate_answers(st.session_state["questions"], st.session_state.get("syllabus_text", ""))
             st.session_state["answers"] = answers
             st.text_area("Generated Answers", answers, height=300)
-    else:
-        st.warning("No questions available. Generate questions first.")

 import streamlit as st
 from langchain_groq import ChatGroq
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from dotenv import load_dotenv
 import pytesseract
+from PIL import Image, ImageEnhance
 import pdfplumber
 import docx
 from io import BytesIO
 import logging
+import os
 from concurrent.futures import ThreadPoolExecutor
 from streamlit.runtime.caching import cache_data
 import requests
 # OCR Configuration
 pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust based on your system's path
# Function to enhance image for OCR processing
def enhance_image_for_ocr(image):
    """Return a grayscale, contrast-boosted version of ``image`` for OCR.

    Args:
        image: Image object providing ``convert`` (presumably a PIL image —
            confirm against callers).

    Returns:
        The result of converting to mode ``"L"`` and applying a contrast
        enhancement factor of 2.0.
    """
    contrast_factor = 2.0
    grayscale = image.convert("L")
    return ImageEnhance.Contrast(grayscale).enhance(contrast_factor)
# Function to extract text from images using OCR
def extract_text_from_images(images, lang="eng"):
    """Enhance each image, run Tesseract OCR on it, and return the combined text.

    Args:
        images: Iterable of images passed to ``enhance_image_for_ocr``.
        lang: Language code forwarded to ``pytesseract.image_to_string``.

    Returns:
        The stripped text of each successfully processed image, joined by
        newlines, with surrounding whitespace removed. An image whose
        enhancement or OCR raises is logged and skipped.
    """
    recognized = []
    for picture in images:
        try:
            prepared = enhance_image_for_ocr(picture)
            recognized.append(pytesseract.image_to_string(prepared, lang=lang).strip())
        except Exception as exc:
            logging.error(f"Error in OCR: {exc}")
    return "\n".join(recognized).strip()
+# Function to extract content from PDFs
 @cache_data
 def extract_pdf_data(pdf_file):
     data = {"text": "", "images": []}
         logging.error(f"Error processing PDF: {e}")
     return data
+# Function to extract content from DOCX files
 @cache_data
 def extract_docx_data(docx_file):
     try:
         logging.error(f"Error processing DOCX: {e}")
         return ""
+# Function to extract plain text from TXT files
 @cache_data
 def extract_txt_data(txt_file):
     try:
         logging.error(f"Error processing TXT: {e}")
         return ""
# Process uploaded files in parallel and extract text and images
def process_files(uploaded_files, lang="eng"):
    """Extract text from every uploaded file in a thread pool, then OCR images.

    The file kind is taken from the filename extension when it is a supported
    one, and otherwise from the MIME subtype in ``file.type``. The MIME subtype
    alone is not enough: ``text/plain`` yields ``plain`` and DOCX uploads yield
    ``vnd.openxmlformats-officedocument.wordprocessingml.document``, so a
    plain ``split("/")[-1]`` never matches ``txt`` or ``docx``.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``type``
            and, when available, ``name``.
        lang: Language code forwarded to ``extract_text_from_images``.

    Returns:
        The concatenated text of all files (in upload order), a newline, then
        the OCR text of all collected images. Unsupported kinds are logged
        and contribute nothing.
    """
    supported = {"pdf", "docx", "txt", "png", "jpg", "jpeg"}
    mime_aliases = {
        "plain": "txt",
        "vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    }

    def process_file(file):
        """Return ``(text, images)`` extracted from a single upload."""
        filename = getattr(file, "name", "") or ""
        extension = filename.rpartition(".")[2].lower() if "." in filename else ""
        subtype = (getattr(file, "type", "") or "").split("/")[-1].lower()
        # Prefer the extension; fall back to the (normalised) MIME subtype.
        file_type = extension if extension in supported else mime_aliases.get(subtype, subtype)
        if file_type == "pdf":
            pdf_data = extract_pdf_data(file)
            return pdf_data["text"], pdf_data["images"]
        if file_type == "docx":
            return extract_docx_data(file), []
        if file_type == "txt":
            return extract_txt_data(file), []
        if file_type in ("png", "jpg", "jpeg"):
            return "", [Image.open(file)]
        logging.error(f"Unsupported file type: {file_type}")
        return "", []

    text_parts = []
    images = []
    # NOTE(review): the extractors are cache_data-decorated and run here on
    # worker threads — confirm Streamlit caching behaves as intended there.
    with ThreadPoolExecutor() as executor:
        # executor.map yields results in input order.
        for text, img_list in executor.map(process_file, uploaded_files):
            text_parts.append(text)
            images.extend(img_list)
    ocr_text = extract_text_from_images(images, lang)
    return "".join(text_parts) + "\n" + ocr_text
# Generate structured questions with MCQs, Fill-in-the-Blank, Case Studies
def generate_questions(question_type, syllabus_text, num_questions, difficulty, prompt_template):
    """Ask the LLM for exam questions using a caller-supplied prompt template.

    The template is handed to ``ChatPromptTemplate`` with its placeholders
    intact and the values are supplied through ``chain.invoke``. Filling it
    with ``str.format`` first would make ``from_template`` parse any ``{`` or
    ``}`` in the syllabus as template syntax. Template errors (unknown
    placeholder, malformed braces) are handled by the same ``try`` as the
    LLM call.

    Args:
        question_type: Label of the question style to request.
        syllabus_text: Source material the questions are drawn from.
        num_questions: How many questions to request.
        difficulty: Mapping whose keys are exposed to the template as
            placeholders (e.g. ``{Remember}``).
        prompt_template: Template text with ``{num_questions}``,
            ``{question_type}``, ``{syllabus_text}`` and difficulty
            placeholders.

    Returns:
        The model's text output, or ``""`` if building or invoking the chain
        raises (the error is logged).
    """
    variables = dict(difficulty)
    # Explicit arguments win over any same-named difficulty key.
    variables.update(
        num_questions=num_questions,
        question_type=question_type,
        syllabus_text=syllabus_text,
    )
    try:
        chain = ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser()
        return chain.invoke(variables)
    except Exception as e:
        logging.error(f"Error generating questions: {e}")
        return ""
+# Function to search answers online
 def search_answers_online(question):
     search_url = f"https://www.google.com/search?q={question}"
     headers = {"User-Agent": "Mozilla/5.0"}
         logging.error(f"Error fetching online answers: {e}")
         return "No online answer found."
+# Generate answers for questions
 def generate_answers(questions, syllabus_text):
     answers = {}
     for i, question in enumerate(questions.split("\n")):
     st.header("Preview Syllabus Content")
     if "syllabus_text" in st.session_state:
         st.text_area("Extracted Content", st.session_state["syllabus_text"], height=300)
+        if st.session_state.get("images"):
+            for img in st.session_state["images"]:
+                st.image(img, caption="Uploaded Image")
     else:
         st.warning("No content available. Upload files first.")
 # Generate questions
 with tab3:
     st.header("Generate Questions")
+    question_type = st.selectbox("Select Question Type", ["MCQs", "Short Questions", "Long Questions", "Fill-in-the-Blank", "Case Study"])
+    num_questions = st.text_input("Total Number of Questions")
     difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
     difficulty = {level: st.slider(level, 0, 5, 1) for level in difficulty_levels}
+    prompt_template = st.text_area(
+        "Edit Prompt Template",
+        """
+        Generate {num_questions} {question_type} questions from the syllabus content below.
+        Syllabus Content: {syllabus_text}
+        Difficulty Levels:
+        - Remember: {Remember}
+        - Understand: {Understand}
+        - Apply: {Apply}
+        - Analyze: {Analyze}
+        - Evaluate: {Evaluate}
+        - Create: {Create}
+        """,
+        height=200
+    )
+    if num_questions.isdigit() and st.button("Generate Questions"):
+        num_questions = int(num_questions)
+        questions = generate_questions(question_type, st.session_state.get("syllabus_text", ""), num_questions, difficulty, prompt_template)
         st.session_state["questions"] = questions
         st.text_area("Generated Questions", questions, height=300)
+        # Download questions
+        st.download_button("Download Questions", questions, file_name="questions.txt")
 # Generate answers
 with tab4:
     st.header("Generate Answers")
             answers = generate_answers(st.session_state["questions"], st.session_state.get("syllabus_text", ""))
             st.session_state["answers"] = answers
             st.text_area("Generated Answers", answers, height=300)
+            # Download answers
+            st.download_button("Download Answers", answers, file_name="answers.txt")