Spaces:

ahm14
/

Advanced_Exam_Generator

Build error

App Files Files Community

ahm14 committed on Jan 13, 2025

Commit

4973d3f

verified ·

1 Parent(s): 2e7fff8

Create app.py

Browse files

Files changed (1) hide show

app.py +268 -0

app.py ADDED Viewed

	@@ -0,0 +1,268 @@

+import streamlit as st
+from langchain_groq import ChatGroq
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from dotenv import load_dotenv
+import pytesseract
+from PIL import Image
+import pdfplumber
+import docx
+from io import BytesIO
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from streamlit.runtime.caching import cache_data
+# Load environment variables
+load_dotenv()
+# Initialize logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+# Initialize LLM
+llm = ChatGroq(temperature=0.5, groq_api_key="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", model_name="llama3-8b-8192")
+# OCR Configuration for Pytesseract
+pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to your system's path
+# Enhanced OCR with configurable language option
+def extract_text_from_images(images, lang="eng"):
+    ocr_text = ""
+    for image in images:
+        try:
+            ocr_text += pytesseract.image_to_string(image, lang=lang).strip() + "\n"
+        except Exception as e:
+            logging.error(f"Error in OCR: {e}")
+    return ocr_text.strip()
+# Function to extract data from PDF with parallelization
+@cache_data
+def extract_pdf_data(pdf_path):
+    data = {"text": "", "tables": [], "images": []}
+    try:
+        with pdfplumber.open(pdf_path) as pdf:
+            def process_page(page):
+                page_data = {"text": page.extract_text() or "", "tables": [], "images": []}
+                tables = page.extract_tables()
+                for table in tables:
+                    page_data["tables"].append(table)
+                for image in page.images:
+                    base_image = pdf.extract_image(image["object_number"])
+                    image_obj = Image.open(BytesIO(base_image["image"]))
+                    page_data["images"].append(image_obj)
+                return page_data
+            with ThreadPoolExecutor() as executor:
+                pages_data = list(executor.map(process_page, pdf.pages))
+            for page_data in pages_data:
+                data["text"] += page_data["text"]
+                data["tables"].extend(page_data["tables"])
+                data["images"].extend(page_data["images"])
+    except Exception as e:
+        logging.error(f"Error processing PDF: {e}")
+    return data
+# Function to extract text from DOCX files
+@cache_data
+def extract_docx_data(docx_file):
+    try:
+        doc = docx.Document(docx_file)
+        text = "\n".join([para.text.strip() for para in doc.paragraphs if para.text.strip()])
+        return text
+    except Exception as e:
+        logging.error(f"Error extracting DOCX content: {e}")
+        return ""
+# Function to extract text from plain text files
+@cache_data
+def extract_text_file_data(text_file):
+    try:
+        return text_file.read().decode("utf-8").strip()
+    except Exception as e:
+        logging.error(f"Error extracting TXT content: {e}")
+        return ""
+# Function to process extracted content (PDF, DOCX, etc.)
+def process_content(file_data, file_type, lang="eng"):
+    text = ""
+    images = []
+    if file_type == "pdf":
+        pdf_data = extract_pdf_data(file_data)
+        text = process_pdf_content(pdf_data)
+        images = pdf_data["images"]
+    elif file_type == "docx":
+        text = extract_docx_data(file_data)
+    elif file_type == "txt":
+        text = extract_text_file_data(file_data)
+    elif file_type in ["png", "jpg", "jpeg"]:
+        image = Image.open(file_data)
+        images.append(image)
+    ocr_text = extract_text_from_images(images, lang)
+    return text + "\n" + ocr_text
+# Function to process PDF content
+def process_pdf_content(pdf_data):
+    ocr_text = extract_text_from_images(pdf_data["images"])
+    combined_text = pdf_data["text"] + ocr_text
+    table_text = ""
+    for table in pdf_data["tables"]:
+        table_rows = [" | ".join(str(cell) if cell else "" for cell in row) for row in table]
+        table_text += "\n".join(table_rows) + "\n"
+    return (combined_text + "\n" + table_text).strip()
+# Function to generate questions
+def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
+    prompt_template = f"""
+    Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
+    Subject: {subject_name}
+    Instructor: {instructor}
+    Class: {class_name}
+    Institution: {institution}
+    Syllabus Content: {syllabus_context}
+    Difficulty Levels:
+    - Remember: {difficulty_level.get('Remember', 0)}
+    - Understand: {difficulty_level.get('Understand', 0)}
+    - Apply: {difficulty_level.get('Apply', 0)}
+    - Analyze: {difficulty_level.get('Analyze', 0)}
+    - Evaluate: {difficulty_level.get('Evaluate', 0)}
+    - Create: {difficulty_level.get('Create', 0)}
+    Format questions as follows:
+    Q1. ________________
+    Q2. ________________
+    ...
+    """
+    chain = (ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser())
+    try:
+        return chain.invoke({})
+    except Exception as e:
+        logging.error(f"Error generating {question_type} questions: {e}")
+        return ""
+# Function to generate answers
+def generate_answers(questions, syllabus_context):
+    prompt = f"""
+    Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.
+    Syllabus Content: {syllabus_context}
+    Questions:
+    {questions}
+    Format answers as follows:
+    Answer 1: ________________
+    Answer 2: ________________
+    ...
+    """
+    chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
+    try:
+        return chain.invoke({})
+    except Exception as e:
+        logging.error(f"Error generating answers: {e}")
+        return ""
+# Streamlit app
+st.title("Bloom's Taxonomy Based Exam Paper Developer")
+# Sidebar Clear Data Button
+if st.sidebar.button("Clear All Data"):
+    st.session_state.clear()
+    st.success("All data has been cleared. You can now upload a new syllabus.")
+# File Upload with Image Support
+uploaded_file = st.sidebar.file_uploader(
+    "Upload Syllabus (PDF, DOCX, TXT, Image)",
+    type=["pdf", "docx", "txt", "png", "jpg", "jpeg"]
+)
+# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
+subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
+instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
+class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
+institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
+# Language Option for OCR
+ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
+if uploaded_file:
+    # Clear session state when a new file is uploaded
+    if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
+        st.session_state.clear()
+        st.success("Previous data cleared. Processing new file...")
+    st.session_state.uploaded_filename = uploaded_file.name
+    file_type = uploaded_file.type.split("/")[-1]
+    # Validate file type
+    if file_type not in ["pdf", "docx", "txt", "png", "jpg", "jpeg"]:
+        st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
+    else:
+        syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
+        st.session_state.syllabus_text = syllabus_text
+# Preview of Syllabus
+if "syllabus_text" in st.session_state:
+    st.subheader("Syllabus Preview:")
+    st.text_area("Extracted Content", st.session_state.syllabus_text[:1000], height=300)
+else:
+    st.warning("Please upload a syllabus to begin.")
+# Question Type Selection
+question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
+difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
+difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
+num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)
+if st.sidebar.button("Generate Questions"):
+    if "syllabus_text" in st.session_state:
+        with st.spinner(f"Generating {question_type}..."):
+            syllabus_context = st.session_state.syllabus_text
+            st.session_state.generated_questions = generate_questions(
+                question_type,
+                subject_name,
+                instructor_name,
+                class_name,
+                institution_name,
+                syllabus_context,
+                num_questions,
+                difficulty,
+            )
+        st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
+    else:
+        st.error("Please upload a syllabus before generating questions.")
+# Button to generate answers for questions
+if st.sidebar.button("Generate Answers for Questions"):
+    if "generated_questions" in st.session_state:
+        with st.spinner("Generating answers..."):
+            syllabus_context = st.session_state.syllabus_text
+            st.session_state.generated_answers = generate_answers(
+                st.session_state.generated_questions, syllabus_context
+            )
+        st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
+    else:
+        st.error("Generate questions first before generating answers.")
+# Download buttons for questions and answers
+if "generated_questions" in st.session_state:
+    st.sidebar.download_button(
+        label="Download Questions",
+        data=st.session_state.generated_questions,
+        file_name=f"{subject_name}_questions.txt",
+        mime="text/plain",
+    )
+if "generated_answers" in st.session_state:
+    st.sidebar.download_button(
+        label="Download Answers",
+        data=st.session_state.generated_answers,
+        file_name=f"{subject_name}_answers.txt",
+        mime="text/plain",
+    )
+# Enhanced footer for branding and information
+st.markdown("""
+---
+**Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
+Built with ♥ to make exam preparation seamless.
+""")