Spaces:

ahm14
/

Exam_Developer

Sleeping

File size: 9,531 Bytes

efbebca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7337e1e
efbebca
 
 
 
 
7337e1e
efbebca
dc17fdf
 
 
 
 
 
 
 
 
 
efbebca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7337e1e
 
 
 
 
 
 
efbebca
 
 
7337e1e
 
 
 
 
efbebca
 
dc17fdf
efbebca
 
 
 
 
 
 
 
 
 
7337e1e
 
 
efbebca
dc17fdf
efbebca
 
 
 
 
 
 
 
 
7337e1e
efbebca
 
7337e1e
efbebca
 
7337e1e
efbebca
 
 
 
7337e1e
 
 
efbebca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7337e1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efbebca
dc17fdf
 
 
efbebca
7337e1e
 
 
 
 
 
 
 
 
 
 
 
dc17fdf
7337e1e
efbebca
 
7337e1e
efbebca
7337e1e
 
 
efbebca
 
 
 
 
 
 
 
7337e1e
efbebca
7337e1e
 
efbebca
 
 
 
 
7337e1e
efbebca
7337e1e
efbebca
 
 
 
 
7337e1e
efbebca
 
 
 
 
 
 
7337e1e
efbebca
 
 
 
 
 
 
7337e1e
 
 
efbebca

import logging
import os
from io import BytesIO

import docx
import pdfplumber
import pytesseract
import streamlit as st
from dotenv import load_dotenv
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from PIL import Image

# Load environment variables (GROQ_API_KEY, GROQ_MODEL, TESSERACT_CMD) from a local .env file, if any
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Initialize LLM
# SECURITY: the API key is read from the environment and must never be written
# into this file. Any key that was ever committed to source control has to be
# treated as leaked and revoked.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail early with an actionable message instead of an opaque client error later.
    st.error("GROQ_API_KEY is not set. Add it to the environment or to a .env file and restart the app.")
    st.stop()

llm = ChatGroq(
    temperature=0.5,
    groq_api_key=GROQ_API_KEY,
    # The model can be swapped without a code change; the default is unchanged.
    model_name=os.getenv("GROQ_MODEL", "llama3-8b-8192"),
)

# OCR Configuration for Pytesseract
# TESSERACT_CMD overrides the binary location on systems where it is not /usr/bin/tesseract.
pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", r"/usr/bin/tesseract")

# Enhanced OCR with configurable language option
def extract_text_from_images(images, lang="eng"):
    """Run OCR over every image and return the combined text.

    Args:
        images: Iterable of images handed one by one to
            ``pytesseract.image_to_string``.
        lang: Language code forwarded to the OCR call.

    Returns:
        The stripped text of each image, one result per line, with the whole
        string stripped again. An image whose OCR raises is logged and
        contributes nothing.
    """
    chunks = []
    for picture in images:
        try:
            recognized = pytesseract.image_to_string(picture, lang=lang)
            chunks.append(recognized.strip() + "\n")
        except Exception as e:
            logging.error(f"Error in OCR: {e}")
    return "".join(chunks).strip()

# Function to extract text, images, tables, and formulas from PDF
def extract_pdf_data(pdf_path):
    """Collect the text, tables and embedded images of a PDF.

    Args:
        pdf_path: Path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        A dict with three keys:
        ``"text"``   - the non-empty page texts joined by newlines,
        ``"tables"`` - every table returned by ``page.extract_tables()``,
        ``"images"`` - PIL images for the embedded images that could be opened.

        Extraction is best-effort: if the document cannot be processed the
        error is logged and whatever was collected so far is returned.
    """
    data = {"text": "", "tables": [], "images": []}
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
                data["tables"].extend(page.extract_tables())
                for image in page.images:
                    # Each image is handled on its own so that one unreadable
                    # image cannot abort the text extraction of later pages.
                    try:
                        # pdfplumber describes an image as a dict whose
                        # "stream" entry holds the underlying PDF stream.
                        raw_bytes = image["stream"].get_data()
                        data["images"].append(Image.open(BytesIO(raw_bytes)))
                    except Exception as e:
                        # Typically a stream PIL cannot open as an image file.
                        logging.warning(f"Skipping unreadable PDF image: {e}")
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    # Separate pages with a newline so words at page boundaries do not fuse.
    data["text"] = "\n".join(text for text in page_texts if text)
    return data

# Function to extract text from DOCX files
def extract_docx_data(docx_file):
    """Return the non-blank paragraphs of a DOCX document, one per line.

    Args:
        docx_file: Path or file-like object passed to ``docx.Document``.

    Returns:
        The stripped paragraph texts joined by newlines, or ``""`` when the
        document cannot be read (the error is logged).
    """
    try:
        document = docx.Document(docx_file)
        kept_lines = []
        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            if stripped:
                kept_lines.append(stripped)
    except Exception as e:
        logging.error(f"Error extracting DOCX content: {e}")
        return ""
    return "\n".join(kept_lines)

# Function to extract text from plain text files
def extract_text_file_data(text_file):
    """Read a plain-text upload and return its stripped content.

    Args:
        text_file: File-like object with a ``read()`` method. Binary handles
            are decoded as UTF-8; text-mode handles are used as-is.

    Returns:
        The content without surrounding whitespace and without a leading
        byte-order mark, or ``""`` when reading or decoding fails (the error
        is logged).
    """
    try:
        raw = text_file.read()
        if isinstance(raw, str):
            # Already decoded; only a stray BOM character may remain.
            return raw.lstrip("\ufeff").strip()
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM, which
        # str.strip() would otherwise leave in place.
        return raw.decode("utf-8-sig").strip()
    except Exception as e:
        logging.error(f"Error extracting TXT content: {e}")
        return ""

# Function to process extracted content (PDF, DOCX, etc.)
def process_content(file_data, file_type, lang="eng"):
    """Extract all usable text from an uploaded file.

    Args:
        file_data: Path or file-like object holding the upload.
        file_type: Short type name: "pdf", "docx", "txt", "png", "jpg" or
            "jpeg". Case and a leading dot are ignored.
        lang: OCR language code used for every image in the upload.

    Returns:
        The extracted text followed by the OCR text of any images, separated
        by a newline; empty parts are omitted. Unsupported types and
        unreadable images are logged and yield ``""``.
    """
    kind = (file_type or "").lower().lstrip(".")
    text = ""
    images = []
    if kind == "pdf":
        pdf_data = extract_pdf_data(file_data)
        images = pdf_data["images"]
        # Hand over a copy without images: OCR runs exactly once, below, with
        # the caller's language, instead of a second time in the default one.
        text = process_pdf_content({**pdf_data, "images": []})
    elif kind == "docx":
        text = extract_docx_data(file_data)
    elif kind == "txt":
        text = extract_text_file_data(file_data)
    elif kind in ("png", "jpg", "jpeg"):
        try:
            images.append(Image.open(file_data))
        except OSError as e:
            # Same best-effort policy as the other extractors: log, don't crash.
            logging.error(f"Error opening image: {e}")
    else:
        logging.warning(f"Unsupported file type: {file_type}")

    ocr_text = extract_text_from_images(images, lang)
    return "\n".join(part for part in (text, ocr_text) if part)

# Function to process PDF content
def process_pdf_content(pdf_data, lang="eng"):
    """Flatten extracted PDF data into a single text block.

    Args:
        pdf_data: Dict with the keys ``"text"`` (str), ``"tables"`` (list of
            tables, each a list of rows of cells) and ``"images"`` (images to
            OCR).
        lang: OCR language code for the embedded images.

    Returns:
        Page text, OCR text and table rows, in that order, separated by
        newlines. Table cells are joined with ``" | "``, with empty cells
        rendered as empty strings. Empty sections are left out.
    """
    ocr_text = extract_text_from_images(pdf_data["images"], lang)

    table_lines = []
    for table in pdf_data["tables"]:
        for row in table:
            table_lines.append(" | ".join(str(cell) if cell else "" for cell in row))

    # Join with explicit separators so page text and OCR text do not run together.
    sections = (pdf_data["text"].strip(), ocr_text, "\n".join(table_lines))
    return "\n".join(section for section in sections if section)

# Function to generate questions
def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
    """Ask the LLM for exam questions derived from the syllabus.

    Args:
        question_type: Kind of question to request (e.g. "MCQs").
        subject_name: Subject shown in the prompt.
        instructor: Instructor name shown in the prompt.
        class_name: Class name shown in the prompt.
        institution: Institution name shown in the prompt.
        syllabus_context: Syllabus text the questions must be based on.
        num_questions: Number of questions to request.
        difficulty_level: Mapping from Bloom level name ("Remember",
            "Understand", "Apply", "Analyze", "Evaluate", "Create") to its
            weight; missing levels count as 0.

    Returns:
        The generated questions as text, or ``""`` if generation fails (the
        error is logged).
    """
    difficulty_level = difficulty_level or {}
    # The values are passed as template variables rather than interpolated
    # into the template text: curly braces inside the syllabus or any other
    # user input would otherwise be parsed as placeholders.
    prompt_template = """
    Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.

    Subject: {subject_name}
    Instructor: {instructor}
    Class: {class_name}
    Institution: {institution}
    Syllabus Content: {syllabus_context}

    Difficulty Levels:
    - Remember: {remember}
    - Understand: {understand}
    - Apply: {apply}
    - Analyze: {analyze}
    - Evaluate: {evaluate}
    - Create: {create}

    Format questions as follows:
    Q1. ________________

    Q2. ________________

    ...
    """
    variables = {
        "num_questions": num_questions,
        "question_type": question_type,
        "subject_name": subject_name,
        "instructor": instructor,
        "class_name": class_name,
        "institution": institution,
        "syllabus_context": syllabus_context,
        "remember": difficulty_level.get("Remember", 0),
        "understand": difficulty_level.get("Understand", 0),
        "apply": difficulty_level.get("Apply", 0),
        "analyze": difficulty_level.get("Analyze", 0),
        "evaluate": difficulty_level.get("Evaluate", 0),
        "create": difficulty_level.get("Create", 0),
    }
    try:
        # Chain construction sits inside the try so that every failure takes
        # the same logged, empty-result path.
        chain = ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser()
        return chain.invoke(variables)
    except Exception as e:
        logging.error(f"Error generating {question_type} questions: {e}")
        return ""

# Function to generate answers
def generate_answers(questions, syllabus_context):
    """Ask the LLM to answer the given questions from the syllabus only.

    Args:
        questions: Text of the questions to answer.
        syllabus_context: Syllabus text the answers must be based on.

    Returns:
        The generated answers as text, or ``""`` if generation fails (the
        error is logged).
    """
    # Both inputs are supplied as template variables: the questions come from
    # the LLM and the syllabus from an upload, so either may contain curly
    # braces that must not be parsed as placeholders.
    prompt = """
    Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.

    Syllabus Content: {syllabus_context}

    Questions:
    {questions}

    Format answers as follows:
    Answer 1: ________________
    Answer 2: ________________
    ...
    """
    try:
        chain = ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser()
        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
    except Exception as e:
        logging.error(f"Error generating answers: {e}")
        return ""

# Streamlit app
st.title("Bloom's Taxonomy Based Exam Paper Developer")

# Sidebar Clear Data Button: wipes everything kept in the session
# (extracted syllabus, generated questions and answers).
if st.sidebar.button("Clear All Data"):
    st.session_state.clear()
    st.success("All data has been cleared. You can now upload a new syllabus.")

# Syllabus Upload with Automatic Clearing
# "jpeg" is listed next to "jpg" so both spellings of the extension are
# accepted, matching the image types process_content() handles.
uploaded_file = st.sidebar.file_uploader(
    "Upload Syllabus (PDF, DOCX, TXT, Image)",
    type=["pdf", "docx", "txt", "png", "jpg", "jpeg"]
)

# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
# (these values are inserted into the question prompt and the download file names)
subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")

# Language Option for OCR
ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])

if uploaded_file:
    # Clear session state when a new file is uploaded
    if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
        st.session_state.clear()
        st.success("Previous data cleared. Processing new file...")

    st.session_state.uploaded_filename = uploaded_file.name
    # Take the type from the filename extension. The MIME subtype is not a
    # usable key: TXT reports "plain", JPEG reports "jpeg" and DOCX reports a
    # long vendor string, so none of them matched the allow-list below.
    file_type = uploaded_file.name.rpartition(".")[-1].lower()

    # Validate file type
    if file_type not in ("pdf", "docx", "txt", "png", "jpg", "jpeg"):
        st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
    else:
        syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
        st.session_state.syllabus_text = syllabus_text

# Preview of Syllabus
# Shows the first 1000 characters of the extracted text, or a hint to upload
# a file when nothing has been extracted in this session yet.
if "syllabus_text" not in st.session_state:
    st.warning("Please upload a syllabus to begin.")
else:
    st.subheader("Syllabus Preview:")
    preview_excerpt = st.session_state.syllabus_text[:1000]
    st.text_area("Extracted Content", preview_excerpt, height=300)

# Question Type Selection
question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
# One slider per Bloom's taxonomy level; the values are passed to the prompt as weights.
difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)

if st.sidebar.button("Generate Questions"):
    if "syllabus_text" in st.session_state:
        with st.spinner(f"Generating {question_type}..."):
            syllabus_context = st.session_state.syllabus_text
            new_questions = generate_questions(question_type, subject_name, instructor_name, class_name, institution_name, syllabus_context, num_questions, difficulty)
        if new_questions:
            st.session_state.generated_questions = new_questions
            st.session_state.generated_question_type = question_type
            # Answers written for an earlier question set no longer match.
            if "generated_answers" in st.session_state:
                del st.session_state["generated_answers"]
        else:
            # generate_questions() returns "" on failure; say so instead of
            # showing an empty result box.
            st.error("Could not generate questions. Please check the logs and try again.")
    else:
        st.error("Please upload a syllabus before generating questions.")

if st.sidebar.button("Generate Answers for Questions"):
    if st.session_state.get("generated_questions"):
        with st.spinner("Generating answers..."):
            syllabus_context = st.session_state.get("syllabus_text", "")
            new_answers = generate_answers(st.session_state.generated_questions, syllabus_context)
        if new_answers:
            st.session_state.generated_answers = new_answers
        else:
            st.error("Could not generate answers. Please check the logs and try again.")
    else:
        st.error("Generate questions first before generating answers.")

# Render results from session state rather than inside the button branches:
# a button is only True for the single rerun that follows the click, so output
# drawn there disappears as soon as any other widget is touched.
if st.session_state.get("generated_questions"):
    shown_type = st.session_state.get("generated_question_type", question_type)
    st.text_area(f"Generated {shown_type}", value=st.session_state.generated_questions, height=400)

if st.session_state.get("generated_answers"):
    st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)

# Sidebar download buttons: one per generated artifact present in the session,
# questions first, then answers.
download_specs = (
    ("generated_questions", "Download Questions", f"{subject_name}_questions.txt"),
    ("generated_answers", "Download Answers", f"{subject_name}_answers.txt"),
)
for state_key, button_label, target_name in download_specs:
    if state_key in st.session_state:
        st.sidebar.download_button(
            label=button_label,
            data=st.session_state[state_key],
            file_name=target_name,
            mime="text/plain",
        )

# Footer rendered at the bottom of the page: a horizontal rule and a credit line.
# NOTE(review): the credit line names Pinecone, but nothing in the visible part
# of this file imports or calls it - confirm the attribution is still accurate.
st.markdown(""" 
--- 
**Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit. 
""")