File size: 9,531 Bytes
efbebca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7337e1e
efbebca
 
 
 
 
7337e1e
efbebca
dc17fdf
 
 
 
 
 
 
 
 
 
efbebca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7337e1e
 
 
 
 
 
 
efbebca
 
 
7337e1e
 
 
 
 
efbebca
 
dc17fdf
efbebca
 
 
 
 
 
 
 
 
 
7337e1e
 
 
efbebca
dc17fdf
efbebca
 
 
 
 
 
 
 
 
7337e1e
efbebca
 
7337e1e
efbebca
 
7337e1e
efbebca
 
 
 
7337e1e
 
 
efbebca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7337e1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efbebca
dc17fdf
 
 
efbebca
7337e1e
 
 
 
 
 
 
 
 
 
 
 
dc17fdf
7337e1e
efbebca
 
7337e1e
efbebca
7337e1e
 
 
efbebca
 
 
 
 
 
 
 
7337e1e
efbebca
7337e1e
 
efbebca
 
 
 
 
7337e1e
efbebca
7337e1e
efbebca
 
 
 
 
7337e1e
efbebca
 
 
 
 
 
 
7337e1e
efbebca
 
 
 
 
 
 
7337e1e
 
 
efbebca
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
import logging
import os
from io import BytesIO

import docx
import pdfplumber
import pytesseract
import streamlit as st
from dotenv import load_dotenv
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from PIL import Image

# Load environment variables (e.g. from a local .env file) before reading any settings
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Initialize LLM
# The Groq API key is read from the environment instead of being hard-coded so the
# secret never lands in version control. Any key that was previously committed in
# this file should be treated as leaked and rotated.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("GROQ_API_KEY is not set. Add it to your environment or .env file and restart the app.")
    st.stop()

llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")

# OCR Configuration for Pytesseract
# Defaults to the usual Linux location; set TESSERACT_CMD to override on other systems.
pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")

# Enhanced OCR with configurable language option
def extract_text_from_images(images, lang="eng"):
    """Run Tesseract OCR over each image and return the combined text.

    Args:
        images: Iterable of images accepted by ``pytesseract.image_to_string``.
        lang: Tesseract language code forwarded to the OCR call.

    Returns:
        The stripped text of every image that was processed successfully,
        one image per line, with leading/trailing whitespace removed.
        Images that raise during OCR are logged and skipped.
    """
    recognized = []
    for picture in images:
        try:
            raw_text = pytesseract.image_to_string(picture, lang=lang)
            recognized.append(raw_text.strip())
        except Exception as e:
            logging.error(f"Error in OCR: {e}")
    return "\n".join(recognized).strip()

# Function to extract text, tables, and embedded images from a PDF
def extract_pdf_data(pdf_path):
    """Extract text, tables and embedded images from a PDF.

    Args:
        pdf_path: Path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        A dict with three keys:
        - "text": the text of every page, pages separated by a newline.
        - "tables": every table returned by ``page.extract_tables()``.
        - "images": PIL images rendered from the regions of embedded images.
        If the PDF cannot be opened or parsed, the error is logged and
        whatever was collected up to that point is returned.
    """
    data = {"text": "", "tables": [], "images": []}
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
                data["tables"].extend(page.extract_tables())
                for image in page.images:
                    rendered = _render_pdf_image(page, image)
                    if rendered is not None:
                        data["images"].append(rendered)
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    # Join with newlines so the last word of one page does not fuse with the
    # first word of the next.
    data["text"] = "\n".join(text for text in page_texts if text)
    return data


def _render_pdf_image(page, image, resolution=300):
    """Render the page region covered by an embedded image as a PIL image.

    pdfplumber describes embedded images by their bounding box, so the region
    is cropped from the page and rasterised. Returns None (after logging) if
    the region cannot be rendered, so one bad image does not abort the whole
    document.
    """
    try:
        page_x0, page_top, page_x1, page_bottom = page.bbox
        # Clamp to the page: crop() rejects boxes that extend past the page edge.
        bbox = (
            max(image["x0"], page_x0),
            max(image["top"], page_top),
            min(image["x1"], page_x1),
            min(image["bottom"], page_bottom),
        )
        return page.crop(bbox).to_image(resolution=resolution).original
    except Exception as e:
        logging.error(f"Error extracting image from PDF page: {e}")
        return None

# Function to extract text from DOCX files
def extract_docx_data(docx_file):
    """Return the non-empty paragraphs of a DOCX file, one per line.

    Args:
        docx_file: Path or file-like object accepted by ``docx.Document``.

    Returns:
        The stripped paragraph texts joined with newlines, or an empty
        string if the document cannot be read (the error is logged).
    """
    try:
        document = docx.Document(docx_file)
        stripped_paragraphs = (paragraph.text.strip() for paragraph in document.paragraphs)
        return "\n".join(line for line in stripped_paragraphs if line)
    except Exception as e:
        logging.error(f"Error extracting DOCX content: {e}")
        return ""

# Function to extract text from plain text files
def extract_text_file_data(text_file):
    """Read a binary file-like object and return its UTF-8 text, stripped.

    Args:
        text_file: Object whose ``read()`` returns bytes.

    Returns:
        The decoded content without leading/trailing whitespace, or an empty
        string if reading or decoding fails (the error is logged).
    """
    try:
        payload = text_file.read()
        decoded = payload.decode("utf-8")
        result = decoded.strip()
    except Exception as e:
        logging.error(f"Error extracting TXT content: {e}")
        result = ""
    return result

# Function to process extracted content (PDF, DOCX, etc.)
def process_content(file_data, file_type, lang="eng"):
    """Extract all text from an uploaded file, OCR-ing any images it contains.

    Args:
        file_data: Path or file-like object holding the uploaded document.
        file_type: One of "pdf", "docx", "txt", "png", "jpg" or "jpeg".
        lang: Tesseract language code used for OCR.

    Returns:
        The extracted text followed by the OCR text, separated by a newline.
        Empty parts are omitted, so the result is "" when nothing could be
        extracted.
    """
    text = ""
    images = []
    if file_type == "pdf":
        pdf_data = extract_pdf_data(file_data)
        images = pdf_data["images"]
        # process_pdf_content OCRs whatever images it receives. Give it an
        # image-free copy so embedded images are OCR'd exactly once, below,
        # using the language the caller asked for.
        text = process_pdf_content({**pdf_data, "images": []})
    elif file_type == "docx":
        text = extract_docx_data(file_data)
    elif file_type == "txt":
        text = extract_text_file_data(file_data)
    elif file_type in ("png", "jpg", "jpeg"):
        try:
            images.append(Image.open(file_data))
        except Exception as e:
            # A corrupt upload should not crash the app; match the other
            # extractors and fall back to empty content.
            logging.error(f"Error opening image: {e}")
    else:
        logging.warning(f"Unsupported file type: {file_type}")

    ocr_text = extract_text_from_images(images, lang)
    return "\n".join(part for part in (text, ocr_text) if part)

# Function to process PDF content
def process_pdf_content(pdf_data, lang="eng"):
    """Flatten extracted PDF data into a single text string.

    Args:
        pdf_data: Dict with "text" (str), "tables" (list of tables, each a
            list of rows of cells) and "images" (list of images to OCR).
        lang: Tesseract language code used when OCR-ing the images.

    Returns:
        Page text, OCR text and table rows (cells joined by " | "), each
        section on its own line(s); empty sections are skipped.
    """
    ocr_text = extract_text_from_images(pdf_data["images"], lang)

    table_lines = [
        " | ".join(str(cell) if cell else "" for cell in row)
        for table in pdf_data["tables"]
        for row in table
    ]

    # Separate the sections with newlines so the page text does not run
    # straight into the OCR output.
    sections = (pdf_data["text"].strip(), ocr_text, "\n".join(table_lines))
    return "\n".join(section for section in sections if section).strip()

# Function to generate questions
def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
    """Ask the LLM for exam questions derived from the syllabus.

    Args:
        question_type: Kind of question to generate (e.g. "MCQs").
        subject_name: Subject shown in the prompt.
        instructor: Instructor name shown in the prompt.
        class_name: Class name shown in the prompt.
        institution: Institution name shown in the prompt.
        syllabus_context: Extracted syllabus text the questions must be based on.
        num_questions: Number of questions to request.
        difficulty_level: Mapping of Bloom's taxonomy level name to a weight;
            missing levels default to 0.

    Returns:
        The model's response as a string, or "" if the call fails (the error
        is logged).
    """
    # Values are passed as template variables rather than interpolated with an
    # f-string: otherwise any "{" or "}" in the syllabus or user input would be
    # parsed as a template placeholder and the call would fail.
    prompt_template = """
    Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.

    Subject: {subject_name}
    Instructor: {instructor}
    Class: {class_name}
    Institution: {institution}
    Syllabus Content: {syllabus_context}

    Difficulty Levels:
    - Remember: {remember}
    - Understand: {understand}
    - Apply: {apply}
    - Analyze: {analyze}
    - Evaluate: {evaluate}
    - Create: {create}

    Format questions as follows:
    Q1. ________________

    Q2. ________________

    ...
    """
    prompt_values = {
        "num_questions": num_questions,
        "question_type": question_type,
        "subject_name": subject_name,
        "instructor": instructor,
        "class_name": class_name,
        "institution": institution,
        "syllabus_context": syllabus_context,
    }
    for level in ("Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"):
        prompt_values[level.lower()] = difficulty_level.get(level, 0)

    chain = (ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser())
    try:
        return chain.invoke(prompt_values)
    except Exception as e:
        logging.error(f"Error generating {question_type} questions: {e}")
        return ""

# Function to generate answers
def generate_answers(questions, syllabus_context):
    """Ask the LLM to answer the given questions using only the syllabus.

    Args:
        questions: Text of the previously generated questions.
        syllabus_context: Extracted syllabus text the answers must be based on.

    Returns:
        The model's response as a string, or "" if the call fails (the error
        is logged).
    """
    # Questions and syllabus are supplied as template variables, not baked in
    # with an f-string, so braces in either text are treated as plain text
    # instead of template placeholders.
    prompt = """
    Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.

    Syllabus Content: {syllabus_context}

    Questions:
    {questions}

    Format answers as follows:
    Answer 1: ________________
    Answer 2: ________________
    ...
    """
    chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
    try:
        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
    except Exception as e:
        logging.error(f"Error generating answers: {e}")
        return ""

# Streamlit app
st.title("Bloom's Taxonomy Based Exam Paper Developer")

# Sidebar Clear Data Button: wipes everything stored in the session
if st.sidebar.button("Clear All Data"):
    st.session_state.clear()
    st.success("All data has been cleared. You can now upload a new syllabus.")

# Syllabus Upload
# "jpeg" is accepted alongside "jpg" because process_content handles both.
uploaded_file = st.sidebar.file_uploader(
    "Upload Syllabus (PDF, DOCX, TXT, Image)",
    type=["pdf", "docx", "txt", "png", "jpg", "jpeg"]
)

# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
# (these values are inserted into the question-generation prompt)
subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")

# Language Option for OCR (Tesseract language codes)
ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])

if uploaded_file:
    # Clear session state when a different file is uploaded
    if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
        st.session_state.clear()
        st.success("Previous data cleared. Processing new file...")

    st.session_state.uploaded_filename = uploaded_file.name

    # Derive the type from the file extension. The MIME subtype is not usable
    # here: TXT uploads report "plain", JPG reports "jpeg" and DOCX reports a
    # long vendor string, none of which match the names process_content expects.
    if "." in uploaded_file.name:
        file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()
    else:
        file_type = ""

    # Validate file type
    if file_type not in ["pdf", "docx", "txt", "png", "jpg", "jpeg"]:
        st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
    else:
        # Streamlit re-runs this script on every interaction; only re-extract
        # (and re-OCR) when the file or the OCR language actually changed.
        source_key = (uploaded_file.name, uploaded_file.size, ocr_lang)
        already_processed = (
            "syllabus_text" in st.session_state
            and st.session_state.get("syllabus_source") == source_key
        )
        if not already_processed:
            # Rewind in case the buffer was consumed by an earlier run.
            uploaded_file.seek(0)
            st.session_state.syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
            st.session_state.syllabus_source = source_key

# Syllabus preview: show the first 1000 characters of the extracted text,
# or prompt for an upload when nothing has been extracted yet.
if "syllabus_text" not in st.session_state:
    st.warning("Please upload a syllabus to begin.")
else:
    st.subheader("Syllabus Preview:")
    preview_excerpt = st.session_state.syllabus_text[:1000]
    st.text_area("Extracted Content", preview_excerpt, height=300)

# Question Type Selection
question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
# One slider per Bloom's taxonomy level; the values are passed to the prompt.
difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)

if st.sidebar.button("Generate Questions"):
    if "syllabus_text" in st.session_state:
        with st.spinner(f"Generating {question_type}..."):
            syllabus_context = st.session_state.syllabus_text
            questions = generate_questions(question_type, subject_name, instructor_name, class_name, institution_name, syllabus_context, num_questions, difficulty)
        if questions:
            st.session_state.generated_questions = questions
            st.session_state.generated_question_type = question_type
            # Answers from an earlier question set no longer apply.
            st.session_state.pop("generated_answers", None)
        else:
            # generate_questions returns "" on failure; do not store that as a result.
            st.error("Question generation failed. Please try again.")
    else:
        st.error("Please upload a syllabus before generating questions.")

if st.sidebar.button("Generate Answers for Questions"):
    if "generated_questions" in st.session_state and "syllabus_text" in st.session_state:
        with st.spinner("Generating answers..."):
            syllabus_context = st.session_state.syllabus_text
            answers = generate_answers(st.session_state.generated_questions, syllabus_context)
        if answers:
            st.session_state.generated_answers = answers
        else:
            st.error("Answer generation failed. Please try again.")
    else:
        st.error("Generate questions first before generating answers.")

# Render from session state (not inside the button branches) so the results
# stay visible on later reruns, e.g. after clicking the other button.
if st.session_state.get("generated_questions"):
    shown_type = st.session_state.get("generated_question_type", question_type)
    st.text_area(f"Generated {shown_type}", value=st.session_state.generated_questions, height=400)

if st.session_state.get("generated_answers"):
    st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)

# Offer a plain-text download for each generated artifact held in the session.
for artifact in ("questions", "answers"):
    state_key = f"generated_{artifact}"
    if state_key in st.session_state:
        st.sidebar.download_button(
            label=f"Download {artifact.capitalize()}",
            data=st.session_state[state_key],
            file_name=f"{subject_name}_{artifact}.txt",
            mime="text/plain",
        )

# Footer. The credits name Groq (the LLM backend this file actually uses)
# rather than Pinecone, which is not imported or used anywhere in this file.
st.markdown(""" 
--- 
**Advanced Test Paper Generator** - powered by LangChain, Groq, and Streamlit. 
""")