Spaces:

ahm14
/

Exam_maths

Sleeping

App Files Files Community

Exam_maths / app.py

ahm14

Rename final_test3.py to app.py

6c63823 verified about 1 year ago

raw

history blame contribute delete

12.8 kB

	import streamlit as st
	from langchain_groq import ChatGroq
	from langchain_core.output_parsers import StrOutputParser
	from langchain_core.prompts import ChatPromptTemplate
	from dotenv import load_dotenv
	import pytesseract
	from PIL import Image
	import pdfplumber
	import docx
	from io import BytesIO
	import logging
	from docx import Document
	from fpdf import FPDF
	import cv2
	import numpy as np

	# Load environment variables (also picks up values from a local .env file)
	load_dotenv()

	# Initialize logging
	logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

	# Initialize LLM
	# The API key is a secret and must never live in source control. It is read
	# from the GROQ_API_KEY environment variable, which load_dotenv() above can
	# populate from a .env file.
	# NOTE: any key that was ever committed to this file should be revoked.
	import os  # only needed by this setup section to read the environment

	groq_api_key = os.getenv("GROQ_API_KEY")
	if not groq_api_key:
	    # Fail early with a readable message instead of an opaque client error.
	    st.error("GROQ_API_KEY is not set. Add it to the environment or to a .env file.")
	    st.stop()
	llm = ChatGroq(temperature=0.5, groq_api_key=groq_api_key, model_name="llama3-8b-8192")

	# OCR Configuration for Pytesseract
	# TESSERACT_CMD overrides the binary location; the default is unchanged.
	pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", r"/usr/bin/tesseract")

	# Enhanced OCR with configurable language option and multi-image support
	def extract_text_from_images(images, lang="eng"):
	    """Run Tesseract OCR over every image and return the combined text.

	    Args:
	        images: Iterable of images handed to ``pytesseract.image_to_string``.
	        lang: Tesseract language code used for every OCR call.

	    Returns:
	        The stripped OCR text of each image, joined by newlines, with
	        leading/trailing whitespace removed. An image whose OCR raises is
	        logged and skipped.
	    """
	    recognized = []
	    for picture in images:
	        try:
	            recognized.append(pytesseract.image_to_string(picture, lang=lang).strip())
	        except Exception as e:
	            logging.error(f"Error in OCR: {e}")
	    return "\n".join(recognized).strip()

	# Function to extract formulas using Tesseract OCR
	def extract_formula_using_tesseract(image_path):
	    """OCR an image after binarizing it, to help with formula-like content.

	    Args:
	        image_path: A path or file-like object readable by ``PIL.Image.open``,
	            or an already opened ``PIL.Image.Image``.

	    Returns:
	        The text Tesseract returns for the thresholded image (plain text; no
	        LaTeX conversion is performed).
	    """
	    # Callers in this file pass opened PIL images; Image.open() only accepts
	    # paths and file objects, so an opened image is used as it is.
	    if isinstance(image_path, Image.Image):
	        image = image_path
	    else:
	        image = Image.open(image_path)

	    # Pillow converts any mode (RGB, RGBA, palette, gray) to 8-bit gray.
	    # cv2.COLOR_BGR2GRAY needs a 3/4-channel array and assumes BGR order,
	    # while arrays built from Pillow images are RGB.
	    gray_image = np.array(image.convert("L"))

	    # Inverted binary threshold: pixels at or below 150 become white (255).
	    _, thresh_image = cv2.threshold(gray_image, 150, 255, cv2.THRESH_BINARY_INV)

	    # OEM 3 = default engine, PSM 6 = treat the image as one block of text.
	    custom_oem_psm_config = r'--oem 3 --psm 6'
	    extracted_text = pytesseract.image_to_string(thresh_image, config=custom_oem_psm_config)

	    return extracted_text

	# Function to extract text, images, tables, and formulas from PDF
	def extract_pdf_data(pdf_path):
	    """Collect the text, tables and embedded images of a PDF.

	    Args:
	        pdf_path: Path or file-like object accepted by ``pdfplumber.open``.

	    Returns:
	        A dict with keys ``"text"`` (page texts concatenated in page order),
	        ``"tables"`` (every table returned by ``page.extract_tables()``) and
	        ``"images"`` (PIL images of the embedded pictures). If the PDF cannot
	        be processed the error is logged and whatever was collected so far is
	        returned.
	    """
	    data = {"text": "", "tables": [], "images": []}
	    page_texts = []
	    try:
	        with pdfplumber.open(pdf_path) as pdf:
	            for page in pdf.pages:
	                page_texts.append(page.extract_text() or "")
	                data["tables"].extend(page.extract_tables())
	                for image in page.images:
	                    # pdfplumber describes each image by its bounding box, so
	                    # the picture is obtained by rendering that region. A
	                    # failure here is isolated so that one bad image cannot
	                    # abort text/table extraction for the remaining pages.
	                    try:
	                        bbox = (image["x0"], image["top"], image["x1"], image["bottom"])
	                        rendered = page.crop(bbox).to_image(resolution=150)
	                        data["images"].append(rendered.original)
	                    except Exception as e:
	                        logging.error(f"Error extracting PDF image: {e}")
	    except Exception as e:
	        logging.error(f"Error processing PDF: {e}")
	    # Joined after the try block so text gathered before a failure is kept.
	    data["text"] = "".join(page_texts)
	    return data

	# Function to extract text from DOCX files
	def extract_docx_data(docx_file):
	    """Return the non-empty paragraphs of a DOCX file, one per line.

	    Args:
	        docx_file: Path or file-like object passed to ``docx.Document``.

	    Returns:
	        The stripped text of every non-blank paragraph joined by newlines,
	        or ``""`` if the document cannot be read (the error is logged).
	    """
	    try:
	        document = docx.Document(docx_file)
	        stripped_paragraphs = (paragraph.text.strip() for paragraph in document.paragraphs)
	        return "\n".join(line for line in stripped_paragraphs if line)
	    except Exception as e:
	        logging.error(f"Error extracting DOCX content: {e}")
	        return ""

	# Function to extract text from plain text files
	def extract_text_file_data(text_file):
	    """Read a binary file-like object and return its UTF-8 text, stripped.

	    Args:
	        text_file: Object whose ``read()`` returns bytes.

	    Returns:
	        The decoded content without surrounding whitespace, or ``""`` if
	        reading or decoding fails (the error is logged).
	    """
	    try:
	        raw_bytes = text_file.read()
	        decoded = raw_bytes.decode("utf-8")
	    except Exception as e:
	        logging.error(f"Error extracting TXT content: {e}")
	        return ""
	    return decoded.strip()

	# Function to process extracted content (PDF, DOCX, etc.)
	def process_content(file_data, file_type, lang="eng"):
	    """Extract all usable text from one uploaded file.

	    Args:
	        file_data: Path or file-like object holding the upload.
	        file_type: One of "pdf", "docx", "txt", "png", "jpg", "jpeg"; any
	            other value yields no text.
	        lang: Tesseract language code used for OCR.

	    Returns:
	        Three sections separated by single newlines: the document text, the
	        OCR text of standalone images, and the formula OCR output (one line
	        group per image).
	    """
	    text = ""
	    ocr_text = ""
	    images = []
	    if file_type == "pdf":
	        pdf_data = extract_pdf_data(file_data)
	        # process_pdf_content() already OCRs the embedded images, so they
	        # are not OCR'd a second time below (that duplicated their text).
	        text = process_pdf_content(pdf_data, lang)
	        images = pdf_data["images"]
	    elif file_type == "docx":
	        text = extract_docx_data(file_data)
	    elif file_type == "txt":
	        text = extract_text_file_data(file_data)
	    elif file_type in ["png", "jpg", "jpeg"]:
	        images = [Image.open(file_data)]
	        ocr_text = extract_text_from_images(images, lang)

	    # Formula OCR is best-effort: a failure on one image is logged and must
	    # not discard the text that was already extracted.
	    formula_chunks = []
	    for image in images:
	        try:
	            formula_chunks.append(extract_formula_using_tesseract(image) + "\n")
	        except Exception as e:
	            logging.error(f"Error extracting formulas: {e}")
	    formulas = "".join(formula_chunks)

	    return text + "\n" + ocr_text + "\n" + formulas

	# Function to process PDF content
	def process_pdf_content(pdf_data, lang="eng"):
	    """Merge a PDF's text, image OCR and tables into one string.

	    Args:
	        pdf_data: Dict with keys "text", "tables" and "images", as produced
	            by ``extract_pdf_data``.
	        lang: Tesseract language code used to OCR the embedded images.

	    Returns:
	        The non-empty sections (text, OCR text, tables) joined by newlines
	        and stripped. Table rows are rendered as cells joined with " | ";
	        falsy cells become empty strings.
	    """
	    images = pdf_data["images"]
	    # Skip the OCR call entirely when the PDF has no embedded images.
	    ocr_text = extract_text_from_images(images, lang) if images else ""

	    table_lines = []
	    for table in pdf_data["tables"]:
	        table_lines.extend(
	            " | ".join(str(cell) if cell else "" for cell in row) for row in table
	        )

	    # Joining only non-empty sections keeps the last word of the page text
	    # from being fused with the first OCR word.
	    sections = [pdf_data["text"], ocr_text, "\n".join(table_lines)]
	    return "\n".join(section for section in sections if section).strip()

	# Function to generate questions
	def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
	    """Ask the LLM for exam questions derived from the syllabus.

	    Args:
	        question_type: Kind of question to request (e.g. "Essay").
	        subject_name: Subject shown in the prompt.
	        instructor: Instructor name shown in the prompt.
	        class_name: Class name shown in the prompt.
	        institution: Institution name shown in the prompt.
	        syllabus_context: Extracted syllabus text the questions must be based on.
	        num_questions: How many questions to request.
	        difficulty_level: Mapping of Bloom level name ("Remember", "Understand",
	            "Apply", "Analyze", "Evaluate", "Create") to its weight; missing
	            levels default to 0.

	    Returns:
	        The model output as a string, or ``""`` if the call fails (the error
	        is logged).
	    """
	    # The values are supplied as template variables instead of being
	    # interpolated into the template text. Otherwise any "{" or "}" in the
	    # syllabus (common in mathematical notation) would be parsed as a
	    # placeholder and the invocation would fail on a missing variable.
	    prompt_template = "\n".join([
	        "Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.",
	        "Subject: {subject_name}",
	        "Instructor: {instructor}",
	        "Class: {class_name}",
	        "Institution: {institution}",
	        "Syllabus Content: {syllabus_context}",
	        "Difficulty Levels:",
	        "- Remember: {remember}",
	        "- Understand: {understand}",
	        "- Apply: {apply}",
	        "- Analyze: {analyze}",
	        "- Evaluate: {evaluate}",
	        "- Create: {create}",
	        "Format questions as follows:",
	        "Q1. ________________",
	        "Q2. ________________",
	        "...",
	    ])
	    variables = {
	        "num_questions": num_questions,
	        "question_type": question_type,
	        "subject_name": subject_name,
	        "instructor": instructor,
	        "class_name": class_name,
	        "institution": institution,
	        "syllabus_context": syllabus_context,
	        "remember": difficulty_level.get('Remember', 0),
	        "understand": difficulty_level.get('Understand', 0),
	        "apply": difficulty_level.get('Apply', 0),
	        "analyze": difficulty_level.get('Analyze', 0),
	        "evaluate": difficulty_level.get('Evaluate', 0),
	        "create": difficulty_level.get('Create', 0),
	    }
	    chain = (ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser())
	    try:
	        return chain.invoke(variables)
	    except Exception as e:
	        logging.error(f"Error generating {question_type} questions: {e}")
	        return ""

	# Function to generate answers
	def generate_answers(questions, syllabus_context):
	    """Ask the LLM for answers to previously generated questions.

	    Args:
	        questions: The question text to answer.
	        syllabus_context: Extracted syllabus text the answers must be based on.

	    Returns:
	        The model output as a string, or ``""`` if the call fails (the error
	        is logged).
	    """
	    # Both inputs are free text (one of them is model output), so they are
	    # passed as template variables. Interpolating them into the template
	    # would turn any "{" or "}" they contain into unresolved placeholders.
	    prompt = "\n".join([
	        "Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.",
	        "Syllabus Content: {syllabus_context}",
	        "Questions:",
	        "{questions}",
	        "Format answers as follows:",
	        "Answer 1: ________________",
	        "Answer 2: ________________",
	        "...",
	    ])
	    chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
	    try:
	        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
	    except Exception as e:
	        logging.error(f"Error generating answers: {e}")
	        return ""

	# Function to download as DOCX
	def download_as_docx(content, file_name="output.docx"):
	    """Render text into an in-memory DOCX document.

	    Args:
	        content: Text to export; each newline-separated line becomes one
	            paragraph.
	        file_name: Accepted for interface compatibility; not used by the body.

	    Returns:
	        A ``BytesIO`` positioned at offset 0 that holds the saved document.
	    """
	    document = Document()
	    for paragraph_text in content.split("\n"):
	        document.add_paragraph(paragraph_text)

	    stream = BytesIO()
	    document.save(stream)
	    # Rewind so the consumer reads the document from its first byte.
	    stream.seek(0)
	    return stream

	# Function to download as PDF
	def download_as_pdf(content, file_name="output.pdf"):
	    """Render text into an in-memory PDF document.

	    Args:
	        content: Text to export; each newline-separated line is written as
	            its own (wrapped) paragraph.
	        file_name: Accepted for interface compatibility; not used by the body.

	    Returns:
	        A ``BytesIO`` positioned at offset 0 that holds the PDF bytes.
	    """
	    pdf = FPDF()
	    pdf.add_page()
	    pdf.set_font("Arial", size=12)
	    for line in content.split("\n"):
	        # The built-in Arial font only covers Latin-1. Characters outside it
	        # (typographic quotes, math symbols) are replaced with "?" so they
	        # cannot abort the whole export with an encoding error.
	        printable = line.encode("latin-1", "replace").decode("latin-1")
	        if printable.strip():
	            # Start every paragraph at the left margin; multi_cell wraps long
	            # lines instead of letting them run off the page.
	            pdf.set_x(pdf.l_margin)
	            pdf.multi_cell(0, 10, printable)
	        else:
	            # Preserve blank lines as vertical space.
	            pdf.ln(10)
	    # dest="S" returns the document instead of writing it anywhere; depending
	    # on the installed fpdf variant the result is a str or a bytes-like object.
	    rendered = pdf.output(dest="S")
	    if isinstance(rendered, str):
	        rendered = rendered.encode("latin-1")
	    buffer = BytesIO(bytes(rendered))
	    return buffer

	# Streamlit app with enhanced UI and multi-image upload support
	st.title("Bloom's Taxonomy Based Exam Paper Developer")
	st.markdown("""
	### A powerful tool to generate exam questions and answers using AI, based on syllabus content and Bloom's Taxonomy principles.
	""")

	# Sidebar button that wipes everything kept in the session state
	clear_requested = st.sidebar.button("Clear All Data")
	if clear_requested:
	    st.session_state.clear()
	    st.success("All data has been cleared. You can now upload a new syllabus.")

	# Sidebar uploaders: one syllabus document plus any number of extra images
	uploaded_file = st.sidebar.file_uploader("Upload Syllabus (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"])
	uploaded_images = st.sidebar.file_uploader("Upload Supplementary Images (PNG, JPG, JPEG)", type=["png", "jpg", "jpeg"], accept_multiple_files=True)

	# Sidebar text inputs; each one is labelled "Enter <field>" and pre-filled
	# with the field name itself. Widgets are created in the listed order.
	subject_name, instructor_name, class_name, institution_name = [
	    st.sidebar.text_input(f"Enter {field}", field)
	    for field in ("Subject Name", "Instructor Name", "Class Name", "Institution Name")
	]

	# Tesseract language code used for every OCR call
	ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])

	# Process uploaded file and images
	if uploaded_file or uploaded_images:
	    # uploaded_file is None when only images were uploaded, so its name must
	    # be resolved before it is compared with the remembered one.
	    current_filename = uploaded_file.name if uploaded_file else None

	    # Drop everything derived from the previous syllabus when the file changes
	    if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != current_filename:
	        st.session_state.clear()
	        st.success("Previous data cleared. Processing new file...")

	    st.session_state.uploaded_filename = current_filename

	    # The text is rebuilt from scratch on every run, so image text is never
	    # appended a second time when the script reruns.
	    extracted_parts = []

	    # Process syllabus file
	    if uploaded_file:
	        # The extension identifies the format. The MIME subtype does not:
	        # DOCX reports "vnd.openxmlformats-..." and TXT reports "plain".
	        file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()
	        if file_type in ["pdf", "docx", "txt"]:
	            # Rewind in case the same upload object was already read.
	            uploaded_file.seek(0)
	            extracted_parts.append(process_content(uploaded_file, file_type, lang=ocr_lang))
	        else:
	            st.error("Unsupported file type. Please upload PDF, DOCX, or TXT files.")

	    # Process images
	    if uploaded_images:
	        image_text = extract_text_from_images([Image.open(img) for img in uploaded_images], lang=ocr_lang)
	        extracted_parts.append(image_text)

	    if extracted_parts:
	        st.session_state.syllabus_text = "\n".join(extracted_parts)

	# Preview of Syllabus
	if "syllabus_text" in st.session_state:
	    st.markdown("### Preview of Extracted Syllabus Content")
	    st.text_area("Extracted Syllabus Content", st.session_state.syllabus_text, height=300)

	# Inputs for Question Generation
	if "syllabus_text" in st.session_state:
	    st.markdown("### Generate Questions")
	    question_type = st.selectbox("Select Question Type", ["Multiple Choice", "Short Answer", "Essay"])
	    num_questions = st.number_input("Number of Questions", min_value=1, max_value=50, value=10)
	    # One slider per Bloom level; the values are forwarded to the prompt as-is.
	    difficulty_levels = {
	        "Remember": st.slider("Remember (%)", 0, 100, 20),
	        "Understand": st.slider("Understand (%)", 0, 100, 20),
	        "Apply": st.slider("Apply (%)", 0, 100, 20),
	        "Analyze": st.slider("Analyze (%)", 0, 100, 20),
	        "Evaluate": st.slider("Evaluate (%)", 0, 100, 10),
	        "Create": st.slider("Create (%)", 0, 100, 10),
	    }

	    if st.button("Generate Questions"):
	        with st.spinner("Generating questions..."):
	            questions = generate_questions(
	                question_type,
	                subject_name,
	                instructor_name,
	                class_name,
	                institution_name,
	                st.session_state.syllabus_text,
	                num_questions,
	                difficulty_levels,
	            )
	        # generate_questions() returns "" when the model call failed, so an
	        # empty result must not be stored or reported as a success.
	        if questions:
	            st.session_state.generated_questions = questions
	            # Answers produced for an earlier question set no longer match.
	            if "generated_answers" in st.session_state:
	                del st.session_state["generated_answers"]
	            st.success("Questions generated successfully!")
	        else:
	            st.error("Question generation failed. Please check the logs and try again.")

	# Display Generated Questions
	if "generated_questions" in st.session_state:
	    st.markdown("### Generated Questions")
	    st.text_area("Questions", st.session_state.generated_questions, height=300)

	    if st.button("Generate Answers"):
	        with st.spinner("Generating answers..."):
	            answers = generate_answers(
	                st.session_state.generated_questions,
	                st.session_state.get("syllabus_text", ""),
	            )
	        # generate_answers() returns "" when the model call failed, so an
	        # empty result must not be stored or reported as a success.
	        if answers:
	            st.session_state.generated_answers = answers
	            st.success("Answers generated successfully!")
	        else:
	            st.error("Answer generation failed. Please check the logs and try again.")

	# Display Generated Answers
	if "generated_answers" in st.session_state:
	    st.markdown("### Generated Answers")
	    st.text_area("Answers", st.session_state.generated_answers, height=300)

	# Download Options
	# Session keys that can be exported, each paired with the heading that
	# precedes it in the exported text.
	export_sections = (
	    ("generated_questions", "Generated Questions:\n"),
	    ("generated_answers", "Generated Answers:\n"),
	)
	if any(key in st.session_state for key, _ in export_sections):
	    st.markdown("### Download Options")
	    download_choice = st.radio("Select Download Format", ["DOCX", "PDF", "TXT"])

	    # Every available section is emitted as heading + text + blank line.
	    content_to_download = "".join(
	        heading + st.session_state[key] + "\n\n"
	        for key, heading in export_sections
	        if key in st.session_state
	    )

	    if download_choice == "TXT":
	        st.download_button("Download TXT", content_to_download, file_name="exam_questions_and_answers.txt", mime="text/plain")
	    elif download_choice == "PDF":
	        download_buffer = download_as_pdf(content_to_download)
	        st.download_button("Download PDF", download_buffer, file_name="exam_questions_and_answers.pdf", mime="application/pdf")
	    elif download_choice == "DOCX":
	        download_buffer = download_as_docx(content_to_download)
	        st.download_button("Download DOCX", download_buffer, file_name="exam_questions_and_answers.docx", mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document")