Spaces:
Build error
Build error
| import streamlit as st | |
| from langchain_groq import ChatGroq | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from dotenv import load_dotenv | |
| import pytesseract | |
| from PIL import Image | |
| import pdfplumber | |
| import docx | |
| from io import BytesIO | |
| import logging | |
| from docx import Document | |
| from fpdf import FPDF | |
| import cv2 | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import re | |
import os

# Load environment variables (including GROQ_API_KEY) from a local .env file, if one exists.
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Initialize LLM.
# The API key is read from the environment instead of being hard-coded in source.
# NOTE(review): a key was previously committed in this file; treat it as leaked and rotate it.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Define it in the environment or in a .env file."
    )
llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")

# OCR configuration for Pytesseract. TESSERACT_CMD overrides the default binary location.
pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
def extract_text_from_images(images, lang="eng"):
    """Run OCR over several images and collect their text and ``$...$`` formulas.

    Args:
        images: Iterable of images accepted by ``pytesseract.image_to_string``.
            ``None`` is treated as an empty iterable.
        lang: Tesseract language code used for both OCR passes.

    Returns:
        A ``(text, formulas)`` tuple. ``text`` holds the stripped OCR text of
        every image, one image per line. ``formulas`` lists every
        ``$...$`` span found in the block-mode (``--psm 6``) OCR pass.

    An image that fails OCR is logged and skipped; it never aborts the batch.
    """
    text_parts = []
    formulas = []
    for image in images or []:
        try:
            # Plain text pass.
            text_parts.append(pytesseract.image_to_string(image, lang=lang).strip())
            # Formula pass (simple heuristic for LaTeX-style formulas). It uses the
            # same language as the text pass so non-English input is read consistently.
            block_text = pytesseract.image_to_string(image, lang=lang, config="--psm 6")
            formulas.extend(re.findall(r"\$.*?\$", block_text))
        except Exception as e:
            logging.error(f"Error in OCR: {e}")
    return "\n".join(text_parts).strip(), formulas
def extract_formula_using_tesseract(image_path):
    """OCR an image after grayscale conversion and inverse binary thresholding.

    Args:
        image_path: A file path, a file-like object, or an already opened
            ``PIL.Image.Image``. Paths and file objects are opened with
            ``Image.open``; PIL images are used as they are.

    Returns:
        The text Tesseract recognised, using ``--oem 3 --psm 6``.
    """
    if isinstance(image_path, Image.Image):
        image = image_path
    else:
        image = Image.open(image_path)
    # Let PIL do the grayscale conversion: it copes with every image mode
    # (RGB, RGBA, palette, already-gray), and PIL data is RGB-ordered, so
    # OpenCV's BGR conversion would weight the channels wrongly.
    gray_image = np.array(image.convert("L"))
    _, thresh_image = cv2.threshold(gray_image, 150, 255, cv2.THRESH_BINARY_INV)
    custom_oem_psm_config = r'--oem 3 --psm 6'  # PSM 6 is used for block text
    return pytesseract.image_to_string(thresh_image, config=custom_oem_psm_config)
def _render_pdf_image(page, image, resolution=150):
    """Rasterise the page region occupied by an embedded image; return a PIL image."""
    page_x0, page_top, page_x1, page_bottom = page.bbox
    # Clamp to the page: pdfplumber rejects crop boxes that leave the page area.
    bbox = (
        max(image["x0"], page_x0),
        max(image["top"], page_top),
        min(image["x1"], page_x1),
        min(image["bottom"], page_bottom),
    )
    return page.crop(bbox).to_image(resolution=resolution).original


def extract_pdf_data(pdf_path):
    """Extract text, tables, images and OCR'd formula text from a PDF.

    Args:
        pdf_path: Path or binary file-like object passed to ``pdfplumber.open``.

    Returns:
        A dict with keys ``"text"`` (page texts joined by newlines),
        ``"tables"`` (tables as returned by ``page.extract_tables()``),
        ``"images"`` (PIL images rendered from each embedded image's area) and
        ``"formulas"`` (non-blank OCR text of those images).

    Errors are logged, not raised. A failure on one image skips only that
    image; a failure opening or reading the PDF returns what was collected so far.
    """
    data = {"text": "", "tables": [], "images": [], "formulas": []}
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
                data["tables"].extend(page.extract_tables())
                for image in page.images:
                    try:
                        image_obj = _render_pdf_image(page, image)
                    except Exception as e:
                        logging.error(f"Error extracting image from PDF page: {e}")
                        continue
                    data["images"].append(image_obj)
                    try:
                        # Extract formulas from images (OCR). The image is passed as an
                        # in-memory PNG stream because the helper opens its argument
                        # with Image.open.
                        png_stream = BytesIO()
                        image_obj.save(png_stream, format="PNG")
                        png_stream.seek(0)
                        extracted_text = extract_formula_using_tesseract(png_stream)
                    except Exception as e:
                        logging.error(f"Error in formula OCR for PDF image: {e}")
                        continue
                    if extracted_text and extracted_text.strip():
                        data["formulas"].append(extracted_text)
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    # Keep a newline between pages so words at page boundaries do not run together.
    data["text"] = "\n".join(page_texts)
    return data
def extract_docx_data(docx_file):
    """Read a DOCX document and return its text plus formula-like paragraphs.

    Returns:
        ``(text, formulas)``: ``text`` is every non-empty paragraph, stripped
        and joined by newlines; ``formulas`` is the stripped text of each
        paragraph containing a ``$`` (simple LaTeX-style detection).
        On any error the problem is logged and ``("", [])`` is returned.
    """
    try:
        document = docx.Document(docx_file)
        body_lines = []
        formulas = []
        for paragraph in document.paragraphs:
            content = paragraph.text.strip()
            if content:
                body_lines.append(content)
            if "$" in content:
                formulas.append(content)
        return "\n".join(body_lines), formulas
    except Exception as e:
        logging.error(f"Error extracting DOCX content: {e}")
        return "", []
def extract_text_file_data(text_file):
    """Return the stripped UTF-8 text of a plain-text upload.

    Args:
        text_file: A binary file-like object (anything with ``read()``), raw
            ``bytes``/``bytearray``, or an already decoded ``str``.

    Returns:
        The decoded text with surrounding whitespace removed, or ``""`` if the
        content cannot be read or decoded (the error is logged).
    """
    try:
        raw = text_file.read() if hasattr(text_file, "read") else text_file
        if isinstance(raw, (bytes, bytearray)):
            # "utf-8-sig" also accepts plain UTF-8 and drops a leading BOM.
            raw = bytes(raw).decode("utf-8-sig")
        return raw.strip()
    except Exception as e:
        logging.error(f"Error extracting TXT content: {e}")
        return ""
def process_content(file_data, file_type, lang="eng"):
    """Extract text from an uploaded file and flatten it into one string.

    Args:
        file_data: File content as a path, a binary file-like object, or raw
            bytes. Raw bytes are wrapped in ``BytesIO`` because the extractors
            expect something they can open or ``read()``.
        file_type: Extension without the dot (``"pdf"``, ``"docx"``, ``"txt"``,
            ``"png"``, ``"jpg"``, ``"jpeg"``); matched case-insensitively.
            Any other value yields no content.
        lang: Tesseract language code used when OCR-ing an image upload.

    Returns:
        ``text + "\\n" + ocr_text + "\\n" + "\\n".join(formulas)``.
    """
    if isinstance(file_data, (bytes, bytearray)):
        file_data = BytesIO(file_data)
    file_type = file_type.lower()

    text = ""
    ocr_text = ""
    formulas = []
    if file_type == "pdf":
        pdf_data = extract_pdf_data(file_data)
        # process_pdf_content already OCRs the embedded images, so they are not
        # OCR'd again here; that would repeat their text in the output.
        text = process_pdf_content(pdf_data)
        formulas = list(pdf_data["formulas"])
    elif file_type == "docx":
        text, formulas = extract_docx_data(file_data)
    elif file_type == "txt":
        text = extract_text_file_data(file_data)
    elif file_type in ("png", "jpg", "jpeg"):
        # One OCR pass per image; a second pass would only duplicate formulas.
        ocr_text, formulas = extract_text_from_images([Image.open(file_data)], lang)
    return text + "\n" + ocr_text + "\n" + "\n".join(formulas)
def process_pdf_content(pdf_data):
    """Merge a PDF's page text, image OCR text and tables into one string.

    ``pdf_data`` is the dict built by ``extract_pdf_data``; its ``"text"``,
    ``"images"`` and ``"tables"`` entries are read. Formulas found during OCR
    are ignored here. Each table row becomes one line with cells separated by
    ``" | "``; empty or falsy cells become empty strings.
    """
    image_text, _unused_formulas = extract_text_from_images(pdf_data["images"])
    rendered_tables = []
    for table in pdf_data["tables"]:
        lines = []
        for row in table:
            cells = [str(cell) if cell else "" for cell in row]
            lines.append(" | ".join(cells))
        rendered_tables.append("\n".join(lines) + "\n")
    body = pdf_data["text"] + image_text
    merged = body + "\n" + "".join(rendered_tables)
    return merged.strip()
def generate_questions_with_graphs_and_formulas(syllabus_content, num_questions, subject_name, difficulty_level):
    """Ask the LLM for exam questions based on the syllabus.

    Args:
        syllabus_content: Syllabus text to base the questions on.
        num_questions: How many questions to request.
        subject_name: Subject shown to the model.
        difficulty_level: Difficulty shown to the model.

    Returns:
        The model's reply as a string, or ``""`` if the call fails (the error
        is logged).
    """
    # The template holds only placeholders. The actual values are supplied at
    # invoke time, so braces inside them (common in LaTeX such as \frac{a}{b})
    # are inserted literally and are not parsed as template variables.
    prompt_template = (
        "\n"
        "Generate {num_questions} questions based on the syllabus content below.\n"
        "Some questions should include graphs, charts, or LaTeX equations where applicable.\n"
        "Subject: {subject_name}\n"
        "Difficulty Levels: {difficulty_level}\n"
        "Syllabus Content: {syllabus_content}\n"
        "Format:\n"
        "- Question 1: Text with equation/graph\n"
    )
    chain = (ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser())
    try:
        return chain.invoke(
            {
                "num_questions": num_questions,
                "subject_name": subject_name,
                "difficulty_level": difficulty_level,
                "syllabus_content": syllabus_content,
            }
        )
    except Exception as e:
        logging.error(f"Error generating questions with graphs and formulas: {e}")
        return ""
def generate_bar_chart(data, title="Graph"):
    """Draw a bar chart of ``data`` and return it as an in-memory PNG.

    ``data`` maps bar labels to bar heights. The returned ``BytesIO`` is
    rewound to its start; the figure is closed before returning.
    """
    figure, axes = plt.subplots(figsize=(5, 4))
    axes.bar(data.keys(), data.values())
    axes.set_title(title)
    axes.set_xlabel("X-axis")
    axes.set_ylabel("Y-axis")
    figure.tight_layout()

    png_stream = BytesIO()
    figure.savefig(png_stream, format="png")
    png_stream.seek(0)
    plt.close(figure)
    return png_stream
def render_latex_formula(formula):
    """Show ``formula`` in the Streamlit page as a ``$$ ... $$`` markdown block."""
    display_math = "$$ " + f"{formula}" + " $$"
    st.markdown(display_math)
def generate_answers(questions, syllabus_context):
    """Ask the LLM to answer ``questions`` using only the syllabus content.

    Args:
        questions: The questions text to answer.
        syllabus_context: Syllabus text the answers must be based on.

    Returns:
        The model's reply as a string, or ``""`` if the call fails (the error
        is logged).
    """
    # Values are passed at invoke time, not formatted into the template, so
    # braces in the questions or syllabus (e.g. LaTeX) are not parsed as
    # template variables.
    prompt = (
        "\n"
        "Based on the provided syllabus content, generate detailed answers for the following questions. "
        "The answers must only be based on the syllabus content.\n"
        "Syllabus Content: {syllabus_context}\n"
        "Questions:\n"
        "{questions}\n"
        "Format answers as follows:\n"
        "Answer 1: ________________\n"
        "Answer 2: ________________\n"
        "...\n"
    )
    chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
    try:
        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
    except Exception as e:
        logging.error(f"Error generating answers: {e}")
        return ""
def download_as_docx(content, file_name="output.docx"):
    """Build a DOCX with one paragraph per line of ``content``.

    Returns a ``BytesIO`` positioned at its start. ``file_name`` is accepted
    but not used inside this function.
    """
    stream = BytesIO()
    document = Document()
    for paragraph_text in content.split("\n"):
        document.add_paragraph(paragraph_text)
    document.save(stream)
    stream.seek(0)
    return stream
def download_as_pdf_with_graphs_and_formulas(content, chart_buffers=None, latex_formulas=None, file_name="output.pdf"):
    """Build a PDF holding the text, optional chart images and formula lines.

    Args:
        content: Text to print; each line is written as a wrapped paragraph.
        chart_buffers: Optional iterable of image streams, inserted after the text.
        latex_formulas: Optional iterable of formula strings, printed as
            ``"Formula: ..."`` lines (plain text, not rendered maths).
        file_name: Accepted but not used inside this function.

    Returns:
        A ``BytesIO`` with the PDF, positioned at its start.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    def _write_wrapped(text):
        """Write one paragraph, wrapped to the page width, starting at the left margin."""
        # The built-in Arial font covers latin-1 only; replace anything else
        # (curly quotes, dashes, bullets) so it cannot raise an encoding error.
        safe_text = str(text).encode("latin-1", "replace").decode("latin-1")
        if not safe_text.strip():
            pdf.ln(10)  # keep blank lines as vertical space
            return
        pdf.set_x(pdf.l_margin)
        # Width 0 extends to the right margin; multi_cell wraps long lines
        # where a single cell would run off the page.
        pdf.multi_cell(0, 10, safe_text, align="L")

    # Add content (questions/answers)
    for line in content.split("\n"):
        _write_wrapped(line)

    # Insert chart images. With no explicit y, the image is placed at the
    # current position and the cursor moves below it, so following text
    # cannot overlap the image.
    for chart in chart_buffers or []:
        pdf.image(chart, x=10, w=180)
        pdf.ln(5)

    # Insert LaTeX formula placeholders
    for formula in latex_formulas or []:
        _write_wrapped(f"Formula: {formula}")

    # Save the PDF to memory
    buffer = BytesIO()
    pdf.output(buffer)
    buffer.seek(0)
    return buffer
# Page header.
st.title("Bloom's Taxonomy Based Exam Paper Developer")
st.markdown(
    "\n### A powerful tool to generate exam questions and answers using AI, "
    "based on syllabus content and Bloom's Taxonomy principles.\n"
)

sidebar = st.sidebar

# Sidebar button that wipes everything stored in the session.
clear_requested = sidebar.button("Clear All Data")
if clear_requested:
    st.session_state.clear()
    st.success("All data has been cleared. You can now upload a new syllabus.")

# Uploads: one syllabus document plus any number of supplementary images.
SYLLABUS_TYPES = ["pdf", "docx", "txt"]
IMAGE_TYPES = ["png", "jpg", "jpeg"]
uploaded_file = sidebar.file_uploader("Upload Syllabus (PDF, DOCX, TXT)", type=SYLLABUS_TYPES)
uploaded_images = sidebar.file_uploader(
    "Upload Supplementary Images (PNG, JPG, JPEG)",
    type=IMAGE_TYPES,
    accept_multiple_files=True,
)

# Paper metadata; each input is pre-filled with its own label.
subject_name, instructor_name, class_name, institution_name = (
    sidebar.text_input(f"Enter {label}", label)
    for label in ("Subject Name", "Instructor Name", "Class Name", "Institution Name")
)

# Difficulty selection.
DIFFICULTY_OPTIONS = ("Easy", "Medium", "Hard")
difficulty_level = sidebar.radio("Select Difficulty Level", DIFFICULTY_OPTIONS)
def _upload_extension(upload):
    """Return the lower-cased file-name extension of an upload, without the dot."""
    _, _, extension = upload.name.rpartition(".")
    return extension.lower()


# Handle file uploads and process them
if uploaded_file is not None:
    # The type comes from the file name: the MIME subtype only equals the
    # extension for PDF (DOCX and TXT report other subtypes).
    # Extractors get a fresh in-memory stream rather than raw bytes, since
    # they either call .read() or pass the object to a library opener.
    sections = [
        process_content(BytesIO(uploaded_file.getvalue()), _upload_extension(uploaded_file))
    ]
    # Supplementary images are OCR'd and appended to the syllabus text.
    for image_file in uploaded_images or []:
        sections.append(
            process_content(BytesIO(image_file.getvalue()), _upload_extension(image_file))
        )
    st.session_state.syllabus_text = "\n".join(sections)
    st.success("Syllabus content loaded successfully!")

# Generate Exam Paper with Graphs and Formulas
num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=20, value=5)
if st.sidebar.button("Generate Exam Paper"):
    syllabus_text = st.session_state.get("syllabus_text", "")
    if not syllabus_text.strip():
        # Without this guard, generating before an upload fails on the missing session key.
        st.warning("Please upload a syllabus before generating an exam paper.")
    else:
        questions = generate_questions_with_graphs_and_formulas(
            syllabus_content=syllabus_text,
            num_questions=num_questions,
            subject_name=subject_name,
            difficulty_level=difficulty_level,
        )
        if questions:
            st.session_state.generated_questions = questions
        else:
            st.error("Question generation failed. Check the application log for details.")

# Display generated questions. Reading them from session state keeps them on
# screen across reruns (for example when a download format is picked).
if "generated_questions" in st.session_state:
    st.markdown("### Generated Exam Questions")
    st.text_area("Exam Questions", st.session_state.generated_questions, height=400)
# Offer the generated questions for download in the chosen format.
if "generated_questions" in st.session_state:
    download_choice = st.radio("Select Download Format", ["DOCX", "PDF", "TXT"])
    if download_choice == "TXT":
        st.download_button(
            "Download TXT",
            st.session_state.generated_questions,
            file_name="exam_questions.txt",
            mime="text/plain",
        )
    elif download_choice == "PDF":
        # The PDF export attaches a fixed sample chart and a fixed sample formula.
        chart_buffer = generate_bar_chart({"Math": 80, "Science": 70, "English": 90})
        latex_formula = r"\frac{d}{dx} \sin(x) = \cos(x)"
        download_buffer = download_as_pdf_with_graphs_and_formulas(
            st.session_state.generated_questions,
            chart_buffers=[chart_buffer],
            latex_formulas=[latex_formula],
        )
        st.download_button(
            "Download PDF",
            download_buffer,
            file_name="exam_questions.pdf",
            mime="application/pdf",
        )
    elif download_choice == "DOCX":
        download_buffer = download_as_docx(st.session_state.generated_questions)
        st.download_button(
            "Download DOCX",
            download_buffer,
            file_name="exam_questions.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )