Spaces:
Sleeping
Sleeping
import logging
import os
from io import BytesIO

import docx
import pdfplumber
import pytesseract
import streamlit as st
from dotenv import load_dotenv
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from PIL import Image
# Load environment variables from a local .env file (e.g. GROQ_API_KEY).
load_dotenv()

# Initialize application-wide logging.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Initialize LLM.
# SECURITY FIX: the Groq API key was previously hard-coded in source (a leaked
# credential). Read it from the environment instead; load_dotenv() above picks
# it up from .env.
llm = ChatGroq(
    temperature=0.5,
    groq_api_key=os.getenv("GROQ_API_KEY"),
    model_name="llama3-8b-8192",
)

# OCR Configuration for Pytesseract: path to the tesseract binary.
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to your system's path
| # Enhanced OCR with configurable language option | |
def extract_text_from_images(images, lang="eng"):
    """Run OCR over a sequence of PIL images and return the combined text.

    Images that fail OCR are logged and skipped; the result is the stripped,
    newline-joined text of the successful ones.
    """
    pieces = []
    for img in images:
        try:
            pieces.append(pytesseract.image_to_string(img, lang=lang).strip())
        except Exception as exc:
            logging.error(f"Error in OCR: {exc}")
    return "\n".join(pieces).strip()
| # Function to extract text, images, tables, and formulas from PDF | |
def extract_pdf_data(pdf_path):
    """Extract text, tables, and embedded images from a PDF.

    Args:
        pdf_path: A path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        dict with keys ``"text"`` (concatenated page text), ``"tables"``
        (list of tables, each a list of rows), and ``"images"`` (list of
        PIL.Image objects). On failure, whatever was extracted so far is
        returned and the error is logged.
    """
    data = {"text": "", "tables": [], "images": []}
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                data["text"] += page.extract_text() or ""
                data["tables"].extend(page.extract_tables())
                # BUG FIX: pdfplumber has no ``pdf.extract_image`` method (that
                # is PyMuPDF's API), so the old code raised on every page with
                # images and aborted extraction. Render each image region by
                # cropping the page to the image's bounding box instead — the
                # documented pdfplumber approach. Per-image try/except so one
                # bad image does not discard the rest of the document.
                for img in page.images:
                    try:
                        bbox = (img["x0"], img["top"], img["x1"], img["bottom"])
                        rendered = page.crop(bbox).to_image(resolution=150)
                        data["images"].append(rendered.original)
                    except Exception as e:
                        logging.error(f"Error extracting PDF image: {e}")
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    return data
| # Function to extract text from DOCX files | |
def extract_docx_data(docx_file):
    """Return the non-empty paragraph text of a DOCX file, newline-joined.

    Returns an empty string (and logs the error) if the file cannot be read.
    """
    try:
        document = docx.Document(docx_file)
        stripped = (para.text.strip() for para in document.paragraphs)
        return "\n".join(line for line in stripped if line)
    except Exception as exc:
        logging.error(f"Error extracting DOCX content: {exc}")
        return ""
| # Function to extract text from plain text files | |
def extract_text_file_data(text_file):
    """Decode an uploaded plain-text file as UTF-8 and return it stripped.

    Returns an empty string (and logs the error) on any read/decode failure.
    """
    try:
        raw = text_file.read()
        return raw.decode("utf-8").strip()
    except Exception as exc:
        logging.error(f"Error extracting TXT content: {exc}")
        return ""
| # Function to process extracted content (PDF, DOCX, etc.) | |
def process_content(file_data, file_type, lang="eng"):
    """Dispatch extraction by file type, then append OCR text from any images.

    Supported types: "pdf", "docx", "txt", and raster images
    ("png"/"jpg"/"jpeg"). Returns the extracted text and OCR text joined
    by a newline.
    """
    text = ""
    images = []
    if file_type == "pdf":
        pdf_data = extract_pdf_data(file_data)
        text = process_pdf_content(pdf_data)
        images = pdf_data["images"]
    elif file_type == "docx":
        text = extract_docx_data(file_data)
    elif file_type == "txt":
        text = extract_text_file_data(file_data)
    elif file_type in ("png", "jpg", "jpeg"):
        images = [Image.open(file_data)]
    ocr_text = extract_text_from_images(images, lang)
    return text + "\n" + ocr_text
| # Function to process PDF content | |
def process_pdf_content(pdf_data):
    """Flatten extracted PDF data (page text, image OCR, tables) into one string.

    Table rows are rendered as " | "-separated cells; falsy cells become "".
    """
    combined_text = pdf_data["text"] + extract_text_from_images(pdf_data["images"])
    table_text = "".join(
        "\n".join(
            " | ".join(str(cell) if cell else "" for cell in row)
            for row in table
        ) + "\n"
        for table in pdf_data["tables"]
    )
    return (combined_text + "\n" + table_text).strip()
| # Function to generate questions | |
def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
    """Generate exam questions from syllabus content via the LLM.

    Args:
        question_type: e.g. "MCQs", "Short Questions".
        subject_name, instructor, class_name, institution: header metadata.
        syllabus_context: extracted syllabus text the questions must derive from.
        num_questions: number of questions to request.
        difficulty_level: dict mapping Bloom's levels ("Remember", "Understand",
            "Apply", "Analyze", "Evaluate", "Create") to integer counts.

    Returns:
        The LLM's response string, or "" on failure (error is logged).
    """
    # BUG FIX: this used to be an f-string, so the syllabus text was baked
    # into the template BEFORE ChatPromptTemplate parsed it — any literal
    # "{" or "}" in the syllabus was then treated as a template variable and
    # chain.invoke({}) crashed. Use real placeholders and pass all values
    # through invoke() instead; values are not re-parsed as templates.
    prompt_template = """
Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
Subject: {subject_name}
Instructor: {instructor}
Class: {class_name}
Institution: {institution}
Syllabus Content: {syllabus_context}
Difficulty Levels:
- Remember: {remember}
- Understand: {understand}
- Apply: {apply}
- Analyze: {analyze}
- Evaluate: {evaluate}
- Create: {create}
Format questions as follows:
Q1. ________________
Q2. ________________
...
"""
    chain = (ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser())
    try:
        return chain.invoke({
            "num_questions": num_questions,
            "question_type": question_type,
            "subject_name": subject_name,
            "instructor": instructor,
            "class_name": class_name,
            "institution": institution,
            "syllabus_context": syllabus_context,
            "remember": difficulty_level.get("Remember", 0),
            "understand": difficulty_level.get("Understand", 0),
            "apply": difficulty_level.get("Apply", 0),
            "analyze": difficulty_level.get("Analyze", 0),
            "evaluate": difficulty_level.get("Evaluate", 0),
            "create": difficulty_level.get("Create", 0),
        })
    except Exception as e:
        logging.error(f"Error generating {question_type} questions: {e}")
        return ""
| # Function to generate answers | |
def generate_answers(questions, syllabus_context):
    """Generate answers for previously generated questions via the LLM.

    Args:
        questions: the question text to answer.
        syllabus_context: extracted syllabus text the answers must derive from.

    Returns:
        The LLM's response string, or "" on failure (error is logged).
    """
    # BUG FIX: same template-injection problem as generate_questions — the
    # f-string baked user content (which may contain "{"/"}") into the
    # template before parsing. Use placeholders + invoke() values instead.
    prompt = """
Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.
Syllabus Content: {syllabus_context}
Questions:
{questions}
Format answers as follows:
Answer 1: ________________
Answer 2: ________________
...
"""
    chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
    try:
        return chain.invoke({
            "syllabus_context": syllabus_context,
            "questions": questions,
        })
    except Exception as e:
        logging.error(f"Error generating answers: {e}")
        return ""
# Streamlit app
st.title("Bloom's Taxonomy Based Exam Paper Developer")

# Sidebar Clear Data Button: wipes ALL session state (uploaded syllabus text,
# generated questions, and generated answers).
if st.sidebar.button("Clear All Data"):
    st.session_state.clear()
    st.success("All data has been cleared. You can now upload a new syllabus.")

# Syllabus Upload with Automatic Clearing (state is reset when the filename
# changes — see the uploaded_file handling below).
# NOTE(review): process_content also understands "jpeg", but it is not offered
# here — confirm whether .jpeg uploads should be allowed.
uploaded_file = st.sidebar.file_uploader(
    "Upload Syllabus (PDF, DOCX, TXT, Image)",
    type=["pdf", "docx", "txt", "png", "jpg"]
)

# Sidebar Inputs for Subject Name, Instructor, Class, and Institution.
# These module-level names are read later by the generate/download sections.
subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")

# Language Option for OCR (Tesseract three-letter language codes).
ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
if uploaded_file:
    # Clear session state when a new file is uploaded so stale questions or
    # answers from the previous syllabus cannot leak into this one.
    if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
        st.session_state.clear()
        st.success("Previous data cleared. Processing new file...")
    st.session_state.uploaded_filename = uploaded_file.name
    # BUG FIX: the type was previously derived from the MIME type via
    # uploaded_file.type.split("/")[-1], which yields "plain" for TXT
    # (text/plain), "jpeg" for JPG (image/jpeg), and a long vendor string for
    # DOCX — so three of the five advertised formats were always rejected by
    # the validation below. Derive the type from the filename extension
    # instead, normalizing "jpeg" to "jpg".
    file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()
    if file_type == "jpeg":
        file_type = "jpg"
    # Validate file type
    if file_type not in ["pdf", "docx", "txt", "png", "jpg"]:
        st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
    else:
        syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
        st.session_state.syllabus_text = syllabus_text
# Preview of Syllabus: show the first 1000 characters of the extracted text.
if "syllabus_text" in st.session_state:
    st.subheader("Syllabus Preview:")
    st.text_area("Extracted Content", st.session_state.syllabus_text[:1000], height=300)
else:
    st.warning("Please upload a syllabus to begin.")

# Question Type Selection
question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))

# One slider (0-5, default 1) per Bloom's taxonomy level; the resulting dict
# is passed to generate_questions, which reads it via .get(level, 0).
difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)

# Generate questions from the uploaded syllabus and cache them in session
# state so they survive Streamlit reruns (and feed the answer/download steps).
if st.sidebar.button("Generate Questions"):
    if "syllabus_text" in st.session_state:
        with st.spinner(f"Generating {question_type}..."):
            syllabus_context = st.session_state.syllabus_text
            st.session_state.generated_questions = generate_questions(question_type, subject_name, instructor_name, class_name, institution_name, syllabus_context, num_questions, difficulty)
            st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
    else:
        st.error("Please upload a syllabus before generating questions.")

# Generate answers for the cached questions; requires questions to exist first.
if st.sidebar.button("Generate Answers for Questions"):
    if "generated_questions" in st.session_state:
        with st.spinner("Generating answers..."):
            syllabus_context = st.session_state.syllabus_text
            st.session_state.generated_answers = generate_answers(st.session_state.generated_questions, syllabus_context)
            st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
    else:
        st.error("Generate questions first before generating answers.")

# Download buttons appear only once the corresponding content exists.
if "generated_questions" in st.session_state:
    st.sidebar.download_button(
        label="Download Questions",
        data=st.session_state.generated_questions,
        file_name=f"{subject_name}_questions.txt",
        mime="text/plain",
    )
if "generated_answers" in st.session_state:
    st.sidebar.download_button(
        label="Download Answers",
        data=st.session_state.generated_answers,
        file_name=f"{subject_name}_answers.txt",
        mime="text/plain",
    )

# Footer.
st.markdown("""
---
**Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
""")