ahm14 commited on
Commit
d947799
·
verified ·
1 Parent(s): e36e1ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -194
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import streamlit as st
2
  from langchain_groq import ChatGroq
3
  from langchain_core.output_parsers import StrOutputParser
@@ -11,6 +12,8 @@ from io import BytesIO
11
  import logging
12
  from concurrent.futures import ThreadPoolExecutor
13
  from streamlit.runtime.caching import cache_data
 
 
14
 
15
  # Load environment variables
16
  load_dotenv()
@@ -21,10 +24,10 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
21
  # Initialize LLM
22
  llm = ChatGroq(temperature=0.5, groq_api_key="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", model_name="llama3-8b-8192")
23
 
24
- # OCR Configuration for Pytesseract
25
- pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # Adjust to your system's path
26
 
27
- # Enhanced OCR with configurable language option
28
  def extract_text_from_images(images, lang="eng"):
29
  ocr_text = ""
30
  for image in images:
@@ -34,35 +37,23 @@ def extract_text_from_images(images, lang="eng"):
34
  logging.error(f"Error in OCR: {e}")
35
  return ocr_text.strip()
36
 
37
- # Function to extract data from PDF with parallelization
38
  @cache_data
39
- def extract_pdf_data(pdf_path):
40
- data = {"text": "", "tables": [], "images": []}
41
  try:
42
- with pdfplumber.open(pdf_path) as pdf:
43
- def process_page(page):
44
- page_data = {"text": page.extract_text() or "", "tables": [], "images": []}
45
- tables = page.extract_tables()
46
- for table in tables:
47
- page_data["tables"].append(table)
48
- for image in page.images:
49
- base_image = pdf.extract_image(image["object_number"])
50
- image_obj = Image.open(BytesIO(base_image["image"]))
51
- page_data["images"].append(image_obj)
52
- return page_data
53
-
54
- with ThreadPoolExecutor() as executor:
55
- pages_data = list(executor.map(process_page, pdf.pages))
56
-
57
- for page_data in pages_data:
58
- data["text"] += page_data["text"]
59
- data["tables"].extend(page_data["tables"])
60
- data["images"].extend(page_data["images"])
61
  except Exception as e:
62
  logging.error(f"Error processing PDF: {e}")
63
  return data
64
 
65
- # Function to extract text from DOCX files
66
  @cache_data
67
  def extract_docx_data(docx_file):
68
  try:
@@ -70,65 +61,49 @@ def extract_docx_data(docx_file):
70
  text = "\n".join([para.text.strip() for para in doc.paragraphs if para.text.strip()])
71
  return text
72
  except Exception as e:
73
- logging.error(f"Error extracting DOCX content: {e}")
74
  return ""
75
 
76
- # Function to extract text from plain text files
77
  @cache_data
78
- def extract_text_file_data(text_file):
79
  try:
80
- return text_file.read().decode("utf-8").strip()
81
  except Exception as e:
82
- logging.error(f"Error extracting TXT content: {e}")
83
  return ""
84
 
85
- # Function to process extracted content (PDF, DOCX, etc.)
86
- def process_content(file_data, file_type, lang="eng"):
87
- text = ""
88
  images = []
89
- if file_type == "pdf":
90
- pdf_data = extract_pdf_data(file_data)
91
- text = process_pdf_content(pdf_data)
92
- images = pdf_data["images"]
93
- elif file_type == "docx":
94
- text = extract_docx_data(file_data)
95
- elif file_type == "txt":
96
- text = extract_text_file_data(file_data)
97
- elif file_type in ["png", "jpg", "jpeg"]:
98
- image = Image.open(file_data)
99
- images.append(image)
100
-
101
  ocr_text = extract_text_from_images(images, lang)
102
- return text + "\n" + ocr_text
103
 
104
- # Function to process PDF content
105
- def process_pdf_content(pdf_data):
106
- ocr_text = extract_text_from_images(pdf_data["images"])
107
- combined_text = pdf_data["text"] + ocr_text
108
-
109
- table_text = ""
110
- for table in pdf_data["tables"]:
111
- table_rows = [" | ".join(str(cell) if cell else "" for cell in row) for row in table]
112
- table_text += "\n".join(table_rows) + "\n"
113
-
114
- return (combined_text + "\n" + table_text).strip()
115
-
116
- # Function to generate questions
117
- def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
118
  prompt_template = f"""
119
- Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
120
- Subject: {subject_name}
121
- Instructor: {instructor}
122
- Class: {class_name}
123
- Institution: {institution}
124
- Syllabus Content: {syllabus_context}
125
  Difficulty Levels:
126
- - Remember: {difficulty_level.get('Remember', 0)}
127
- - Understand: {difficulty_level.get('Understand', 0)}
128
- - Apply: {difficulty_level.get('Apply', 0)}
129
- - Analyze: {difficulty_level.get('Analyze', 0)}
130
- - Evaluate: {difficulty_level.get('Evaluate', 0)}
131
- - Create: {difficulty_level.get('Create', 0)}
132
  Format questions as follows:
133
  Q1. ________________
134
  Q2. ________________
@@ -138,131 +113,87 @@ def generate_questions(question_type, subject_name, instructor, class_name, inst
138
  try:
139
  return chain.invoke({})
140
  except Exception as e:
141
- logging.error(f"Error generating {question_type} questions: {e}")
142
  return ""
143
 
144
- # Function to generate answers
145
- def generate_answers(questions, syllabus_context):
146
- prompt = f"""
147
- Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.
148
- Syllabus Content: {syllabus_context}
149
- Questions:
150
- {questions}
151
- Format answers as follows:
152
- Answer 1: ________________
153
- Answer 2: ________________
154
- ...
155
- """
156
- chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
157
  try:
158
- return chain.invoke({})
 
 
 
159
  except Exception as e:
160
- logging.error(f"Error generating answers: {e}")
161
- return ""
162
-
163
- # Streamlit app
164
- st.title("Bloom's Taxonomy Based Exam Paper Developer")
165
-
166
- # Sidebar Clear Data Button
167
- if st.sidebar.button("Clear All Data"):
168
- st.session_state.clear()
169
- st.success("All data has been cleared. You can now upload a new syllabus.")
170
-
171
- # File Upload with Image Support
172
- uploaded_file = st.sidebar.file_uploader(
173
- "Upload Syllabus (PDF, DOCX, TXT, Image)",
174
- type=["pdf", "docx", "txt", "png", "jpg", "jpeg"]
175
- )
176
-
177
- # Sidebar Inputs for Subject Name, Instructor, Class, and Institution
178
- subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
179
- instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
180
- class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
181
- institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
182
-
183
- # Language Option for OCR
184
- ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
185
-
186
- if uploaded_file:
187
- # Clear session state when a new file is uploaded
188
- if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
189
- st.session_state.clear()
190
- st.success("Previous data cleared. Processing new file...")
191
-
192
- st.session_state.uploaded_filename = uploaded_file.name
193
- file_type = uploaded_file.type.split("/")[-1]
194
-
195
- # Validate file type
196
- if file_type not in ["pdf", "docx", "txt", "png", "jpg", "jpeg"]:
197
- st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
198
- else:
199
- syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
200
- st.session_state.syllabus_text = syllabus_text
201
-
202
- # Preview of Syllabus
203
- if "syllabus_text" in st.session_state:
204
- st.subheader("Syllabus Preview:")
205
- st.text_area("Extracted Content", st.session_state.syllabus_text[:1000], height=300)
206
- else:
207
- st.warning("Please upload a syllabus to begin.")
208
-
209
- # Question Type Selection
210
- question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
211
- difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
212
- difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
213
- num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)
214
-
215
- if st.sidebar.button("Generate Questions"):
216
  if "syllabus_text" in st.session_state:
217
- with st.spinner(f"Generating {question_type}..."):
218
- syllabus_context = st.session_state.syllabus_text
219
- st.session_state.generated_questions = generate_questions(
220
- question_type,
221
- subject_name,
222
- instructor_name,
223
- class_name,
224
- institution_name,
225
- syllabus_context,
226
- num_questions,
227
- difficulty,
228
- )
229
- st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
230
  else:
231
- st.error("Please upload a syllabus before generating questions.")
232
-
233
- # Button to generate answers for questions
234
- if st.sidebar.button("Generate Answers for Questions"):
235
- if "generated_questions" in st.session_state:
236
- with st.spinner("Generating answers..."):
237
- syllabus_context = st.session_state.syllabus_text
238
- st.session_state.generated_answers = generate_answers(
239
- st.session_state.generated_questions, syllabus_context
240
- )
241
- st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
 
 
 
 
 
 
 
 
 
 
 
242
  else:
243
- st.error("Generate questions first before generating answers.")
244
-
245
- # Download buttons for questions and answers
246
- if "generated_questions" in st.session_state:
247
- st.sidebar.download_button(
248
- label="Download Questions",
249
- data=st.session_state.generated_questions,
250
- file_name=f"{subject_name}_questions.txt",
251
- mime="text/plain",
252
- )
253
-
254
- if "generated_answers" in st.session_state:
255
- st.sidebar.download_button(
256
- label="Download Answers",
257
- data=st.session_state.generated_answers,
258
- file_name=f"{subject_name}_answers.txt",
259
- mime="text/plain",
260
- )
261
-
262
- # Enhanced footer for branding and information
263
- st.markdown("""
264
- ---
265
- **Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
266
-
267
- Built with ♥ to make exam preparation seamless.
268
- """)
 
1
+
2
  import streamlit as st
3
  from langchain_groq import ChatGroq
4
  from langchain_core.output_parsers import StrOutputParser
 
12
  import logging
13
  from concurrent.futures import ThreadPoolExecutor
14
  from streamlit.runtime.caching import cache_data
15
+ import requests
16
+ from bs4 import BeautifulSoup
17
 
18
  # Load environment variables
19
  load_dotenv()
 
24
# Initialize LLM
# SECURITY FIX: the Groq API key was hard-coded in the committed source.
# Read it from the environment instead — load_dotenv() above already pulls
# a local .env file into os.environ. (The leaked key must also be revoked.)
import os

llm = ChatGroq(
    temperature=0.5,
    groq_api_key=os.getenv("GROQ_API_KEY"),
    model_name="llama3-8b-8192",
)
26
 
27
+ # OCR Configuration
28
+ pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # Adjust based on your system's path
29
 
30
+ # OCR Extraction from Images
31
  def extract_text_from_images(images, lang="eng"):
32
  ocr_text = ""
33
  for image in images:
 
37
  logging.error(f"Error in OCR: {e}")
38
  return ocr_text.strip()
39
 
40
# Extract content from PDFs
@cache_data
def extract_pdf_data(pdf_file):
    """Extract page text and embedded images from a PDF.

    Args:
        pdf_file: A path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        dict: ``{"text": str, "images": list[PIL.Image.Image]}``. On a
        top-level failure both fields are returned empty (and the error
        is logged) rather than raising.
    """
    data = {"text": "", "images": []}
    try:
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                # extract_text() returns None for image-only pages.
                data["text"] += page.extract_text() or ""
                for img in page.images:
                    # BUG FIX: pdfplumber has no `pdf.extract_image()` API
                    # (that is PyMuPDF's); the old call raised AttributeError
                    # on the first image and aborted the whole extraction.
                    # Decode the raw image stream from the page metadata
                    # instead, and skip images that cannot be decoded so
                    # text extraction still succeeds.
                    try:
                        image = Image.open(BytesIO(img["stream"].get_data()))
                        data["images"].append(image)
                    except Exception as img_err:
                        logging.warning(f"Skipping undecodable PDF image: {img_err}")
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    return data
55
 
56
+ # Extract content from DOCX
57
  @cache_data
58
  def extract_docx_data(docx_file):
59
  try:
 
61
  text = "\n".join([para.text.strip() for para in doc.paragraphs if para.text.strip()])
62
  return text
63
  except Exception as e:
64
+ logging.error(f"Error processing DOCX: {e}")
65
  return ""
66
 
67
# Extract plain text files
@cache_data
def extract_txt_data(txt_file):
    """Read an uploaded plain-text file and return its UTF-8 content, stripped.

    Returns "" (and logs the error) if the file cannot be read or decoded.
    """
    try:
        raw = txt_file.read()
        return raw.decode("utf-8").strip()
    except Exception as e:
        logging.error(f"Error processing TXT: {e}")
        return ""
75
 
76
# Process uploaded files
def process_files(uploaded_files, lang="eng"):
    """Dispatch each uploaded file to the matching extractor and OCR any images.

    Args:
        uploaded_files: Streamlit UploadedFile objects.
        lang: Tesseract language code passed to the OCR step.

    Returns:
        str: all extracted document text, a newline, then the OCR text of
        every collected image (standalone images and images found in PDFs).
    """
    text_parts = []
    images = []
    text_extractors = {"docx": extract_docx_data, "txt": extract_txt_data}
    for file in uploaded_files:
        # MIME subtype, e.g. "application/pdf" -> "pdf".
        file_type = file.type.split("/")[-1]
        if file_type == "pdf":
            pdf_data = extract_pdf_data(file)
            text_parts.append(pdf_data["text"])
            images.extend(pdf_data["images"])
        elif file_type in text_extractors:
            text_parts.append(text_extractors[file_type](file))
        elif file_type in ["png", "jpg", "jpeg"]:
            images.append(Image.open(file))
    ocr_text = extract_text_from_images(images, lang)
    return "".join(text_parts) + "\n" + ocr_text
94
 
95
+ # Generate questions
96
+ def generate_questions(question_type, syllabus_text, num_questions, difficulty):
 
 
 
 
 
 
 
 
 
 
 
 
97
  prompt_template = f"""
98
+ Generate {num_questions} {question_type} questions from the syllabus content provided below.
99
+ Syllabus Content: {syllabus_text}
 
 
 
 
100
  Difficulty Levels:
101
+ - Remember: {difficulty.get('Remember', 0)}
102
+ - Understand: {difficulty.get('Understand', 0)}
103
+ - Apply: {difficulty.get('Apply', 0)}
104
+ - Analyze: {difficulty.get('Analyze', 0)}
105
+ - Evaluate: {difficulty.get('Evaluate', 0)}
106
+ - Create: {difficulty.get('Create', 0)}
107
  Format questions as follows:
108
  Q1. ________________
109
  Q2. ________________
 
113
  try:
114
  return chain.invoke({})
115
  except Exception as e:
116
+ logging.error(f"Error generating questions: {e}")
117
  return ""
118
 
119
# Internet search for answers
def search_answers_online(question):
    """Best-effort web lookup for *question* by scraping Google results.

    Args:
        question: Free-form question text; URL-encoded automatically.

    Returns:
        str: up to three result snippets joined by newlines, or
        "No online answer found." when the request or parsing fails.

    NOTE(review): scraping Google's HTML is brittle (the "BNeawe" class
    name changes without notice) and may violate Google's ToS; a real
    search API would be more reliable.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # BUG FIX: the query is now passed via `params` so requests
        # URL-encodes it (raw questions containing spaces/&/# produced
        # malformed URLs), and a timeout stops the Streamlit app from
        # hanging forever on a stalled connection.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": question},
            headers=headers,
            timeout=10,
        )
        soup = BeautifulSoup(response.text, "html.parser")
        snippets = soup.find_all("div", class_="BNeawe")
        return "\n".join([snippet.get_text() for snippet in snippets[:3]])
    except Exception as e:
        logging.error(f"Error fetching online answers: {e}")
        return "No online answer found."
131
+
132
# Generate answers
def generate_answers(questions, syllabus_text):
    """Generate a detailed answer for every non-blank question line.

    Each answer is produced by the LLM from the syllabus content; if the
    LLM call fails for a question, an online search is used as a fallback
    for that question only.

    Args:
        questions: Newline-separated question text (LLM output).
        syllabus_text: The extracted syllabus content to answer from.

    Returns:
        str: "Answer N: ..." lines joined by newlines, numbered
        consecutively from 1.
    """
    # BUG FIX: literal '{'/'}' in user content was parsed as template
    # variables by ChatPromptTemplate.from_template, making invoke({})
    # raise for every question; escape braces before interpolation.
    safe_syllabus = syllabus_text.replace("{", "{{").replace("}", "}}")
    answers = {}
    count = 0
    for line in questions.split("\n"):
        question = line.strip()
        if not question:
            continue
        # BUG FIX: numbering previously used the raw line index, so blank
        # lines in the LLM output produced gaps (Answer 1, Answer 3, ...);
        # a dedicated counter keeps the numbering consecutive.
        count += 1
        safe_question = question.replace("{", "{{").replace("}", "}}")
        prompt = f"""
        Based on the provided syllabus content, generate a detailed answer for the following question:
        Syllabus Content: {safe_syllabus}
        Question: {safe_question}
        """
        chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
        try:
            answers[f"Answer {count}"] = chain.invoke({})
        except Exception:
            # Fall back to online search if LLM fails
            answers[f"Answer {count}"] = search_answers_online(question)
    return "\n".join([f"{k}: {v}" for k, v in answers.items()])
149
+
150
# Streamlit UI
st.title("AI-Powered Exam Generator")

# Tabs for navigation
tab1, tab2, tab3, tab4 = st.tabs(["📁 Upload Files", "📄 Preview Content", "📝 Generate Questions", "💡 Generate Answers"])

# Upload files: extract text/images and stash the result in session state
# so the other tabs can use it across reruns.
with tab1:
    st.header("Upload Files")
    uploaded_files = st.file_uploader(
        "Upload your syllabus (PDF, DOCX, TXT, Images)",
        type=["pdf", "docx", "txt", "png", "jpg", "jpeg"],
        accept_multiple_files=True
    )
    ocr_lang = st.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
    if uploaded_files:
        syllabus_text = process_files(uploaded_files, lang=ocr_lang)
        st.session_state["syllabus_text"] = syllabus_text
        st.success("Files processed successfully!")

# Preview content
with tab2:
    st.header("Preview Syllabus Content")
    if "syllabus_text" in st.session_state:
        st.text_area("Extracted Content", st.session_state["syllabus_text"], height=300)
    else:
        st.warning("No content available. Upload files first.")

# Generate questions
with tab3:
    st.header("Generate Questions")
    question_type = st.selectbox("Select Question Type", ["MCQs", "Short Questions", "Long Questions"])
    num_questions = st.slider("Number of Questions", 1, 20, 5)
    difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
    difficulty = {level: st.slider(level, 0, 5, 1) for level in difficulty_levels}
    if st.button("Generate Questions"):
        # BUG FIX: questions were previously generated even when no
        # syllabus had been uploaded, silently prompting the LLM with an
        # empty string; guard and warn the user instead.
        if "syllabus_text" in st.session_state:
            questions = generate_questions(question_type, st.session_state["syllabus_text"], num_questions, difficulty)
            st.session_state["questions"] = questions
            st.text_area("Generated Questions", questions, height=300)
        else:
            st.warning("No content available. Upload files first.")

# Generate answers
with tab4:
    st.header("Generate Answers")
    if "questions" in st.session_state:
        if st.button("Generate Answers"):
            answers = generate_answers(st.session_state["questions"], st.session_state.get("syllabus_text", ""))
            st.session_state["answers"] = answers
            st.text_area("Generated Answers", answers, height=300)
    else:
        st.warning("No questions available. Generate questions first.")