Spaces:

ahm14
/

Exam_Developer

Sleeping

App Files Files Community

ahm14 committed on Jan 5, 2025

Commit

7337e1e

verified ·

1 Parent(s): efbebca

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -93

app.py CHANGED Viewed

@@ -3,49 +3,24 @@ from langchain_groq import ChatGroq
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from dotenv import load_dotenv
-import os
 import pytesseract
 from PIL import Image
 import pdfplumber
 import docx
 from io import BytesIO
-from sentence_transformers import SentenceTransformer
-from pinecone import Pinecone, ServerlessSpec
 import logging
 # Load environment variables
 load_dotenv()
 # Initialize logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 # Initialize LLM
 llm = ChatGroq(temperature=0.5, groq_api_key="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", model_name="llama3-8b-8192")
-# Initialize Pinecone for vector storage
-PINECONE_API_KEY = "pcsk_6PtxDh_6tortuWyNhXdmVrAjx1ZSv8bQRcbgbE7j3JtwwcpMCkFfdsp6VC925WxmqpNYQC"
-pc = Pinecone(api_key=PINECONE_API_KEY)
-cloud = os.getenv('PINECONE_CLOUD', 'aws')
-region = os.getenv('PINECONE_REGION', 'us-east-1')
-spec = ServerlessSpec(cloud=cloud, region=region)
-index_name = "syllabus-index"
-if index_name not in pc.list_indexes().names():
-    pc.create_index(
-        name=index_name,
-        dimension=384,
-        spec=spec
-    )
-index = pc.Index(index_name)
-# Initialize embedding model
-embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 # OCR Configuration for Pytesseract
-pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Adjust to your system's path
 # Function to extract text, images, tables, and formulas from PDF
 def extract_pdf_data(pdf_path):
@@ -53,13 +28,10 @@ def extract_pdf_data(pdf_path):
     try:
         with pdfplumber.open(pdf_path) as pdf:
             for page in pdf.pages:
-                # Extract Text
                 data["text"] += page.extract_text() or ""
-                # Extract Tables
                 tables = page.extract_tables()
                 for table in tables:
                     data["tables"].append(table)
-                # Extract Images
                 for image in page.images:
                     base_image = pdf.extract_image(image["object_number"])
                     image_obj = Image.open(BytesIO(base_image["image"]))
@@ -70,25 +42,34 @@ def extract_pdf_data(pdf_path):
 # Function to extract text from DOCX files
 def extract_docx_data(docx_file):
-    doc = docx.Document(docx_file)
-    text = ""
-    for para in doc.paragraphs:
-        text += para.text + "\n"
-    return text
 # Function to extract text from plain text files
 def extract_text_file_data(text_file):
-    return text_file.read().decode('utf-8')
 # Function to extract text from images using OCR
 def extract_text_from_images(images):
     ocr_text = ""
     for image in images:
-        ocr_text += pytesseract.image_to_string(image) + "\n"
-    return ocr_text
 # Function to process extracted content (PDF, DOCX, etc.)
-def process_content(file_data, file_type="pdf"):
     text = ""
     images = []
     if file_type == "pdf":
@@ -99,48 +80,34 @@ def process_content(file_data, file_type="pdf"):
         text = extract_docx_data(file_data)
     elif file_type == "txt":
         text = extract_text_file_data(file_data)
     ocr_text = extract_text_from_images(images)
     return text + "\n" + ocr_text
 # Function to process PDF content
 def process_pdf_content(pdf_data):
-    # Process OCR text from images
     ocr_text = extract_text_from_images(pdf_data["images"])
     combined_text = pdf_data["text"] + ocr_text
-    # Process tables into readable text
     table_text = ""
     for table in pdf_data["tables"]:
-        table_rows = [" | ".join(row) for row in table]
         table_text += "\n".join(table_rows) + "\n"
-    return combined_text + "\n" + table_text
-# Function to add syllabus to vector database
-def add_syllabus_to_index(syllabus_text):
-    sentences = syllabus_text.split(". ")
-    embeddings = embedder.encode(sentences, batch_size=32, show_progress_bar=True)
-    for i, sentence in enumerate(sentences):
-        index.upsert([(f"sentence-{i}", embeddings[i].tolist(), {"text": sentence})])
-# Function to retrieve relevant syllabus content
-def retrieve_relevant_content(query):
-    try:
-        query_embedding = embedder.encode([query])
-        results = index.query(vector=query_embedding.tolist(), top_k=5, include_metadata=True)
-        relevant_content = "\n".join([match["metadata"]["text"] for match in results["matches"]])
-        return relevant_content
-    except Exception as e:
-        logging.error(f"Error retrieving content: {e}")
-        return ""
 # Function to generate questions
-def generate_questions(question_type, subject_name, syllabus_context, num_questions, difficulty_level):
     prompt_template = f"""
     Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
     Subject: {subject_name}
     Syllabus Content: {syllabus_context}
     Difficulty Levels:
@@ -190,25 +157,45 @@ def generate_answers(questions, syllabus_context):
 # Streamlit app
 st.title("Bloom's Taxonomy Based Exam Paper Developer")
-# Sidebar inputs
-instructor_name = st.sidebar.text_input("Instructor")
-class_name = st.sidebar.text_input("Class")
-institution_name = st.sidebar.text_input("Institution")
-subject_name = st.sidebar.text_input("Subject")
-# Syllabus Upload
-uploaded_file = st.sidebar.file_uploader("Upload Syllabus (PDF, DOCX, TXT, Image)", type=["pdf", "docx", "txt", "png", "jpg"])
-syllabus_text = None
 if uploaded_file:
-    file_type = uploaded_file.type.split("/")[1]
-    st.sidebar.markdown("✅ Syllabus uploaded")
-    syllabus_text = process_content(uploaded_file, file_type)
-    add_syllabus_to_index(syllabus_text)
 # Preview of Syllabus
-if syllabus_text:
     st.subheader("Syllabus Preview:")
-    st.text_area("Extracted Content", syllabus_text[:1000], height=300)
 # Question Type Selection
 question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
@@ -216,31 +203,25 @@ difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "
 difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
 num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)
-# Instructor Feedback Option
-feedback = st.sidebar.text_area("Instructor Feedback (Optional)")
-# Generate Questions
 if st.sidebar.button("Generate Questions"):
-    if syllabus_text:
         with st.spinner(f"Generating {question_type}..."):
-            syllabus_context = retrieve_relevant_content(f"Generate {question_type} based on syllabus")
-            st.session_state.generated_questions = generate_questions(question_type, subject_name, syllabus_context, num_questions, difficulty)
         st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
     else:
         st.error("Please upload a syllabus before generating questions.")
-# Generate Answers
 if st.sidebar.button("Generate Answers for Questions"):
-    if "generated_questions" in st.session_state and st.session_state.generated_questions:
         with st.spinner("Generating answers..."):
-            syllabus_context = retrieve_relevant_content("Generate answers from syllabus")
             st.session_state.generated_answers = generate_answers(st.session_state.generated_questions, syllabus_context)
         st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
     else:
         st.error("Generate questions first before generating answers.")
-# Download Options
-if "generated_questions" in st.session_state and st.session_state.generated_questions:
     st.sidebar.download_button(
         label="Download Questions",
         data=st.session_state.generated_questions,
@@ -248,7 +229,7 @@ if "generated_questions" in st.session_state and st.session_state.generated_ques
         mime="text/plain",
     )
-if "generated_answers" in st.session_state and st.session_state.generated_answers:
     st.sidebar.download_button(
         label="Download Answers",
         data=st.session_state.generated_answers,
@@ -256,8 +237,7 @@ if "generated_answers" in st.session_state and st.session_state.generated_answer
         mime="text/plain",
     )
-# Application Footer
-st.markdown("""
----
-**Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
 """)

 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from dotenv import load_dotenv
 import pytesseract
 from PIL import Image
 import pdfplumber
 import docx
 from io import BytesIO
 import logging
 # Load environment variables
 load_dotenv()
 # Initialize logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 # Initialize LLM
 llm = ChatGroq(temperature=0.5, groq_api_key="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", model_name="llama3-8b-8192")
 # OCR Configuration for Pytesseract
+pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to your system's path
 # Function to extract text, images, tables, and formulas from PDF
 def extract_pdf_data(pdf_path):
     try:
         with pdfplumber.open(pdf_path) as pdf:
             for page in pdf.pages:
                 data["text"] += page.extract_text() or ""
                 tables = page.extract_tables()
                 for table in tables:
                     data["tables"].append(table)
                 for image in page.images:
                     base_image = pdf.extract_image(image["object_number"])
                     image_obj = Image.open(BytesIO(base_image["image"]))
 # Function to extract text from DOCX files
 def extract_docx_data(docx_file):
+    try:
+        doc = docx.Document(docx_file)
+        text = "\n".join([para.text.strip() for para in doc.paragraphs if para.text.strip()])
+        return text
+    except Exception as e:
+        logging.error(f"Error extracting DOCX content: {e}")
+        return ""
 # Function to extract text from plain text files
 def extract_text_file_data(text_file):
+    try:
+        return text_file.read().decode("utf-8").strip()
+    except Exception as e:
+        logging.error(f"Error extracting TXT content: {e}")
+        return ""
 # Function to extract text from images using OCR
 def extract_text_from_images(images):
     ocr_text = ""
     for image in images:
+        try:
+            ocr_text += pytesseract.image_to_string(image).strip() + "\n"
+        except Exception as e:
+            logging.error(f"Error in OCR: {e}")
+    return ocr_text.strip()
 # Function to process extracted content (PDF, DOCX, etc.)
+def process_content(file_data, file_type):
     text = ""
     images = []
     if file_type == "pdf":
         text = extract_docx_data(file_data)
     elif file_type == "txt":
         text = extract_text_file_data(file_data)
+    elif file_type in ["png", "jpg", "jpeg"]:
+        image = Image.open(file_data)
+        images.append(image)
     ocr_text = extract_text_from_images(images)
     return text + "\n" + ocr_text
 # Function to process PDF content
 def process_pdf_content(pdf_data):
     ocr_text = extract_text_from_images(pdf_data["images"])
     combined_text = pdf_data["text"] + ocr_text
     table_text = ""
     for table in pdf_data["tables"]:
+        table_rows = [" | ".join(str(cell) if cell else "" for cell in row) for row in table]
         table_text += "\n".join(table_rows) + "\n"
+    return (combined_text + "\n" + table_text).strip()
 # Function to generate questions
+def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
     prompt_template = f"""
     Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
     Subject: {subject_name}
+    Instructor: {instructor}
+    Class: {class_name}
+    Institution: {institution}
     Syllabus Content: {syllabus_context}
     Difficulty Levels:
 # Streamlit app
 st.title("Bloom's Taxonomy Based Exam Paper Developer")
+# Sidebar Clear Data Button
+if st.sidebar.button("Clear All Data"):
+    st.session_state.clear()
+    st.success("All data has been cleared. You can now upload a new syllabus.")
+# Syllabus Upload with Automatic Clearing
+uploaded_file = st.sidebar.file_uploader(
+    "Upload Syllabus (PDF, DOCX, TXT, Image)",
+    type=["pdf", "docx", "txt", "png", "jpg"]
+)
+# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
+subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
+instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
+class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
+institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
 if uploaded_file:
+    # Clear session state when a new file is uploaded
+    if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
+        st.session_state.clear()
+        st.success("Previous data cleared. Processing new file...")
+    st.session_state.uploaded_filename = uploaded_file.name
+    file_type = uploaded_file.type.split("/")[-1]
+    # Validate file type
+    if file_type not in ["pdf", "docx", "txt", "png", "jpg"]:
+        st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
+    else:
+        syllabus_text = process_content(uploaded_file, file_type)
+        st.session_state.syllabus_text = syllabus_text
 # Preview of Syllabus
+if "syllabus_text" in st.session_state:
     st.subheader("Syllabus Preview:")
+    st.text_area("Extracted Content", st.session_state.syllabus_text[:1000], height=300)
+else:
+    st.warning("Please upload a syllabus to begin.")
 # Question Type Selection
 question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
 difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
 num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)
 if st.sidebar.button("Generate Questions"):
+    if "syllabus_text" in st.session_state:
         with st.spinner(f"Generating {question_type}..."):
+            syllabus_context = st.session_state.syllabus_text
+            st.session_state.generated_questions = generate_questions(question_type, subject_name, instructor_name, class_name, institution_name, syllabus_context, num_questions, difficulty)
         st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
     else:
         st.error("Please upload a syllabus before generating questions.")
 if st.sidebar.button("Generate Answers for Questions"):
+    if "generated_questions" in st.session_state:
         with st.spinner("Generating answers..."):
+            syllabus_context = st.session_state.syllabus_text
             st.session_state.generated_answers = generate_answers(st.session_state.generated_questions, syllabus_context)
         st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
     else:
         st.error("Generate questions first before generating answers.")
+if "generated_questions" in st.session_state:
     st.sidebar.download_button(
         label="Download Questions",
         data=st.session_state.generated_questions,
         mime="text/plain",
     )
+if "generated_answers" in st.session_state:
     st.sidebar.download_button(
         label="Download Answers",
         data=st.session_state.generated_answers,
         mime="text/plain",
     )
+st.markdown("""
+---
+**Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
 """)