import json
import os
import re
import time

import numpy as np
import pdfplumber
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

load_dotenv()


def process_file(filepath, query=""):
    """Extract text from a PDF, split it into chunks, and, when a query is
    given, rank the chunks against it with TF-IDF cosine similarity."""
    content = []
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:  # extract_text() returns None for image-only pages
                content.append(text)

    # Join the extracted pages with blank lines between them
    full_text = "\n\n".join(content)

    # Apply chunking
    text_splitter = CharacterTextSplitter(chunk_size=50000, chunk_overlap=10)
    chunks = text_splitter.split_text(full_text)

    # With no meaningful query there is nothing to rank against,
    # so return every chunk
    if len(query) < 5:
        return chunks

    # Vectorize the query together with the chunks and score each chunk
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([query] + chunks)
    cosine_similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

    # Keep the top half of the chunks when there are many, otherwise keep all
    top_n = len(chunks) // 2 if len(chunks) > 8 else len(chunks)
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return [chunks[i] for i in top_indices]


def givemcqquestion(chunks, create, evaluate, analyze, apply, understand,
                    remember, level, questions):
    """Build a prompt asking the model for MCQs at the given difficulty
    level, weighted across Bloom's Taxonomy categories."""
    if level == 1:
        game = "easy and non-tricky, with simple options"
    elif level == 2:
        game = "tricky and of medium difficulty, with lengthy questions"
    elif level == 3:
        game = "hard, tricky, and lengthy"
    else:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    prompt = f"""You are an AI designed to generate high-quality multiple-choice questions (MCQs) for educational assessments, following Bloom’s Taxonomy. The questions should be {game}. Your task is to create a well-structured question that aligns with the given cognitive level, ensuring it accurately reflects the required depth of understanding.

Instructions:
For each Bloom’s Taxonomy level, follow the guidelines below to ensure appropriate question complexity:
- **Knowledge (Remembering)**: Formulate a factual or recall-based question.
- **Comprehension (Understanding)**: Create a question that requires explanation or interpretation.
- **Application (Applying)**: Develop a question that applies knowledge to a new situation.
- **Analysis (Analyzing)**: Design a question that encourages breaking down concepts.
- **Synthesis (Creating)**: Construct a question requiring idea combination or new approaches.
- **Evaluation (Evaluating)**: Generate a question that involves judgment or assessment.

STRICT RULES:
- Generate exactly **{questions} MCQ(s)** based on the given context and Bloom’s Taxonomy level.
- Return the response as a **structured JSON object** without any additional text.
- The question should reflect the complexity required for the given cognitive level.
- Options should be **plausible, with only one correct answer** clearly identifiable.
- Ensure a structured rubric to evaluate student responses.

Input Parameters:
- **Context**: {chunks} (relevant learning material)
- **Bloom’s Taxonomy Distribution**:
  - Understanding: {understand * 100}%
  - Analysis: {analyze * 100}%
  - Evaluation: {evaluate * 100}%
  - Synthesis: {create * 100}%
  - Application: {apply * 100}%
  - Knowledge: {remember * 100}%

Expected JSON Output Format:
{{
    "question": "",
    "options": {{
        "A": "",
        "B": "",
        "C": "",
        "D": ""
    }},
    "correct_answer": "",
    "rubric": ""
}}
"""
    return prompt
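

# ---------------------------------------------------------------------------
# Illustrative usage: a minimal sketch of wiring the two helpers above to the
# OpenAI chat API and parsing the JSON reply. The section ends before any
# model call is shown, so everything below is an assumption rather than the
# original pipeline: the input path "sample_notes.pdf", the model name
# "gpt-4o-mini", and the equal 1/6 Bloom's weights are all placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    chunks = process_file("sample_notes.pdf")  # hypothetical input PDF

    prompt = givemcqquestion(
        chunks=chunks[:1],  # keep the context small for a quick test
        create=1/6, evaluate=1/6, analyze=1/6,
        apply=1/6, understand=1/6, remember=1/6,
        level=2,      # medium difficulty
        questions=1,
    )

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # assumed model; use whichever chat model you have
        messages=[{"role": "user", "content": prompt}],
    )

    # The prompt asks for a bare JSON object, so the reply should parse
    # directly; a production version would wrap json.loads in a try/except.
    mcq = json.loads(response.choices[0].message.content)
    print(json.dumps(mcq, indent=2))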