Spaces:

jayantp2003
/

Bloomsphere-app

Sleeping

File size: 13,743 Bytes

078d100

import pdfplumber
from langchain.text_splitter import CharacterTextSplitter
from openai import OpenAI
import json
import numpy as np
import time
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import os
load_dotenv()

def process_file(filepath):
    """Process file and generate chunks"""
    content = []
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:  # Avoid NoneType errors
                content.append(text)

    # Join extracted text with proper spacing
    full_text = "\n\n".join(content)
    
    # Apply chunking
    text_splitter = CharacterTextSplitter(
        chunk_size=50000,
        chunk_overlap=10
    )

    chunks = text_splitter.split_text(full_text)

    # Vectorize and get similarities
    query = ""
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([query] + chunks)
    cosine_similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()
    
    # Select top chunks
    if len(chunks) > 8:
        top_n = int(len(chunks)/2)
    else:
        top_n = len(chunks)
    if len(query) < 5:
        top_n = len(chunks)
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]

    return chunks

    
def givemcqquestion(chunks, create, evaluate, analyze, apply, understand, remember, level,questions):
    if level == 1:
        game = "easy and non-tricky with simple options"
    elif level == 2:
        game = "tricky and medium-level lengthy questions"
    elif level == 3:
        game = "hard and tricky and lengthy questions"
    else:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    prompt = f"""You are an AI designed to generate high-quality {game} level multiple-choice questions (MCQs) for educational assessments, following Bloom’s Taxonomy. Your task is to create a well-structured question that aligns with the given cognitive level, ensuring it accurately reflects the required depth of understanding.



Instructions:

For each Bloom’s Taxonomy level, follow the guidelines below to ensure appropriate question complexity:

- **Knowledge (Remembering)**: Formulate a factual or recall-based question.

- **Comprehension (Understanding)**: Create a question that requires explanation or interpretation.

- **Application (Applying)**: Develop a question that applies knowledge to a new situation.

- **Analysis (Analyzing)**: Design a question that encourages breaking down concepts.

- **Synthesis (Creating)**: Construct a question requiring idea combination or new approaches.

- **Evaluation (Evaluating)**: Generate a question that involves judgment or assessment.



STRICT RULES:

- Generate exactly **{questions} MCQ** based on the given context and Bloom’s Taxonomy level.

- Return the response as a **structured JSON object** without any additional text.

- The question should reflect the complexity required for the given cognitive level.

- Options should be **plausible, with only one correct answer** clearly identifiable.

- Ensure a structured rubric to evaluate student responses.



Input Parameters:

- **Context**: {chunks} (Relevant learning material)

- **Bloom’s Taxonomy Distribution**:

  - Understanding: {understand*100}%

  - Analysis: {analyze*100}%

  - Evaluation: {evaluate*100}%

  - Synthesis: {create*100}%

  - Application: {apply*100}%

  - Knowledge: {remember*100}%



Expected JSON Output Format:



{{

  "question": "<Your MCQ Question>",

  "options": {{

    "A": "<Option A>",

    "B": "<Option B>",

    "C": "<Option C>",

    "D": "<Option D>"

  }},

  "correct_answer": "<Correct Option>",

  "rubric": {{

    "key_concept_assessed": "<Briefly explain what is being tested>",

    "criteria_for_correct_answer": "<Explain why the correct answer is correct>",

    "common_misconceptions": "<List potential incorrect assumptions>",

    "cognitive_skill_tested": "<Describe how it aligns with Bloom’s Taxonomy>"

  }}

}}



"""
    print("API KEY",os.getenv("API_KEY"))
    print("BASE URL",os.getenv("GENERATOR_BASE_URL"))
    print("MODEL NAME",os.getenv("MODEL_NAME"))
    client = OpenAI(
        api_key=os.getenv("API_KEY"),
        base_url=os.getenv("GENERATOR_BASE_URL")
    )

    response = client.chat.completions.create(
        model=os.getenv("MODEL_NAME"),
        messages=[{"role": "user", "content": prompt}]
    )

    mcq = response.choices[0].message.content
    if "```json" in mcq:
      mcq = mcq.replace("```json","")
      mcq = mcq.replace("```","")
    mcq = mcq.replace("\n","")
    mcq = json.loads(mcq)
    return mcq,prompt


def givetruefalsequestion(chunks, create, evaluate, analyze, apply, understand, remember, level,questions):
    if level == 1:
        game = "easy and straightforward statements"
    elif level == 2:
        game = "moderate complexity with slight trickiness"
    elif level == 3:
        game = "complex and tricky statements requiring deep understanding"
    else:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    prompt = f"""You are an AI designed to generate high-quality {game} level **True/False** questions for educational assessments, following Bloom’s Taxonomy. Your task is to create a well-structured **True/False** question that aligns with the given cognitive level, ensuring it accurately reflects the required depth of understanding.



    ### **Instructions:**

    For each Bloom’s Taxonomy level, follow the guidelines below to ensure appropriate question complexity:

    - **Knowledge (Remembering)**: Generate a straightforward fact-based statement.

    - **Comprehension (Understanding)**: Formulate a statement that requires explanation or interpretation.

    - **Application (Applying)**: Develop a statement that applies knowledge to a new situation.

    - **Analysis (Analyzing)**: Design a statement that involves breaking down concepts.

    - **Synthesis (Creating)**: Construct a statement requiring combining ideas or new approaches.

    - **Evaluation (Evaluating)**: Generate a statement requiring judgment or assessment.



    ### **STRICT RULES:**

    - Generate exactly **{questions}** True/False question.

    - Return the response as a **structured JSON object** without any additional text.

    - The question should reflect the complexity required for the given cognitive level.

    - Ensure a structured rubric to evaluate student responses.



    ### **Input Parameters:**

    - **Context**: {chunks} (Relevant learning material)

    - **Bloom’s Taxonomy Distribution**:

    - Understanding: {understand*100}%

    - Analysis: {analyze*100}%

    - Evaluation: {evaluate*100}%

    - Synthesis: {create*100}%

    - Application: {apply*100}%

    - Knowledge: {remember*100}%



    ### **Expected JSON Output Format:**

    ```json

    {{

    "statement": "<Your True/False Statement>",

    "correct_answer": "<True or False>",

    "rubric": {{

        "key_concept_assessed": "<Briefly explain what is being tested>",

        "criteria_for_correct_answer": "<Explain why the correct answer is correct>",

        "common_misconceptions": "<List potential incorrect assumptions>",

        "cognitive_skill_tested": "<Describe how it aligns with Bloom’s Taxonomy>"

    }}

    }}

    """
    client = OpenAI(
        api_key=os.getenv("API_KEY"),
        base_url=os.getenv("GENERATOR_BASE_URL")
    )

    response = client.chat.completions.create(
        model=os.getenv("MODEL_NAME"),
        messages=[{"role": "user", "content": prompt}]
    )

    tf_question = response.choices[0].message.content
    if "```json" in tf_question:
        tf_question = tf_question.replace("```json", "").replace("```", "")
    tf_question = tf_question.replace("\n", "")
    tf_question = json.loads(tf_question)
    return tf_question




def giveopenquestion(chunks, create, evaluate, analyze, apply, understand, remember, level, questions):
    # Validate input parameters
    bloom_params = {
        'create': create,
        'evaluate': evaluate,
        'analyze': analyze,
        'apply': apply,
        'understand': understand,
        'remember': remember
    }
    
    if not all(0 <= val <= 1 for val in bloom_params.values()):
        raise ValueError("All Bloom's parameters must be between 0 and 1")
    
    if level not in [1, 2, 3]:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    # Complexity description
    complexity_levels = {
        1: "simple recall-based questions",
        2: "moderate explanation questions", 
        3: "complex analytical questions"
    }
    complexity = complexity_levels.get(level)

    prompt = f"""Generate {questions} open-ended question(s) based on the provided context, strictly following these requirements:



    ### CONTEXT:

    {chunks}



    ### BLOOM'S TAXONOMY DISTRIBUTION:

    - Creating: {create*100}%

    - Evaluating: {evaluate*100}%

    - Analyzing: {analyze*100}%

    - Applying: {apply*100}%

    - Understanding: {understand*100}%

    - Remembering: {remember*100}%



    ### COGNITIVE LEVEL:

    {complexity} (Level {level})



    ### OUTPUT REQUIREMENTS:

    - Return ONLY valid JSON format

    - Include detailed rubric with cognitive skill mapping

    - For each question, specify which Bloom's level it primarily targets



    ### RESPONSE FORMAT:

    ```json

    {{

        "metadata": {{

            "blooms_distribution": {{

                "create": {create},

                "evaluate": {evaluate},

                "analyze": {analyze},

                "apply": {apply},

                "understand": {understand},

                "remember": {remember}

            }},

            "complexity_level": {level}

        }},

        "questions": [

            {{

                "question": "Question text",

                "primary_blooms_level": "create|evaluate|analyze|apply|understand|remember",

                "rubric": {{

                    "key_concept": "...",

                    "criteria": "...",

                    "misconceptions": "...",

                    "cognitive_skills": {{

                        "primary": "...",

                        "secondary": ["...", "..."]

                    }}

                }}

            }}

        ]

    }}

    ```

    """

    client = OpenAI(
        api_key=os.getenv("API_KEY"),
        base_url=os.getenv("GENERATOR_BASE_URL")
    )

    try:
        response = client.chat.completions.create(
            model=os.getenv("MODEL_NAME"),
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=1200
        )
        raw_response = response.choices[0].message.content

        # Extract JSON from response
        json_match = re.search(r'```json(.*?)```', raw_response, re.DOTALL)
        json_str = json_match.group(1).strip() if json_match else raw_response.strip()

        # Parse and validate JSON
        result = json.loads(json_str)
        
        # Validate structure
        if not all(key in result for key in ['metadata', 'questions']):
            raise ValueError("Response missing required fields")
            
        return result

    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        print(f"Problematic response:\n{raw_response}")
        raise
    except Exception as e:
        print(f"API Error: {e}")
        raise
    


def generate_questions_from_file(filepath, mcq, tf, qna, create, evaluate, analyze, apply, understand, remember, level):
    """Main function to generate questions from file"""
    # Process file first
    chunks = process_file(filepath)
    
    # Generate questions using existing functionality
    MAX_RETRIES = 3
    RETRY_DELAY = 1
    
    def get_random_chunk():
        return chunks[np.random.randint(len(chunks))] if chunks else ""
    
    def generate_questions(q_type, count, generator):
        results = []
        for _ in range(count):
            for attempt in range(MAX_RETRIES):
                try:
                    chunk = get_random_chunk()
                    question = generator(chunk, create, evaluate, analyze, apply, 
                                       understand, remember, level, questions=1)
                    results.append(question)
                    break
                except Exception as e:
                    print(f"Error generating {q_type} question (attempt {attempt+1}): {str(e)}")
                    if attempt == MAX_RETRIES - 1:
                        results.append({"error": f"Failed to generate {q_type} question"})
                    time.sleep(RETRY_DELAY)
        return results

    return {
        'mcq': generate_questions("MCQ", mcq, givemcqquestion),
        'tf': generate_questions("True/False", tf, givetruefalsequestion),
        'qna': generate_questions("Q&A", qna, giveopenquestion)
    }

if __name__ == "__main__":
    # Example usage
    filepath = "data/eco.pdf"
    mcq = 1
    tf = 1
    qna = 1
    level = 1
    create = 0.2
    evaluate = 0.2
    analyze = 0.2
    apply = 0.2
    understand = 0.2
    remember = 0.2
    
    questions = generate_questions_from_file(
        filepath, mcq, tf, qna, create, evaluate, 
        analyze, apply, understand, remember, level
    )
    print(questions)