import pdfplumber
from langchain.text_splitter import CharacterTextSplitter
from openai import OpenAI
import json
import numpy as np
import time
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import os
load_dotenv()
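
# Configuration expected in .env (loaded above):
#   API_KEY             - key for the OpenAI-compatible endpoint
#   GENERATOR_BASE_URL  - base URL handed to the OpenAI client
#   MODEL_NAME          - model identifier used for chat completions


# Shared helpers used by the question generators below; they factor out the
# client construction and JSON-extraction logic that each generator repeats.
def _make_client():
    """Build the OpenAI-compatible chat client from environment settings."""
    return OpenAI(
        api_key=os.getenv("API_KEY"),
        base_url=os.getenv("GENERATOR_BASE_URL")
    )


def _parse_json_response(raw):
    """Parse a model response that may wrap its JSON in ```json fences."""
    match = re.search(r"```json(.*?)```", raw, re.DOTALL)
    json_str = match.group(1).strip() if match else raw.strip()
    return json.loads(json_str)
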

def process_file(filepath, query=""):
    """Extract text from a PDF and split it into chunks.

    If a query of five or more characters is supplied, chunks are ranked by
    TF-IDF cosine similarity to the query and only the most relevant ones
    are returned; otherwise every chunk is returned unranked.
    """
    content = []
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:  # extract_text() returns None for image-only pages
                content.append(text)

    # Join page texts with blank lines between pages
    full_text = "\n\n".join(content)

    # Split into large chunks with a small overlap
    text_splitter = CharacterTextSplitter(
        chunk_size=50000,
        chunk_overlap=10
    )
    chunks = text_splitter.split_text(full_text)

    # A short (or empty) query carries no signal, so skip similarity ranking
    if len(query) < 5 or len(chunks) <= 1:
        return chunks

    # Rank chunks by TF-IDF cosine similarity to the query
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([query] + chunks)
    cosine_similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

    # Keep the top half when there are many chunks, otherwise keep them all
    top_n = len(chunks) // 2 if len(chunks) > 8 else len(chunks)
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return [chunks[i] for i in top_indices]
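
# Example (hypothetical query string): rank chunks against a topic so only
# the most relevant material is sent to the model:
#   relevant = process_file("data/eco.pdf", query="market equilibrium")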

    
def givemcqquestion(chunks, create, evaluate, analyze, apply, understand, remember, level, questions):
    """Generate MCQ(s) from a context chunk and return the parsed result."""
    if level == 1:
        difficulty = "easy and non-tricky, with simple options"
    elif level == 2:
        difficulty = "tricky, medium-length"
    elif level == 3:
        difficulty = "hard, tricky, and lengthy"
    else:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    prompt = f"""You are an AI designed to generate high-quality {difficulty} multiple-choice questions (MCQs) for educational assessments, following Bloom’s Taxonomy. Your task is to create a well-structured question that aligns with the given cognitive level, ensuring it accurately reflects the required depth of understanding.

Instructions:
For each Bloom’s Taxonomy level, follow the guidelines below to ensure appropriate question complexity:
- **Knowledge (Remembering)**: Formulate a factual or recall-based question.
- **Comprehension (Understanding)**: Create a question that requires explanation or interpretation.
- **Application (Applying)**: Develop a question that applies knowledge to a new situation.
- **Analysis (Analyzing)**: Design a question that encourages breaking down concepts.
- **Synthesis (Creating)**: Construct a question requiring idea combination or new approaches.
- **Evaluation (Evaluating)**: Generate a question that involves judgment or assessment.

STRICT RULES:
- Generate exactly **{questions} MCQ(s)** based on the given context and Bloom’s Taxonomy level.
- Return the response as a **structured JSON object** without any additional text.
- The question should reflect the complexity required for the given cognitive level.
- Options should be **plausible, with only one correct answer** clearly identifiable.
- Ensure a structured rubric to evaluate student responses.

Input Parameters:
- **Context**: {chunks} (Relevant learning material)
- **Bloom’s Taxonomy Distribution**:
  - Understanding: {understand*100}%
  - Analysis: {analyze*100}%
  - Evaluation: {evaluate*100}%
  - Synthesis: {create*100}%
  - Application: {apply*100}%
  - Knowledge: {remember*100}%

Expected JSON Output Format:

{{
  "question": "<Your MCQ Question>",
  "options": {{
    "A": "<Option A>",
    "B": "<Option B>",
    "C": "<Option C>",
    "D": "<Option D>"
  }},
  "correct_answer": "<Correct Option>",
  "rubric": {{
    "key_concept_assessed": "<Briefly explain what is being tested>",
    "criteria_for_correct_answer": "<Explain why the correct answer is correct>",
    "common_misconceptions": "<List potential incorrect assumptions>",
    "cognitive_skill_tested": "<Describe how it aligns with Bloom’s Taxonomy>"
  }}
}}
"""
    print("API KEY",os.getenv("API_KEY"))
    print("BASE URL",os.getenv("GENERATOR_BASE_URL"))
    print("MODEL NAME",os.getenv("MODEL_NAME"))
    client = OpenAI(
        api_key=os.getenv("API_KEY"),
        base_url=os.getenv("GENERATOR_BASE_URL")
    )

    response = client.chat.completions.create(
        model=os.getenv("MODEL_NAME"),
        messages=[{"role": "user", "content": prompt}]
    )

    mcq = response.choices[0].message.content
    if "```json" in mcq:
      mcq = mcq.replace("```json","")
      mcq = mcq.replace("```","")
    mcq = mcq.replace("\n","")
    mcq = json.loads(mcq)
    return mcq,prompt
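
# Example call (illustrative weights summing to 1.0; the pipeline below
# always requests one question per call):
#   mcq = givemcqquestion(chunk, create=0.1, evaluate=0.2, analyze=0.2,
#                         apply=0.2, understand=0.2, remember=0.1,
#                         level=1, questions=1)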


def givetruefalsequestion(chunks, create, evaluate, analyze, apply, understand, remember, level, questions):
    """Generate True/False question(s) from a context chunk and return the parsed result."""
    if level == 1:
        difficulty = "easy and straightforward"
    elif level == 2:
        difficulty = "moderately complex and slightly tricky"
    elif level == 3:
        difficulty = "complex and tricky, requiring deep understanding"
    else:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    prompt = f"""You are an AI designed to generate high-quality {difficulty} **True/False** questions for educational assessments, following Bloom’s Taxonomy. Your task is to create a well-structured **True/False** question that aligns with the given cognitive level, ensuring it accurately reflects the required depth of understanding.

    ### **Instructions:**
    For each Bloom’s Taxonomy level, follow the guidelines below to ensure appropriate question complexity:
    - **Knowledge (Remembering)**: Generate a straightforward fact-based statement.
    - **Comprehension (Understanding)**: Formulate a statement that requires explanation or interpretation.
    - **Application (Applying)**: Develop a statement that applies knowledge to a new situation.
    - **Analysis (Analyzing)**: Design a statement that involves breaking down concepts.
    - **Synthesis (Creating)**: Construct a statement requiring combining ideas or new approaches.
    - **Evaluation (Evaluating)**: Generate a statement requiring judgment or assessment.

    ### **STRICT RULES:**
    - Generate exactly **{questions}** True/False question(s).
    - Return the response as a **structured JSON object** without any additional text.
    - The question should reflect the complexity required for the given cognitive level.
    - Ensure a structured rubric to evaluate student responses.

    ### **Input Parameters:**
    - **Context**: {chunks} (Relevant learning material)
    - **Bloom’s Taxonomy Distribution**:
      - Understanding: {understand*100}%
      - Analysis: {analyze*100}%
      - Evaluation: {evaluate*100}%
      - Synthesis: {create*100}%
      - Application: {apply*100}%
      - Knowledge: {remember*100}%

    ### **Expected JSON Output Format:**
    ```json
    {{
    "statement": "<Your True/False Statement>",
    "correct_answer": "<True or False>",
    "rubric": {{
        "key_concept_assessed": "<Briefly explain what is being tested>",
        "criteria_for_correct_answer": "<Explain why the correct answer is correct>",
        "common_misconceptions": "<List potential incorrect assumptions>",
        "cognitive_skill_tested": "<Describe how it aligns with Bloom’s Taxonomy>"
    }}
    }}
    ```
    """
    client = _make_client()

    response = client.chat.completions.create(
        model=os.getenv("MODEL_NAME"),
        messages=[{"role": "user", "content": prompt}]
    )

    tf_question = _parse_json_response(response.choices[0].message.content)
    return tf_question




def giveopenquestion(chunks, create, evaluate, analyze, apply, understand, remember, level, questions):
    # Validate input parameters
    bloom_params = {
        'create': create,
        'evaluate': evaluate,
        'analyze': analyze,
        'apply': apply,
        'understand': understand,
        'remember': remember
    }
    
    if not all(0 <= val <= 1 for val in bloom_params.values()):
        raise ValueError("All Bloom's parameters must be between 0 and 1")
    
    if level not in [1, 2, 3]:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    # Complexity description
    complexity_levels = {
        1: "simple recall-based questions",
        2: "moderate explanation questions", 
        3: "complex analytical questions"
    }
    complexity = complexity_levels.get(level)

    prompt = f"""Generate {questions} open-ended question(s) based on the provided context, strictly following these requirements:

    ### CONTEXT:
    {chunks}

    ### BLOOM'S TAXONOMY DISTRIBUTION:
    - Creating: {create*100}%
    - Evaluating: {evaluate*100}%
    - Analyzing: {analyze*100}%
    - Applying: {apply*100}%
    - Understanding: {understand*100}%
    - Remembering: {remember*100}%

    ### COGNITIVE LEVEL:
    {complexity} (Level {level})

    ### OUTPUT REQUIREMENTS:
    - Return ONLY valid JSON format
    - Include a detailed rubric with cognitive skill mapping
    - For each question, specify which Bloom's level it primarily targets

    ### RESPONSE FORMAT:
    ```json
    {{
        "metadata": {{
            "blooms_distribution": {{
                "create": {create},
                "evaluate": {evaluate},
                "analyze": {analyze},
                "apply": {apply},
                "understand": {understand},
                "remember": {remember}
            }},
            "complexity_level": {level}
        }},
        "questions": [
            {{
                "question": "Question text",
                "primary_blooms_level": "create|evaluate|analyze|apply|understand|remember",
                "rubric": {{
                    "key_concept": "...",
                    "criteria": "...",
                    "misconceptions": "...",
                    "cognitive_skills": {{
                        "primary": "...",
                        "secondary": ["...", "..."]
                    }}
                }}
            }}
        ]
    }}
    ```
    """

    client = _make_client()

    try:
        response = client.chat.completions.create(
            model=os.getenv("MODEL_NAME"),
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=1200
        )
        raw_response = response.choices[0].message.content

        # Parse the JSON payload (fenced or bare)
        result = _parse_json_response(raw_response)
        
        # Validate structure
        if not all(key in result for key in ['metadata', 'questions']):
            raise ValueError("Response missing required fields")
            
        return result

    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        print(f"Problematic response:\n{raw_response}")
        raise
    except Exception as e:
        print(f"API Error: {e}")
        raise
    


def generate_questions_from_file(filepath, mcq, tf, qna, create, evaluate, analyze, apply, understand, remember, level):
    """Main function to generate questions from file"""
    # Process file first
    chunks = process_file(filepath)
    
    # Generate questions using existing functionality
    MAX_RETRIES = 3
    RETRY_DELAY = 1
    
    def get_random_chunk():
        return chunks[np.random.randint(len(chunks))] if chunks else ""
    
    def generate_questions(q_type, count, generator):
        results = []
        for _ in range(count):
            for attempt in range(MAX_RETRIES):
                try:
                    chunk = get_random_chunk()
                    question = generator(chunk, create, evaluate, analyze, apply, 
                                       understand, remember, level, questions=1)
                    results.append(question)
                    break
                except Exception as e:
                    print(f"Error generating {q_type} question (attempt {attempt+1}): {str(e)}")
                    if attempt == MAX_RETRIES - 1:
                        results.append({"error": f"Failed to generate {q_type} question"})
                    time.sleep(RETRY_DELAY)
        return results

    return {
        'mcq': generate_questions("MCQ", mcq, givemcqquestion),
        'tf': generate_questions("True/False", tf, givetruefalsequestion),
        'qna': generate_questions("Q&A", qna, giveopenquestion)
    }

if __name__ == "__main__":
    # Example usage
    filepath = "data/eco.pdf"
    mcq = 1
    tf = 1
    qna = 1
    level = 1
    # Bloom's weights describe a distribution, so they should sum to 1.0
    create = 0.1
    evaluate = 0.2
    analyze = 0.2
    apply = 0.2
    understand = 0.2
    remember = 0.1
    
    questions = generate_questions_from_file(
        filepath, mcq, tf, qna, create, evaluate, 
        analyze, apply, understand, remember, level
    )
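    # Result shape: {"mcq": [...], "tf": [...], "qna": [...]}, one parsed
    # question dict (or an error entry) per requested question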
    print(questions)