File size: 3,397 Bytes
9ec092d
eb520df
 
af56656
9ec092d
af56656
a676c64
272ae2f
a676c64
 
272ae2f
a676c64
 
272ae2f
 
a676c64
 
 
272ae2f
a676c64
 
 
 
 
 
c6b9400
272ae2f
c6b9400
272ae2f
 
c2a3f49
272ae2f
c2a3f49
eb520df
a676c64
272ae2f
 
 
af56656
 
272ae2f
 
 
 
c6b9400
272ae2f
c2a3f49
 
 
 
 
 
 
272ae2f
c6b9400
c2a3f49
272ae2f
c2a3f49
272ae2f
 
c6b9400
c2a3f49
272ae2f
c2a3f49
272ae2f
 
af56656
c2a3f49
272ae2f
c2a3f49
272ae2f
 
a676c64
c2a3f49
 
 
6923530
c2a3f49
272ae2f
 
c2a3f49
272ae2f
 
c2a3f49
272ae2f
af56656
272ae2f
af56656
272ae2f
c2a3f49
272ae2f
6923530
af56656
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
import re
import pandas as pd
from typing import List, Dict

# Patterns used by extract_mcqs_from_file, compiled once at import time.
# A question line starts with "Q<n>" or a 1-3 digit number followed by ")" or ".".
_MCQ_SEGMENT_SPLIT_RE = re.compile(r"(?m)(?=^\s*(?:Q\d+|\d{1,3})[\)\.])")
_MCQ_QUESTION_RE = re.compile(r"^(?:Q\d+|(\d{1,3}))[\)\.]\s*(.*)")
# An option line starts with a single letter A-D (either case) followed by ")" or ".".
_MCQ_OPTION_RE = re.compile(r"^([A-Da-d])[\)\.]\s*(.*)")
# The "\b" after the letter keeps "Answer: Delhi" from being read as answer "D";
# "." is accepted as a separator so that "Ans. B" is recognised.
_MCQ_ANSWER_RE = re.compile(
    r"^(?:Answer|Ans|Correct answer)[\s:\-\.]*\(?([A-Da-d])\b\)?.*",
    re.IGNORECASE,
)
_MCQ_EXPLANATION_RE = re.compile(
    r"^(?:Explanation|Why|Because)[\s:\-]*(.*)", re.IGNORECASE
)
# Lines that look like the start of a question or an option are never treated
# as free-text continuation of an explanation.
_MCQ_NEW_ITEM_RE = re.compile(r"^(?:Q\d+|[A-Da-d][\)\.]|\d{1,3}[\)\.])")


def extract_mcqs_from_file(filepath: str, raw_text: str = None) -> List[Dict]:
    """Extract multiple-choice questions from a document or from raw text.

    When ``raw_text`` is empty or ``None`` the content is loaded from
    ``filepath`` according to its extension:

    * ``pdf``  - text of every page (via ``pdfplumber``), joined with newlines.
    * ``docx`` - text of every paragraph (via ``docx.Document``), joined with
      newlines.
    * ``xls`` / ``xlsx`` / ``csv`` - read with pandas and returned immediately
      as one dict per row, keyed by column name. No MCQ parsing is applied to
      these formats.

    The text is then split into segments at every line that starts with a
    question number (``Q12.``, ``3)``, ``7.`` ...). Inside a segment, lines
    are classified in order as question text, options (``A)`` .. ``D.``),
    an answer line (``Answer: B``, ``Ans. (c)``, ``Correct answer - A``) or
    an explanation (``Explanation:``, ``Why``, ``Because``). Unlabelled lines
    that follow an answer are appended to the explanation.

    Args:
        filepath: Path of the file to read. Only used when ``raw_text`` is
            empty or ``None``.
        raw_text: Optional already-extracted text to parse instead of
            reading ``filepath``.

    Returns:
        A list of dicts with keys ``question`` (str), ``options`` (list of
        str), ``answer`` (upper-case letter, or ``''`` when no answer line
        was recognised) and ``explanation`` (str). Segments without a
        question or with fewer than two options are discarded. For
        spreadsheet/CSV input the raw row dicts are returned instead.

    Raises:
        ValueError: If ``raw_text`` is not given and the extension of
            ``filepath`` is not one of the supported formats.
    """
    if not raw_text:
        ext = filepath.rsplit('.', 1)[-1].lower()
        if ext == 'pdf':
            import pdfplumber
            with pdfplumber.open(filepath) as pdf:
                # extract_text() may yield None for a page; treat it as empty.
                pages = [page.extract_text() or '' for page in pdf.pages]
            raw_text = '\n'.join(pages)
        elif ext == 'docx':
            from docx import Document
            doc = Document(filepath)
            raw_text = '\n'.join(p.text for p in doc.paragraphs)
        elif ext in ['xls', 'xlsx']:
            df = pd.read_excel(filepath)
            return df.to_dict(orient='records')
        elif ext == 'csv':
            df = pd.read_csv(filepath)
            return df.to_dict(orient='records')
        else:
            raise ValueError('Unsupported file format')

    # Normalize newlines
    text = re.sub(r"\r\n?", "\n", raw_text)
    # Split text into segments starting with numbered questions
    segments = _MCQ_SEGMENT_SPLIT_RE.split(text)

    mcqs = []

    for seg in segments:
        lines = [ln.strip() for ln in seg.splitlines() if ln.strip()]
        if not lines:
            continue

        question = ''
        options = []
        answer = ''
        explanation = ''

        for ln in lines:
            # The first numbered line of the segment carries the question text.
            q_match = _MCQ_QUESTION_RE.match(ln)
            if q_match and not question:
                question = q_match.group(2).strip()
                continue

            opt_match = _MCQ_OPTION_RE.match(ln)

            # Until the first option appears, any other line continues the question.
            if not options and not answer and not opt_match:
                question += ' ' + ln
                continue

            # Identify options
            if opt_match:
                options.append(opt_match.group(2).strip())
                continue

            # Identify answer
            ans_match = _MCQ_ANSWER_RE.match(ln)
            if ans_match:
                answer = ans_match.group(1).upper()
                continue

            # Identify explicit explanation
            exp_match = _MCQ_EXPLANATION_RE.match(ln)
            if exp_match:
                explanation = exp_match.group(1).strip()
                continue

            # Implicit continuation of explanation (only once an answer was seen)
            if answer and not _MCQ_NEW_ITEM_RE.match(ln):
                explanation += ' ' + ln

        # Save only valid MCQs (at least question + 2 options)
        if question and len(options) >= 2:
            mcqs.append({
                'question': question.strip(),
                'options': options,
                'answer': answer,
                'explanation': explanation.strip()
            })

    return mcqs

# ========================
# Split MCQs into chunks
# ========================

def split_large_mcq_list(mcqs: List[Dict], max_per_chunk: int = 500) -> List[List[Dict]]:
    """Split a list of MCQs into consecutive chunks of bounded size.

    Args:
        mcqs: The MCQ dicts to split.
        max_per_chunk: Maximum number of MCQs per chunk; must be at least 1.

    Returns:
        A list of chunks in the original order. When ``mcqs`` already fits
        in one chunk (including the empty list) the result is ``[mcqs]``,
        i.e. the same list object wrapped in a one-element list; otherwise
        each chunk is a new slice of ``mcqs``.

    Raises:
        ValueError: If ``max_per_chunk`` is smaller than 1. Without this
            check a negative size would yield no chunks at all and silently
            drop every MCQ.
    """
    if max_per_chunk < 1:
        raise ValueError('max_per_chunk must be a positive integer')
    if len(mcqs) <= max_per_chunk:
        return [mcqs]
    return [
        mcqs[start:start + max_per_chunk]
        for start in range(0, len(mcqs), max_per_chunk)
    ]