Spaces:
Sleeping
Sleeping
File size: 3,397 Bytes
9ec092d eb520df af56656 9ec092d af56656 a676c64 272ae2f a676c64 272ae2f a676c64 272ae2f a676c64 272ae2f a676c64 c6b9400 272ae2f c6b9400 272ae2f c2a3f49 272ae2f c2a3f49 eb520df a676c64 272ae2f af56656 272ae2f c6b9400 272ae2f c2a3f49 272ae2f c6b9400 c2a3f49 272ae2f c2a3f49 272ae2f c6b9400 c2a3f49 272ae2f c2a3f49 272ae2f af56656 c2a3f49 272ae2f c2a3f49 272ae2f a676c64 c2a3f49 6923530 c2a3f49 272ae2f c2a3f49 272ae2f c2a3f49 272ae2f af56656 272ae2f af56656 272ae2f c2a3f49 272ae2f 6923530 af56656 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import os
import re
import pandas as pd
from typing import List, Dict
# Patterns used by the MCQ parser, compiled once instead of on every line.
# A question starts a new segment: "Q12." / "12)" / "3." at the start of a line.
_SEGMENT_SPLIT_RE = re.compile(r"(?m)(?=^\s*(?:Q\d+|\d{1,3})[\)\.])")
_QUESTION_RE = re.compile(r"^(?:Q\d+|(\d{1,3}))[\)\.]\s*(.*)")
_OPTION_RE = re.compile(r"^([A-Da-d])[\)\.]\s*(.*)")
# The (?!\w) guard requires the answer letter to stand alone, so that
# "Answer: All of the above" or "Answer: Delhi" is not misread as A / D.
_ANSWER_RE = re.compile(
    r"^(?:Answer|Ans|Correct answer)[\s:\-]*\(?([A-Da-d])(?!\w)",
    re.IGNORECASE,
)
_EXPLANATION_RE = re.compile(r"^(?:Explanation|Why|Because)[\s:\-]*(.*)", re.IGNORECASE)
# Lines that look like a question/option marker are never explanation text.
_STRUCTURAL_LINE_RE = re.compile(r"^(?:Q\d+|[A-Da-d][\)\.]|\d{1,3}[\)\.])")


def _parse_mcq_segment(lines: List[str]) -> Dict:
    """Parse the stripped, non-empty lines of one question segment into an MCQ dict."""
    question = ''
    options = []
    answer = ''
    explanation = ''

    for ln in lines:
        q_match = _QUESTION_RE.match(ln)
        opt_match = _OPTION_RE.match(ln)

        # The first numbered line supplies the question text.
        if q_match and not question:
            question = q_match.group(2).strip()
            continue
        # Until the first option or answer appears, any non-option line is
        # treated as a wrapped continuation of the question.
        if not options and not answer and not opt_match:
            question += ' ' + ln
            continue

        if opt_match:
            options.append(opt_match.group(2).strip())
            continue

        ans_match = _ANSWER_RE.match(ln)
        if ans_match:
            answer = ans_match.group(1).upper()
            continue

        exp_match = _EXPLANATION_RE.match(ln)
        if exp_match:
            explanation = exp_match.group(1).strip()
            continue

        # Once an answer has been seen, remaining free text is folded into
        # the explanation; anything else that reaches this point is dropped.
        if answer and not _STRUCTURAL_LINE_RE.match(ln):
            explanation += ' ' + ln

    return {
        'question': question.strip(),
        'options': options,
        'answer': answer,
        'explanation': explanation.strip(),
    }


def extract_mcqs_from_file(filepath: str, raw_text: str = None) -> List[Dict]:
    """Extract multiple-choice questions from a file or from already-extracted text.

    Args:
        filepath: Path of the source file. Only consulted when ``raw_text`` is
            empty/None; its extension selects the reader (pdf, docx, xls,
            xlsx, csv).
        raw_text: Optional text to parse directly. When truthy, ``filepath``
            is not read at all.

    Returns:
        For text sources (``raw_text``, pdf, docx): a list of dicts with keys
        ``question`` (str), ``options`` (list of str), ``answer`` (upper-case
        letter A-D, or '' when none was recognised) and ``explanation`` (str).
        Only segments with a question and at least two options are kept.
        For xls/xlsx/csv: the rows of the sheet as returned by
        ``DataFrame.to_dict(orient='records')``, without any MCQ parsing.

    Raises:
        ValueError: If ``raw_text`` is empty and the file extension is not
            one of the supported formats.
    """
    if not raw_text:
        ext = filepath.rsplit('.', 1)[-1].lower()
        if ext == 'pdf':
            # Imported lazily so the dependency is only needed for PDF input.
            import pdfplumber
            with pdfplumber.open(filepath) as pdf:
                pages = [page.extract_text() or '' for page in pdf.pages]
            raw_text = '\n'.join(pages)
        elif ext == 'docx':
            # Imported lazily so the dependency is only needed for DOCX input.
            from docx import Document
            doc = Document(filepath)
            raw_text = '\n'.join(p.text for p in doc.paragraphs)
        elif ext in ('xls', 'xlsx'):
            return pd.read_excel(filepath).to_dict(orient='records')
        elif ext == 'csv':
            return pd.read_csv(filepath).to_dict(orient='records')
        else:
            raise ValueError('Unsupported file format')

    # Normalize Windows / old-Mac newlines so the multiline anchors work.
    text = re.sub(r"\r\n?", "\n", raw_text)

    mcqs = []
    # Each segment begins at a numbered question line.
    for seg in _SEGMENT_SPLIT_RE.split(text):
        lines = [ln.strip() for ln in seg.splitlines() if ln.strip()]
        if not lines:
            continue
        mcq = _parse_mcq_segment(lines)
        # Keep only plausible MCQs: a question plus at least two options.
        if mcq['question'] and len(mcq['options']) >= 2:
            mcqs.append(mcq)
    return mcqs
# ========================
# Split MCQs into chunks
# ========================
def split_large_mcq_list(mcqs: List[Dict], max_per_chunk: int = 500) -> List[List[Dict]]:
    """Split a list of MCQs into consecutive chunks of bounded size.

    Args:
        mcqs: The MCQ dicts to split; order is preserved.
        max_per_chunk: Maximum number of MCQs per chunk. Must be at least 1.

    Returns:
        A list of chunks. When ``mcqs`` already fits in one chunk (including
        the empty list) the result is ``[mcqs]`` — the same list object, not
        a copy. Otherwise each chunk is a new slice of ``mcqs``.

    Raises:
        ValueError: If ``max_per_chunk`` is smaller than 1. A negative step
            would otherwise yield no chunks and silently drop every MCQ.
    """
    if max_per_chunk < 1:
        raise ValueError('max_per_chunk must be a positive integer')
    if len(mcqs) <= max_per_chunk:
        return [mcqs]
    return [mcqs[i:i + max_per_chunk] for i in range(0, len(mcqs), max_per_chunk)]
|