mcq2vid / model_utils.py
roshcheeku's picture
Update model_utils.py
c2a3f49 verified
import os
import re
import pandas as pd
from typing import List, Dict
# Patterns are compiled once at import time because the parser applies them
# to every line of the input.
# Zero-width split point in front of each line that opens a numbered question.
_MCQ_SEGMENT_START_RE = re.compile(r"(?m)(?=^\s*(?:Q\d+|\d{1,3})[\)\.])")
# "12. text", "12) text", "Q12. text" or "Q12) text"; group 2 is the text.
_MCQ_QUESTION_RE = re.compile(r"^(?:Q\d+|(\d{1,3}))[\)\.]\s*(.*)")
# "A. text" / "b) text"; group 2 is the option text.
_MCQ_OPTION_RE = re.compile(r"^([A-Da-d])[\)\.]\s*(.*)")
# "Answer: B", "Ans - (c)", "Correct answer: d) ...". The trailing negative
# lookahead rejects a letter that is merely the start of a word, so that
# "Answer: Delhi" is not misread as option D.
_MCQ_ANSWER_RE = re.compile(
    r"^(?:Answer|Ans|Correct answer)[\s:\-]*\(?([A-Da-d])\)?(?![^\W\d_])",
    re.IGNORECASE,
)
_MCQ_EXPLANATION_RE = re.compile(
    r"^(?:Explanation|Why|Because)[\s:\-]*(.*)", re.IGNORECASE
)
# Lines that look like the start of a question or an option; such lines are
# never absorbed into a running explanation.
_MCQ_NEW_ITEM_RE = re.compile(r"^(?:Q\d+|[A-Da-d][\)\.]|\d{1,3}[\)\.])")


def _parse_mcq_segment(lines: List[str]) -> Dict:
    """Parse the stripped, non-empty lines of one question segment.

    Returns the MCQ record, or an empty dict when the segment has no question
    text or fewer than two options.
    """
    question = ''
    options = []
    answer = ''
    explanation = ''

    for ln in lines:
        # The first numbered line of the segment supplies the question text.
        q_match = _MCQ_QUESTION_RE.match(ln)
        if q_match and not question:
            question = q_match.group(2).strip()
            continue

        # Until an option or answer has been seen, any non-option line is
        # treated as a continuation of the question.
        if not options and not answer and not _MCQ_OPTION_RE.match(ln):
            question += ' ' + ln
            continue

        opt_match = _MCQ_OPTION_RE.match(ln)
        if opt_match:
            options.append(opt_match.group(2).strip())
            continue

        ans_match = _MCQ_ANSWER_RE.match(ln)
        if ans_match:
            answer = ans_match.group(1).upper()
            continue

        # An explicit explanation line replaces whatever was collected so far.
        exp_match = _MCQ_EXPLANATION_RE.match(ln)
        if exp_match:
            explanation = exp_match.group(1).strip()
            continue

        # After the answer, free-form lines extend the explanation; anything
        # else that reaches this point is dropped.
        if answer and not _MCQ_NEW_ITEM_RE.match(ln):
            explanation += ' ' + ln.strip()

    # Only segments with a question and at least two options count as MCQs.
    if question and len(options) >= 2:
        return {
            'question': question.strip(),
            'options': options,
            'answer': answer,
            'explanation': explanation.strip(),
        }
    return {}


def extract_mcqs_from_file(filepath: str, raw_text: str = None) -> List[Dict]:
    """Extract multiple-choice questions from a document or from raw text.

    When ``raw_text`` is empty or ``None`` the content is loaded from
    ``filepath`` according to its extension: ``pdf`` (read with pdfplumber),
    ``docx`` (read with the ``docx`` package), or ``xls``/``xlsx``/``csv``
    (read with pandas). Spreadsheet and CSV files are not parsed as free
    text: their rows are returned directly, one dict per row.

    Free text is split into segments at lines that start with a question
    marker ("12.", "12)", "Q12.", "Q12)"). Inside each segment, lines are
    classified as question text, options ("A."-"D." or "a)"-"d)"), an answer
    line ("Answer", "Ans" or "Correct answer" followed by a standalone option
    letter) and an explanation.

    Args:
        filepath: Path of the file to read; only consulted when ``raw_text``
            is empty or ``None``.
        raw_text: Text to parse instead of reading ``filepath``.

    Returns:
        For text sources, a list of dicts with the keys ``question``,
        ``options`` (list of option texts), ``answer`` (upper-case letter, or
        ``''`` when none was recognised) and ``explanation``. For
        spreadsheet/CSV sources, the rows as produced by
        ``DataFrame.to_dict(orient='records')``.

    Raises:
        ValueError: If ``raw_text`` is not given and the file extension is
            not one of the supported formats.
    """
    if not raw_text:
        ext = filepath.rsplit('.', 1)[-1].lower()
        if ext == 'pdf':
            # Imported lazily so the dependency is only needed for PDFs.
            import pdfplumber
            with pdfplumber.open(filepath) as pdf:
                # extract_text() may yield None; treat that page as empty.
                raw_text = '\n'.join(
                    page.extract_text() or '' for page in pdf.pages
                )
        elif ext == 'docx':
            # Imported lazily so the dependency is only needed for DOCX.
            from docx import Document
            doc = Document(filepath)
            raw_text = '\n'.join(p.text for p in doc.paragraphs)
        elif ext in ['xls', 'xlsx']:
            return pd.read_excel(filepath).to_dict(orient='records')
        elif ext == 'csv':
            return pd.read_csv(filepath).to_dict(orient='records')
        else:
            raise ValueError('Unsupported file format')

    # Normalize Windows (\r\n) and old Mac (\r) line endings to \n.
    text = re.sub(r"\r\n?", "\n", raw_text)

    mcqs = []
    for seg in _MCQ_SEGMENT_START_RE.split(text):
        lines = [ln.strip() for ln in seg.splitlines() if ln.strip()]
        if not lines:
            continue
        mcq = _parse_mcq_segment(lines)
        if mcq:
            mcqs.append(mcq)
    return mcqs
# ========================
# Split MCQs into chunks
# ========================
def split_large_mcq_list(mcqs: List[Dict], max_per_chunk=500) -> List[List[Dict]]:
    """Split a list of MCQs into consecutive chunks of bounded size.

    Args:
        mcqs: The MCQ records to split.
        max_per_chunk: Maximum number of records per chunk; must be >= 1.

    Returns:
        A list of chunks. When ``mcqs`` already fits into one chunk
        (including when it is empty) the result is ``[mcqs]``, wrapping the
        original list object rather than a copy. Otherwise each chunk is a
        slice of at most ``max_per_chunk`` records, in the original order.

    Raises:
        ValueError: If ``max_per_chunk`` is smaller than 1.
    """
    # A non-positive size would either make range() fail with an unrelated
    # message (0) or silently produce no chunks at all (negative values).
    if max_per_chunk < 1:
        raise ValueError('max_per_chunk must be at least 1')
    if len(mcqs) <= max_per_chunk:
        return [mcqs]
    return [mcqs[i:i + max_per_chunk] for i in range(0, len(mcqs), max_per_chunk)]