Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import pandas as pd | |
| from typing import List, Dict | |
# Patterns are compiled once at module load so the per-line parsing loop does
# not have to look them up again for every line of every segment.
_NEWLINE_RE = re.compile(r"\r\n?")
# Zero-width split point in front of every line that opens a numbered question
# ("12.", "12)", "Q12.", "Q12)"), so the marker stays with its segment.
_SEGMENT_SPLIT_RE = re.compile(r"(?=^\s*(?:Q\d+|\d{1,3})[\)\.])", re.MULTILINE)
_QUESTION_RE = re.compile(r"^(?:Q\d+|\d{1,3})[\)\.]\s*(.*)")
_OPTION_RE = re.compile(r"^[A-Da-d][\)\.]\s*(.*)")
# The negative lookahead rejects letters that merely start a word, so
# "Answer: Delhi" is no longer misread as answer "D".
_ANSWER_RE = re.compile(
    r"^(?:Answer|Ans|Correct answer)[\s:\-]*\(?([A-Da-d])(?![A-Za-z])\)?.*",
    re.IGNORECASE,
)
_EXPLANATION_RE = re.compile(r"^(?:Explanation|Why|Because)[\s:\-]*(.*)", re.IGNORECASE)
_NEW_ITEM_RE = re.compile(r"^(?:Q\d+|[A-Da-d][\)\.]|\d{1,3}[\)\.])")


def _read_document_text(filepath: str, ext: str) -> str:
    """Return the text of a PDF or DOCX file as one newline-joined string.

    Raises:
        ValueError: If *ext* is neither ``'pdf'`` nor ``'docx'``.
    """
    if ext == 'pdf':
        # Imported lazily so the dependency is only needed for PDF input.
        import pdfplumber
        with pdfplumber.open(filepath) as pdf:
            # Pages whose extract_text() result is falsy contribute ''.
            return '\n'.join(page.extract_text() or '' for page in pdf.pages)
    if ext == 'docx':
        # Imported lazily so the dependency is only needed for DOCX input.
        from docx import Document
        return '\n'.join(p.text for p in Document(filepath).paragraphs)
    raise ValueError('Unsupported file format')


def _parse_segment(lines: List[str]) -> Dict:
    """Parse the stripped, non-empty lines of one segment into an MCQ dict.

    The returned dict always has the keys ``question``, ``options``,
    ``answer`` and ``explanation``; fields that were not found are empty.
    """
    question = ''
    options = []
    answer = ''
    explanation = ''
    for ln in lines:
        # The first numbered line opens the question.
        q_match = _QUESTION_RE.match(ln)
        if q_match and not question:
            question = q_match.group(1).strip()
            continue

        opt_match = _OPTION_RE.match(ln)
        # Until an option or an answer has been seen, any non-option line is
        # treated as a continuation of the question text.
        if opt_match is None and not options and not answer:
            question += ' ' + ln
            continue

        if opt_match:
            options.append(opt_match.group(1).strip())
            continue

        ans_match = _ANSWER_RE.match(ln)
        if ans_match:
            answer = ans_match.group(1).upper()
            continue

        exp_match = _EXPLANATION_RE.match(ln)
        if exp_match:
            explanation = exp_match.group(1).strip()
            continue

        # Once an answer is known, remaining free text extends the explanation.
        if answer and not _NEW_ITEM_RE.match(ln):
            explanation += ' ' + ln

    return {
        'question': question.strip(),
        'options': options,
        'answer': answer,
        'explanation': explanation.strip(),
    }


def extract_mcqs_from_file(filepath: str, raw_text: str = None) -> List[Dict]:
    """Extract multiple-choice questions from a file or from supplied text.

    When *raw_text* is empty or ``None`` the file extension of *filepath*
    selects the loader: ``pdf`` and ``docx`` are converted to text and parsed,
    while ``xls``/``xlsx``/``csv`` are read with pandas and returned directly
    as a list of row dicts without any MCQ parsing. When *raw_text* is given,
    *filepath* is not read at all.

    Text is split into segments at lines that start with a question number
    (``1.``, ``1)``, ``Q1.``, ``Q1)``). Within a segment, lines starting with
    ``A``-``D`` followed by ``.`` or ``)`` are options, a line starting with
    ``Answer``/``Ans``/``Correct answer`` followed by a standalone option
    letter is the answer, and lines starting with
    ``Explanation``/``Why``/``Because`` (or free text after the answer) form
    the explanation.

    Args:
        filepath: Path of the source file; only its extension is used when
            *raw_text* is supplied.
        raw_text: Optional already-extracted text to parse instead of the file.

    Returns:
        For text input, a list of dicts with keys ``question``, ``options``
        (list of option texts), ``answer`` (upper-case letter or ``''``) and
        ``explanation``. Only segments with a question and at least two
        options are kept. For spreadsheet/CSV input, the rows as dicts.

    Raises:
        ValueError: If *raw_text* is not given and the extension is unsupported.
    """
    if not raw_text:
        ext = filepath.rsplit('.', 1)[-1].lower()
        if ext in ('xls', 'xlsx'):
            return pd.read_excel(filepath).to_dict(orient='records')
        if ext == 'csv':
            return pd.read_csv(filepath).to_dict(orient='records')
        raw_text = _read_document_text(filepath, ext)

    # Normalize CRLF / lone CR line endings before line-based parsing.
    text = _NEWLINE_RE.sub("\n", raw_text)

    mcqs = []
    for seg in _SEGMENT_SPLIT_RE.split(text):
        lines = [ln.strip() for ln in seg.splitlines() if ln.strip()]
        if not lines:
            continue
        mcq = _parse_segment(lines)
        # Keep only usable MCQs: a question with at least two options.
        if mcq['question'] and len(mcq['options']) >= 2:
            mcqs.append(mcq)
    return mcqs
| # ======================== | |
| # Split MCQs into chunks | |
| # ======================== | |
def split_large_mcq_list(mcqs: List[Dict], max_per_chunk=500) -> List[List[Dict]]:
    """Split *mcqs* into consecutive chunks of at most *max_per_chunk* items.

    Args:
        mcqs: The list to split; it is not modified.
        max_per_chunk: Maximum number of items per chunk; must be at least 1.

    Returns:
        A list of chunks in original order. A list that already fits in one
        chunk (including an empty list) is returned as the single element of
        the result, i.e. ``[mcqs]``.

    Raises:
        ValueError: If *max_per_chunk* is less than 1. A negative size would
            otherwise produce an empty result and silently drop every item.
    """
    if max_per_chunk < 1:
        raise ValueError('max_per_chunk must be at least 1')
    if len(mcqs) <= max_per_chunk:
        return [mcqs]
    return [mcqs[i:i + max_per_chunk] for i in range(0, len(mcqs), max_per_chunk)]