Spaces:

roshcheeku
/

mcq2vid

Sleeping

App Files Files Community

mcq2vid / model_utils.py

roshcheeku

Update model_utils.py

c2a3f49 verified 7 months ago

raw

history blame contribute delete

3.4 kB

	import os
	import re
	import pandas as pd
	from typing import List, Dict

	# Zero-width marker for the start of each numbered question ("Q12." / "3)").
	_QUESTION_START_RE = re.compile(r"(?m)(?=^\s*(?:Q\d+|\d{1,3})[\)\.])")
	# "Q12. text" or "3) text" -> group 1 is the question text.
	_QUESTION_RE = re.compile(r"^(?:Q\d+|\d{1,3})[\)\.]\s*(.*)")
	# "A) text" / "b. text" -> group 1 is the letter, group 2 the option text.
	_OPTION_RE = re.compile(r"^([A-Da-d])[\)\.]\s*(.*)")
	# "Answer: B", "Ans - (c)", "Correct answer b" -> group 1 is the letter.
	# The trailing \b keeps words such as "Answer: Daniel" from being read as "D".
	_ANSWER_RE = re.compile(
	    r"^(?:Answer|Ans|Correct answer)[\s:\-]*\(?([A-Da-d])\b\)?",
	    re.IGNORECASE,
	)
	# "Explanation: text", "Why - text", "Because text" -> group 1 is the text.
	_EXPLANATION_RE = re.compile(
	    r"^(?:Explanation|Why|Because)[\s:\-]*(.*)", re.IGNORECASE
	)
	# Lines that look like a new question or option; never explanation text.
	_STRUCTURAL_RE = re.compile(r"^(?:Q\d+|[A-Da-d][\)\.]|\d{1,3}[\)\.])")


	def _read_document_text(filepath: str, ext: str) -> str:
	    """Return the plain text of a PDF or DOCX file.

	    Args:
	        filepath: Path of the document to read.
	        ext: Lower-cased file extension without the dot.

	    Raises:
	        ValueError: If ``ext`` is neither ``pdf`` nor ``docx``.
	    """
	    if ext == 'pdf':
	        # Imported lazily so the dependency is only needed for PDF input.
	        import pdfplumber
	        with pdfplumber.open(filepath) as pdf:
	            # extract_text() may return None; treat that as an empty page.
	            return '\n'.join(page.extract_text() or '' for page in pdf.pages)
	    if ext == 'docx':
	        from docx import Document
	        return '\n'.join(p.text for p in Document(filepath).paragraphs)
	    raise ValueError('Unsupported file format')


	def _parse_mcq_segment(segment: str):
	    """Parse one question-sized chunk of text into an MCQ dict.

	    Returns a dict with keys ``question``, ``options``, ``answer`` and
	    ``explanation``, or ``None`` when the segment has no question text or
	    fewer than two options.
	    """
	    lines = [ln.strip() for ln in segment.splitlines() if ln.strip()]
	    if not lines:
	        return None

	    question = ''
	    options = []
	    answer = ''
	    explanation = ''

	    for ln in lines:
	        opt_match = _OPTION_RE.match(ln)

	        # The first numbered line of the segment carries the question text.
	        q_match = _QUESTION_RE.match(ln)
	        if q_match and not question:
	            question = q_match.group(1).strip()
	            continue
	        # Until the first option or answer shows up, any other line is
	        # treated as a continuation of the question.
	        if not options and not answer and not opt_match:
	            question += ' ' + ln
	            continue

	        if opt_match:
	            options.append(opt_match.group(2).strip())
	            continue

	        ans_match = _ANSWER_RE.match(ln)
	        if ans_match:
	            answer = ans_match.group(1).upper()
	            continue

	        exp_match = _EXPLANATION_RE.match(ln)
	        if exp_match:
	            explanation = exp_match.group(1).strip()
	            continue

	        # After the answer, free text is appended to the explanation.
	        if answer and not _STRUCTURAL_RE.match(ln):
	            explanation += ' ' + ln

	    # Keep only usable MCQs: a question plus at least two options.
	    if question and len(options) >= 2:
	        return {
	            'question': question.strip(),
	            'options': options,
	            'answer': answer,
	            'explanation': explanation.strip(),
	        }
	    return None


	def extract_mcqs_from_file(filepath: str, raw_text: str = None) -> List[Dict]:
	    """Extract multiple-choice questions from a file or from given text.

	    When ``raw_text`` is non-empty it is parsed directly and ``filepath`` is
	    not read. Otherwise the text is loaded from ``filepath`` according to
	    its extension (``pdf`` or ``docx``). Spreadsheet inputs (``xls``,
	    ``xlsx``, ``csv``) are not parsed as text: their rows are returned
	    unchanged as a list of records, one dict per row keyed by column name.

	    Text is split at lines starting with ``Q<n>.``/``Q<n>)`` or
	    ``<n>.``/``<n>)``. Within each segment, lines starting with ``A``-``D``
	    followed by ``)`` or ``.`` are options, ``Answer``/``Ans``/
	    ``Correct answer`` lines give the answer letter, and ``Explanation``/
	    ``Why``/``Because`` lines start the explanation.

	    Args:
	        filepath: Path to the source file; only used when ``raw_text`` is
	            empty or ``None``.
	        raw_text: Optional already-extracted text to parse.

	    Returns:
	        For text input, a list of dicts with keys ``question`` (str),
	        ``options`` (list of str), ``answer`` (upper-case letter or ``''``)
	        and ``explanation`` (str). For spreadsheet input, the raw records.

	    Raises:
	        ValueError: If no ``raw_text`` is given and the file extension is
	            not supported.
	    """
	    if not raw_text:
	        ext = filepath.rsplit('.', 1)[-1].lower()
	        if ext in ('xls', 'xlsx'):
	            return pd.read_excel(filepath).to_dict(orient='records')
	        if ext == 'csv':
	            return pd.read_csv(filepath).to_dict(orient='records')
	        raw_text = _read_document_text(filepath, ext)

	    # Normalize Windows / old-Mac line endings before line-based parsing.
	    text = re.sub(r"\r\n?", "\n", raw_text)

	    mcqs = []
	    for segment in _QUESTION_START_RE.split(text):
	        mcq = _parse_mcq_segment(segment)
	        if mcq is not None:
	            mcqs.append(mcq)
	    return mcqs

	# ========================
	# Split MCQs into chunks
	# ========================

	def split_large_mcq_list(mcqs: List[Dict], max_per_chunk=500) -> List[List[Dict]]:
	    """Split a list of MCQs into consecutive chunks of bounded size.

	    Args:
	        mcqs: The MCQ records to split.
	        max_per_chunk: Maximum number of records per chunk; must be >= 1.

	    Returns:
	        A list of chunks in the original order. A list that already fits,
	        including an empty one, is returned as a single chunk (``[mcqs]``).

	    Raises:
	        ValueError: If ``max_per_chunk`` is smaller than 1. A negative size
	            would otherwise yield no chunks at all and silently drop every
	            record.
	    """
	    if max_per_chunk < 1:
	        raise ValueError('max_per_chunk must be at least 1')
	    if len(mcqs) <= max_per_chunk:
	        return [mcqs]
	    return [
	        mcqs[start:start + max_per_chunk]
	        for start in range(0, len(mcqs), max_per_chunk)
	    ]