import os
import re
import pandas as pd
from typing import List, Dict

# Patterns used by the MCQ parser, compiled once instead of on every line.
# A segment starts at a line beginning with "Q<digits>" or 1-3 digits followed
# by ")" or "." (zero-width lookahead, so the marker stays in the segment).
_SEGMENT_START_RE = re.compile(r"(?m)(?=^\s*(?:Q\d+|\d{1,3})[\)\.])")
_QUESTION_RE = re.compile(r"^(?:Q\d+|\d{1,3})[\)\.]\s*(.*)")
_OPTION_RE = re.compile(r"^[A-Da-d][\)\.]\s*(.*)")
# The negative lookahead stops the first letter of a word from being taken as
# the answer key ("Answer: Delhi" must not yield "D"); "." is accepted as a
# separator so that "Ans. B" is recognised.
_ANSWER_RE = re.compile(
    r"^(?:Answer|Ans|Correct answer)[\s:\.\-]*\(?([A-Da-d])\)?(?![A-Za-z])",
    re.IGNORECASE,
)
_EXPLANATION_RE = re.compile(r"^(?:Explanation|Why|Because)[\s:\-]*(.*)", re.IGNORECASE)
# Lines that look like the start of another numbered item are never folded
# into an explanation.
_ITEM_START_RE = re.compile(r"^(?:Q\d+|\d{1,3}[\)\.])")


def _read_document_text(filepath: str, ext: str) -> str:
    """Return the plain text of a PDF or DOCX file.

    Raises:
        ValueError: if ``ext`` is neither ``'pdf'`` nor ``'docx'``.
    """
    if ext == 'pdf':
        # Imported lazily so the parser works without pdfplumber installed
        # when only raw text (or another format) is processed.
        import pdfplumber
        with pdfplumber.open(filepath) as pdf:
            # extract_text() may return None for a page; treat it as empty.
            return '\n'.join(page.extract_text() or '' for page in pdf.pages)
    if ext == 'docx':
        from docx import Document
        return '\n'.join(p.text for p in Document(filepath).paragraphs)
    raise ValueError('Unsupported file format')


def _parse_mcq_segment(lines):
    """Parse the stripped, non-empty lines of one segment into an MCQ dict.

    Returns ``None`` when the segment has no question text or fewer than two
    options.
    """
    question = ''
    options = []
    answer = ''
    explanation = ''

    for ln in lines:
        q_match = _QUESTION_RE.match(ln)
        if q_match and not question:
            question = q_match.group(1).strip()
            continue

        opt_match = _OPTION_RE.match(ln)
        if not options and not opt_match:
            # Everything before the first option belongs to the question.
            question += ' ' + ln
            continue

        if opt_match:
            options.append(opt_match.group(1).strip())
            continue

        ans_match = _ANSWER_RE.match(ln)
        if ans_match:
            answer = ans_match.group(1).upper()
            continue

        exp_match = _EXPLANATION_RE.match(ln)
        if exp_match:
            explanation = exp_match.group(1).strip()
            continue

        # Free text after the answer, or after an explicit explanation line,
        # continues the explanation. Checking `explanation` as well keeps
        # multi-line explanations intact when the segment has no answer line.
        if (answer or explanation) and not _ITEM_START_RE.match(ln):
            explanation += ' ' + ln

    if not question or len(options) < 2:
        return None
    return {
        'question': question.strip(),
        'options': options,
        'answer': answer,
        'explanation': explanation.strip(),
    }


def extract_mcqs_from_file(filepath: str, raw_text: str = None) -> List[Dict]:
    """Extract multiple-choice questions from a file or from raw text.

    Args:
        filepath: Path of the source file. Only consulted when ``raw_text``
            is empty or ``None``; the format is chosen from the text after
            the last ``.`` (case-insensitive).
        raw_text: Text to parse instead of reading ``filepath``.

    Returns:
        For parsed text (raw text, PDF, DOCX): a list of dicts with keys
        ``'question'``, ``'options'`` (list of option texts in order of
        appearance), ``'answer'`` (upper-case letter A-D, or ``''`` when no
        answer key was recognised) and ``'explanation'``. Segments without a
        question or with fewer than two options are skipped.
        For ``xls``/``xlsx``/``csv`` files: the rows as returned by
        ``DataFrame.to_dict(orient='records')``, without any MCQ parsing.

    Raises:
        ValueError: if no raw text is given and the file extension is not
            one of pdf, docx, xls, xlsx, csv.
    """
    if not raw_text:
        ext = filepath.rsplit('.', 1)[-1].lower()
        if ext in ('xls', 'xlsx'):
            return pd.read_excel(filepath).to_dict(orient='records')
        if ext == 'csv':
            return pd.read_csv(filepath).to_dict(orient='records')
        raw_text = _read_document_text(filepath, ext)

    # Normalize newlines so "^" anchors work for CRLF / CR input as well.
    text = re.sub(r"\r\n?", "\n", raw_text)

    mcqs = []
    for seg in _SEGMENT_START_RE.split(text):
        lines = [ln.strip() for ln in seg.splitlines() if ln.strip()]
        if not lines:
            continue
        mcq = _parse_mcq_segment(lines)
        if mcq is not None:
            mcqs.append(mcq)
    return mcqs

# ========================
# Split MCQs into chunks
# ========================

def split_large_mcq_list(mcqs: List[Dict], max_per_chunk: int = 500) -> List[List[Dict]]:
    """Split ``mcqs`` into consecutive chunks of at most ``max_per_chunk`` items.

    Args:
        mcqs: The list to split.
        max_per_chunk: Maximum number of items per chunk; must be >= 1.

    Returns:
        A list of chunks in the original order. When ``mcqs`` already fits in
        one chunk (including when it is empty) the result is ``[mcqs]``, i.e.
        a single chunk holding the same list object.

    Raises:
        ValueError: if ``max_per_chunk`` is less than 1.
    """
    # A negative step would make range() empty and silently drop every item;
    # a zero step fails with an unrelated-looking range() error.
    if max_per_chunk < 1:
        raise ValueError('max_per_chunk must be at least 1')
    if len(mcqs) <= max_per_chunk:
        return [mcqs]
    return [mcqs[i:i + max_per_chunk] for i in range(0, len(mcqs), max_per_chunk)]