import re
import fitz  # PyMuPDF

# -----------------------------
# TEXT EXTRACTION (Robust)
# -----------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extracts and cleans text from a PDF using PyMuPDF.

    Each page is read with ``page.get_text("text")``. When that yields
    nothing, the text of the page's "blocks" output is joined instead.
    No OCR is performed here, so a page that has no text layer at all
    contributes no text.

    Args:
        file_path (str): Path to the PDF file.
    Returns:
        str: Text of all pages combined, with every run of whitespace
            (including page and line breaks) collapsed to a single space.
    Raises:
        RuntimeError: If opening or reading the PDF fails; the original
            exception is chained as ``__cause__``.
    """
    page_texts = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_text = page.get_text("text").strip()
                if not page_text:
                    # Fallback: extract raw blocks (helps with weird PDFs).
                    # This must be called on the page: get_text() is a Page
                    # method, so calling it on the document object would fail
                    # and abort extraction of the whole file.
                    blocks = page.get_text("blocks")
                    # Index 4 of a block tuple holds the block's text.
                    page_text = " ".join(
                        block[4] for block in blocks if isinstance(block[4], str)
                    )
                page_texts.append(page_text)
    except Exception as e:
        # Keep the original traceback attached for debugging.
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

    # Clean out any extra whitespace or control characters
    return re.sub(r'\s+', ' ', "\n".join(page_texts)).strip()


# -----------------------------
# SMART CHUNKING (Step-Aware + Context Aware)
# -----------------------------
def _hard_wrap(sentence: str, chunk_size: int) -> list:
    """Break one sentence into pieces of at most ``chunk_size`` characters, cutting at spaces when possible."""
    pieces = []
    rest = sentence
    while len(rest) > chunk_size:
        # Last space that still leaves the piece within the limit.
        cut = rest.rfind(" ", 0, chunk_size + 1)
        if cut <= 0:
            # No usable space (one very long token): cut mid-token.
            cut = chunk_size
        pieces.append(rest[:cut].rstrip())
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def _pack_sentences(text: str, chunk_size: int, overlap: int) -> list:
    """Greedily pack the sentences of ``text`` into chunks no longer than ``chunk_size``."""
    units = [
        piece
        for sentence in re.split(r'(?<=[.!?])\s+', text)
        for piece in _hard_wrap(sentence, chunk_size)
    ]

    chunks = []
    current = ""
    for unit in units:
        if not current:
            current = unit
        elif len(current) + 1 + len(unit) <= chunk_size:
            current = f"{current} {unit}"
        else:
            chunks.append(current)
            # Carry over the tail of the finished chunk for continuity, but
            # never more than still fits next to the incoming unit, so the
            # new chunk cannot exceed chunk_size.
            keep = min(overlap, chunk_size - len(unit) - 1)
            tail = current[-keep:].strip() if keep > 0 else ""
            current = f"{tail} {unit}" if tail else unit
    if current:
        chunks.append(current)
    return chunks


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
    """
    Splits text into overlapping, structured chunks.
    Detects procedural steps (e.g., 'Step 1:', 'STEP 2.') and starts a new
    chunk at each of them; a step that fits in ``chunk_size`` stays intact.
    Falls back to sentence-based chunking for normal paragraphs and for
    steps that are longer than ``chunk_size``.

    Every returned chunk is at most ``chunk_size`` characters long: the
    overlap is shortened when it would not fit, and a single sentence longer
    than ``chunk_size`` is cut at spaces (or mid-token if it has none).

    Args:
        text (str): Input text.
        chunk_size (int): Max characters per chunk (default: 1000).
        overlap (int): Max characters carried over from the end of the
            previous chunk for continuity (default: 80). Values <= 0
            disable the overlap.

    Returns:
        list[str]: Chunked text segments (empty list for blank input).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Clean and normalize text
    text = re.sub(r'\s+', ' ', text.strip())

    # Try to detect "Step" patterns. The \b keeps words that merely end in
    # "step" (e.g. "footstep 1") from being treated as a step header.
    step_splits = re.split(r'(?=\bStep\s*\d+[:.\s])', text, flags=re.IGNORECASE)
    step_splits = [s.strip() for s in step_splits if s.strip()]

    # With a visible step structure each step is chunked on its own, so no
    # chunk spans two steps; otherwise the whole text is one segment.
    segments = step_splits if len(step_splits) > 1 else [text]

    chunks = []
    for segment in segments:
        # A segment that already fits comes back as a single, unchanged chunk.
        chunks.extend(_pack_sentences(segment, chunk_size, overlap))
    return chunks


# -----------------------------
# DEBUGGING (Manual Run)
# -----------------------------
if __name__ == "__main__":
    # Manual smoke run: chunk a short text that mixes "Step N:" lines with
    # an ordinary sentence, then print every resulting chunk.
    demo_input = """
    Step 1: Open the application.
    Step 2: Navigate to the dashboard.
    Step 3: Review the summary and click ‘Export’.
    If the steps are missing, the function should still chunk by sentences.
    """
    demo_chunks = chunk_text(demo_input, chunk_size=100, overlap=20)
    print(f"✅ Chunks created: {len(demo_chunks)}")
    for number, piece in enumerate(demo_chunks, start=1):
        print(f"\n--- Chunk {number} ---\n{piece}")