import re


# -----------------------------
# TEXT EXTRACTION (Robust)
# -----------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract and clean text from a PDF using PyMuPDF.

    Handles both textual and scanned PDFs gracefully: when plain-text
    extraction yields nothing for a page, falls back to raw block
    extraction (helps with oddly-encoded PDFs).

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Combined, whitespace-normalized extracted text.

    Raises:
        RuntimeError: If the PDF cannot be opened or read (original
            exception is chained for debugging).
    """
    # Lazy import: keeps the rest of this module (e.g. chunk_text)
    # importable even when PyMuPDF is not installed.
    import fitz  # PyMuPDF

    pages = []  # collect per-page text, join once (avoids quadratic +=)
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_text = page.get_text("text").strip()
                if not page_text:
                    # Fallback: extract raw blocks (helps with weird PDFs).
                    # block[4] is the text payload of a PyMuPDF block tuple.
                    blocks = page.get_text("blocks")
                    page_text = " ".join(
                        block[4] for block in blocks if isinstance(block[4], str)
                    )
                pages.append(page_text)
    except Exception as e:
        # Chain the original exception instead of discarding it
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

    # Clean out any extra whitespace or control characters
    return re.sub(r'\s+', ' ', "\n".join(pages)).strip()


# -----------------------------
# SMART CHUNKING (Context Aware)
# -----------------------------
def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
    """
    Split text into overlapping, sentence-based chunks.

    Optimized for embedding models (E5, MiniLM, etc.) for semantic
    retrieval. Sentences are packed greedily into chunks of at most
    `chunk_size` characters; each new chunk starts with the last
    `overlap` characters of the previous one for continuity.

    Args:
        text (str): Input text.
        chunk_size (int): Max characters per chunk (default: 800).
        overlap (int): Overlapping characters for continuity (default: 150).

    Returns:
        list[str]: Chunked text segments (empty list for empty input).

    Raises:
        ValueError: If chunk_size is not positive, or overlap is
            negative or >= chunk_size (would prevent forward progress).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must be in [0, chunk_size)")

    # Clean text once
    text = re.sub(r'\s+', ' ', text.strip())
    if not text:
        return []

    # Sentence segmentation (simple rule-based, fast): split after ./!/?
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks, current = [], ""
    for sent in sentences:
        # +1 accounts for the joining space
        if len(current) + len(sent) + 1 <= chunk_size:
            current += " " + sent
        else:
            # Store full chunk
            if current.strip():
                chunks.append(current.strip())
            # Overlap control: seed the next chunk with the tail of this one
            overlap_part = current[-overlap:] if overlap > 0 else ""
            current = overlap_part + " " + sent

    # Append the last chunk
    if current.strip():
        chunks.append(current.strip())
    return chunks


# -----------------------------
# DEBUGGING (Manual Run)
# -----------------------------
if __name__ == "__main__":
    sample_text = """
    Artificial Intelligence is transforming industries. Machine learning is a key subfield, driving automation and predictive analytics. Neural networks power most modern AI applications today. This technology is reshaping healthcare, finance, and manufacturing.
    """
    chunks = chunk_text(sample_text, chunk_size=80, overlap=20)
    print(f"✅ Chunks created: {len(chunks)}")
    for i, c in enumerate(chunks, 1):
        print(f"\n--- Chunk {i} ({len(c)} chars) ---\n{c}")