"""Document processor for parsing and chunking HPMOR HTML.""" import re import json from pathlib import Path from typing import List, Dict, Optional from bs4 import BeautifulSoup from llama_index.core import Document from llama_index.core.node_parser import SentenceSplitter from src.config import config class HPMORProcessor: """Process HPMOR HTML document into chunks for RAG.""" def __init__(self): self.chunk_size = config.chunk_size self.chunk_overlap = config.chunk_overlap self.processed_dir = config.processed_data_dir def parse_html(self, file_path: Path) -> List[Dict]: """Parse HTML file and extract chapters with metadata.""" print(f"Parsing HTML file: {file_path}") with open(file_path, 'r', encoding='utf-8') as f: html_content = f.read() soup = BeautifulSoup(html_content, 'lxml') # Remove style and script tags for tag in soup(['style', 'script']): tag.decompose() # Try to identify chapters by common patterns chapters = [] chapter_pattern = re.compile(r'Chapter\s+(\d+)', re.IGNORECASE) # Find all h1, h2, h3 tags that might be chapter headers headers = soup.find_all(['h1', 'h2', 'h3']) current_chapter = None current_content = [] chapter_num = 0 for header in headers: header_text = header.get_text(strip=True) match = chapter_pattern.search(header_text) if match: # Save previous chapter if exists if current_chapter and current_content: chapters.append({ 'chapter_number': current_chapter['number'], 'chapter_title': current_chapter['title'], 'content': '\n'.join(current_content) }) # Start new chapter chapter_num = int(match.group(1)) current_chapter = { 'number': chapter_num, 'title': header_text } current_content = [] # Get content after this header until next chapter for sibling in header.find_next_siblings(): if sibling.name in ['h1', 'h2', 'h3']: if chapter_pattern.search(sibling.get_text()): break text = sibling.get_text(strip=True) if text: current_content.append(text) # Add the last chapter if current_chapter and current_content: chapters.append({ 'chapter_number': current_chapter['number'], 'chapter_title': current_chapter['title'], 'content': '\n'.join(current_content) }) # If no chapters found, treat entire content as one document if not chapters: print("No chapter structure found, processing as single document") text_content = soup.get_text(separator='\n', strip=True) chapters = [{ 'chapter_number': 0, 'chapter_title': 'Harry Potter and the Methods of Rationality', 'content': text_content }] print(f"Extracted {len(chapters)} chapters") return chapters def create_chunks(self, chapters: List[Dict]) -> List[Document]: """Create overlapping chunks from chapters.""" print(f"Creating chunks with size={self.chunk_size}, overlap={self.chunk_overlap}") documents = [] splitter = SentenceSplitter( chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap, ) for chapter in chapters: # Create a document for the chapter chapter_doc = Document( text=chapter['content'], metadata={ 'chapter_number': chapter['chapter_number'], 'chapter_title': chapter['chapter_title'], 'source': 'hpmor.html' } ) # Split into chunks nodes = splitter.get_nodes_from_documents([chapter_doc]) # Convert nodes back to documents with enhanced metadata for i, node in enumerate(nodes): doc = Document( text=node.text, metadata={ **chapter_doc.metadata, 'chunk_id': f"ch{chapter['chapter_number']}_chunk{i}", 'chunk_index': i, 'total_chunks_in_chapter': len(nodes) } ) documents.append(doc) print(f"Created {len(documents)} chunks total") return documents def save_processed_data(self, documents: List[Document], chapters: 
List[Dict]) -> None: """Save processed documents and metadata to disk.""" # Save documents as JSON for easy loading docs_data = [] for doc in documents: docs_data.append({ 'text': doc.text, 'metadata': doc.metadata }) docs_file = self.processed_dir / 'documents.json' with open(docs_file, 'w', encoding='utf-8') as f: json.dump(docs_data, f, indent=2, ensure_ascii=False) print(f"Saved {len(docs_data)} documents to {docs_file}") # Save chapter metadata chapters_file = self.processed_dir / 'chapters.json' with open(chapters_file, 'w', encoding='utf-8') as f: json.dump(chapters, f, indent=2, ensure_ascii=False) print(f"Saved chapter metadata to {chapters_file}") def load_processed_data(self) -> Optional[List[Document]]: """Load previously processed documents.""" docs_file = self.processed_dir / 'documents.json' if not docs_file.exists(): return None with open(docs_file, 'r', encoding='utf-8') as f: docs_data = json.load(f) documents = [] for doc_data in docs_data: doc = Document( text=doc_data['text'], metadata=doc_data['metadata'] ) documents.append(doc) print(f"Loaded {len(documents)} documents from cache") return documents def process(self, force_reprocess: bool = False) -> List[Document]: """Main processing pipeline.""" # Check if already processed if not force_reprocess: documents = self.load_processed_data() if documents: return documents # Process from scratch print("Processing HPMOR document from scratch...") if not config.hpmor_file.exists(): raise FileNotFoundError(f"HPMOR file not found: {config.hpmor_file}") # Parse HTML chapters = self.parse_html(config.hpmor_file) # Create chunks documents = self.create_chunks(chapters) # Save processed data self.save_processed_data(documents, chapters) return documents def main(): """Process HPMOR document.""" processor = HPMORProcessor() documents = processor.process(force_reprocess=True) print(f"\nProcessing complete! Created {len(documents)} document chunks.") # Show sample if documents: print("\nSample chunk:") print(f"Text: {documents[0].text[:200]}...") print(f"Metadata: {documents[0].metadata}") if __name__ == "__main__": main()
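

# Example downstream usage (illustrative sketch only; not exercised by this module).
# The chunked Documents returned by HPMORProcessor.process() are intended to be
# embedded and indexed for retrieval. Assuming an embedding model is already
# configured via llama_index Settings elsewhere in this project, a minimal sketch
# with llama_index's in-memory index would look like:
#
#   from llama_index.core import VectorStoreIndex
#
#   processor = HPMORProcessor()
#   documents = processor.process()
#   index = VectorStoreIndex.from_documents(documents)      # embeds and indexes the chunks
#   retriever = index.as_retriever(similarity_top_k=4)      # top-4 chunks per query
#   nodes = retriever.retrieve("Who teaches Battle Magic?")  # chapter metadata rides along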