# hpmor/src/document_processor.py
"""Document processor for parsing and chunking HPMOR HTML."""
import re
import json
from pathlib import Path
from typing import List, Dict, Optional
from bs4 import BeautifulSoup
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from src.config import config
class HPMORProcessor:
    """Process the HPMOR HTML document into overlapping text chunks for RAG.

    Pipeline: parse HTML into chapters -> split chapters into sentence-aware
    chunks -> persist chunks and chapter metadata as JSON so later runs can
    load from cache instead of re-parsing.
    """

    def __init__(self):
        # Chunking parameters and the cache directory come from shared config.
        self.chunk_size = config.chunk_size
        self.chunk_overlap = config.chunk_overlap
        self.processed_dir = config.processed_data_dir

    def parse_html(self, file_path: Path) -> List[Dict]:
        """Parse HTML file and extract chapters with metadata.

        Chapters are detected by <h1>/<h2>/<h3> headers whose text matches
        "Chapter <N>" (case-insensitive). Content is gathered from the
        header's following siblings until the next chapter header.

        Args:
            file_path: Path to the HPMOR HTML file.

        Returns:
            List of dicts with keys 'chapter_number', 'chapter_title' and
            'content'. If no chapter headers are found, the whole document
            is returned as a single pseudo-chapter numbered 0.
        """
        print(f"Parsing HTML file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'lxml')

        # Remove style and script tags so their text never leaks into chunks.
        for tag in soup(['style', 'script']):
            tag.decompose()

        chapters = []
        chapter_pattern = re.compile(r'Chapter\s+(\d+)', re.IGNORECASE)

        # Headers that might mark chapter starts.
        headers = soup.find_all(['h1', 'h2', 'h3'])
        current_chapter = None
        current_content = []

        for header in headers:
            header_text = header.get_text(strip=True)
            match = chapter_pattern.search(header_text)
            if not match:
                continue

            # Save the previous chapter before starting a new one.
            if current_chapter and current_content:
                chapters.append({
                    'chapter_number': current_chapter['number'],
                    'chapter_title': current_chapter['title'],
                    'content': '\n'.join(current_content)
                })

            current_chapter = {
                'number': int(match.group(1)),
                'title': header_text
            }
            current_content = []

            # Collect text from following siblings until the next chapter
            # header. Non-chapter headers (e.g. section titles) are kept as
            # content rather than treated as boundaries.
            for sibling in header.find_next_siblings():
                if sibling.name in ['h1', 'h2', 'h3']:
                    if chapter_pattern.search(sibling.get_text()):
                        break
                text = sibling.get_text(strip=True)
                if text:
                    current_content.append(text)

        # Add the last chapter (the loop above only flushes on a new header).
        if current_chapter and current_content:
            chapters.append({
                'chapter_number': current_chapter['number'],
                'chapter_title': current_chapter['title'],
                'content': '\n'.join(current_content)
            })

        # Fallback: no chapter structure found, treat everything as one doc.
        if not chapters:
            print("No chapter structure found, processing as single document")
            text_content = soup.get_text(separator='\n', strip=True)
            chapters = [{
                'chapter_number': 0,
                'chapter_title': 'Harry Potter and the Methods of Rationality',
                'content': text_content
            }]

        print(f"Extracted {len(chapters)} chapters")
        return chapters

    def create_chunks(self, chapters: List[Dict]) -> List[Document]:
        """Create overlapping chunks from chapters.

        Each chapter is split with a SentenceSplitter; every resulting chunk
        becomes a Document carrying the chapter metadata plus a stable
        'chunk_id', its 'chunk_index' and 'total_chunks_in_chapter'.
        """
        print(f"Creating chunks with size={self.chunk_size}, overlap={self.chunk_overlap}")
        documents = []
        splitter = SentenceSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

        for chapter in chapters:
            # Wrap the chapter so the splitter can propagate its metadata.
            chapter_doc = Document(
                text=chapter['content'],
                metadata={
                    'chapter_number': chapter['chapter_number'],
                    'chapter_title': chapter['chapter_title'],
                    'source': 'hpmor.html'
                }
            )

            nodes = splitter.get_nodes_from_documents([chapter_doc])

            # Convert nodes back to Documents with chunk-level metadata added.
            for i, node in enumerate(nodes):
                documents.append(Document(
                    text=node.text,
                    metadata={
                        **chapter_doc.metadata,
                        'chunk_id': f"ch{chapter['chapter_number']}_chunk{i}",
                        'chunk_index': i,
                        'total_chunks_in_chapter': len(nodes)
                    }
                ))

        print(f"Created {len(documents)} chunks total")
        return documents

    def save_processed_data(self, documents: List[Document], chapters: List[Dict]) -> None:
        """Save processed documents and chapter metadata to disk as JSON."""
        # Ensure the cache directory exists before writing into it.
        self.processed_dir.mkdir(parents=True, exist_ok=True)

        # Documents are stored as plain dicts for easy reloading.
        docs_data = [{'text': doc.text, 'metadata': doc.metadata}
                     for doc in documents]

        docs_file = self.processed_dir / 'documents.json'
        with open(docs_file, 'w', encoding='utf-8') as f:
            json.dump(docs_data, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(docs_data)} documents to {docs_file}")

        chapters_file = self.processed_dir / 'chapters.json'
        with open(chapters_file, 'w', encoding='utf-8') as f:
            json.dump(chapters, f, indent=2, ensure_ascii=False)
        print(f"Saved chapter metadata to {chapters_file}")

    def load_processed_data(self) -> Optional[List[Document]]:
        """Load previously processed documents, or None if no cache exists."""
        docs_file = self.processed_dir / 'documents.json'
        if not docs_file.exists():
            return None

        with open(docs_file, 'r', encoding='utf-8') as f:
            docs_data = json.load(f)

        documents = [Document(text=d['text'], metadata=d['metadata'])
                     for d in docs_data]
        print(f"Loaded {len(documents)} documents from cache")
        return documents

    def process(self, force_reprocess: bool = False) -> List[Document]:
        """Main processing pipeline: load from cache or parse/chunk/save.

        Args:
            force_reprocess: When True, ignore any cached documents.

        Raises:
            FileNotFoundError: If the source HTML file is missing.
        """
        if not force_reprocess:
            documents = self.load_processed_data()
            if documents:
                return documents

        print("Processing HPMOR document from scratch...")
        if not config.hpmor_file.exists():
            raise FileNotFoundError(f"HPMOR file not found: {config.hpmor_file}")

        chapters = self.parse_html(config.hpmor_file)
        documents = self.create_chunks(chapters)
        self.save_processed_data(documents, chapters)
        return documents
def main():
    """Process the HPMOR document from scratch and print a sample chunk."""
    processor = HPMORProcessor()
    documents = processor.process(force_reprocess=True)
    print(f"\nProcessing complete! Created {len(documents)} document chunks.")
    # Show a sample chunk so the operator can eyeball the output quality.
    if documents:
        print("\nSample chunk:")
        print(f"Text: {documents[0].text[:200]}...")
        print(f"Metadata: {documents[0].metadata}")


if __name__ == "__main__":
    main()