# hpmor/src/document_processor.py
"""Document processor for parsing and chunking HPMOR HTML."""
import re
import json
from pathlib import Path
from typing import List, Dict, Optional
from bs4 import BeautifulSoup
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from src.config import config
class HPMORProcessor:
    """Process the HPMOR HTML document into overlapping text chunks for RAG.

    Pipeline: parse HTML into chapters -> split chapters into sentence-aware
    chunks -> persist chunks and chapter metadata as JSON so later runs can
    load from cache instead of re-parsing.
    """

    def __init__(self):
        # Chunking parameters and the cache directory come from shared config.
        self.chunk_size = config.chunk_size
        self.chunk_overlap = config.chunk_overlap
        self.processed_dir = config.processed_data_dir

    def parse_html(self, file_path: Path) -> List[Dict]:
        """Parse HTML file and extract chapters with metadata.

        Chapters are detected by <h1>/<h2>/<h3> headers whose text matches
        "Chapter <N>" (case-insensitive). Content is gathered from the
        header's following siblings until the next chapter header.

        Args:
            file_path: Path to the HPMOR HTML file.

        Returns:
            List of dicts with keys 'chapter_number', 'chapter_title' and
            'content'. If no chapter headers are found, the whole document
            is returned as a single pseudo-chapter numbered 0.
        """
        print(f"Parsing HTML file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'lxml')

        # Remove style and script tags so their text never leaks into chunks.
        for tag in soup(['style', 'script']):
            tag.decompose()

        chapters = []
        chapter_pattern = re.compile(r'Chapter\s+(\d+)', re.IGNORECASE)

        # Headers that might mark chapter starts.
        headers = soup.find_all(['h1', 'h2', 'h3'])
        current_chapter = None
        current_content = []

        for header in headers:
            header_text = header.get_text(strip=True)
            match = chapter_pattern.search(header_text)
            if not match:
                continue

            # Save the previous chapter before starting a new one.
            if current_chapter and current_content:
                chapters.append({
                    'chapter_number': current_chapter['number'],
                    'chapter_title': current_chapter['title'],
                    'content': '\n'.join(current_content)
                })

            current_chapter = {
                'number': int(match.group(1)),
                'title': header_text
            }
            current_content = []

            # Collect text from following siblings until the next chapter
            # header. Non-chapter headers (e.g. section titles) are kept as
            # content rather than treated as boundaries.
            for sibling in header.find_next_siblings():
                if sibling.name in ['h1', 'h2', 'h3']:
                    if chapter_pattern.search(sibling.get_text()):
                        break
                text = sibling.get_text(strip=True)
                if text:
                    current_content.append(text)

        # Add the last chapter (the loop above only flushes on a new header).
        if current_chapter and current_content:
            chapters.append({
                'chapter_number': current_chapter['number'],
                'chapter_title': current_chapter['title'],
                'content': '\n'.join(current_content)
            })

        # Fallback: no chapter structure found, treat everything as one doc.
        if not chapters:
            print("No chapter structure found, processing as single document")
            text_content = soup.get_text(separator='\n', strip=True)
            chapters = [{
                'chapter_number': 0,
                'chapter_title': 'Harry Potter and the Methods of Rationality',
                'content': text_content
            }]

        print(f"Extracted {len(chapters)} chapters")
        return chapters

    def create_chunks(self, chapters: List[Dict]) -> List[Document]:
        """Create overlapping chunks from chapters.

        Each chapter is split with a SentenceSplitter; every resulting chunk
        becomes a Document carrying the chapter metadata plus a stable
        'chunk_id', its 'chunk_index' and 'total_chunks_in_chapter'.
        """
        print(f"Creating chunks with size={self.chunk_size}, overlap={self.chunk_overlap}")
        documents = []
        splitter = SentenceSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

        for chapter in chapters:
            # Wrap the chapter so the splitter can propagate its metadata.
            chapter_doc = Document(
                text=chapter['content'],
                metadata={
                    'chapter_number': chapter['chapter_number'],
                    'chapter_title': chapter['chapter_title'],
                    'source': 'hpmor.html'
                }
            )

            nodes = splitter.get_nodes_from_documents([chapter_doc])

            # Convert nodes back to Documents with chunk-level metadata added.
            for i, node in enumerate(nodes):
                documents.append(Document(
                    text=node.text,
                    metadata={
                        **chapter_doc.metadata,
                        'chunk_id': f"ch{chapter['chapter_number']}_chunk{i}",
                        'chunk_index': i,
                        'total_chunks_in_chapter': len(nodes)
                    }
                ))

        print(f"Created {len(documents)} chunks total")
        return documents

    def save_processed_data(self, documents: List[Document], chapters: List[Dict]) -> None:
        """Save processed documents and chapter metadata to disk as JSON."""
        # Ensure the cache directory exists before writing into it.
        self.processed_dir.mkdir(parents=True, exist_ok=True)

        # Documents are stored as plain dicts for easy reloading.
        docs_data = [{'text': doc.text, 'metadata': doc.metadata}
                     for doc in documents]

        docs_file = self.processed_dir / 'documents.json'
        with open(docs_file, 'w', encoding='utf-8') as f:
            json.dump(docs_data, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(docs_data)} documents to {docs_file}")

        chapters_file = self.processed_dir / 'chapters.json'
        with open(chapters_file, 'w', encoding='utf-8') as f:
            json.dump(chapters, f, indent=2, ensure_ascii=False)
        print(f"Saved chapter metadata to {chapters_file}")

    def load_processed_data(self) -> Optional[List[Document]]:
        """Load previously processed documents, or None if no cache exists."""
        docs_file = self.processed_dir / 'documents.json'
        if not docs_file.exists():
            return None

        with open(docs_file, 'r', encoding='utf-8') as f:
            docs_data = json.load(f)

        documents = [Document(text=d['text'], metadata=d['metadata'])
                     for d in docs_data]
        print(f"Loaded {len(documents)} documents from cache")
        return documents

    def process(self, force_reprocess: bool = False) -> List[Document]:
        """Main processing pipeline: load from cache or parse/chunk/save.

        Args:
            force_reprocess: When True, ignore any cached documents.

        Raises:
            FileNotFoundError: If the source HTML file is missing.
        """
        if not force_reprocess:
            documents = self.load_processed_data()
            if documents:
                return documents

        print("Processing HPMOR document from scratch...")
        if not config.hpmor_file.exists():
            raise FileNotFoundError(f"HPMOR file not found: {config.hpmor_file}")

        chapters = self.parse_html(config.hpmor_file)
        documents = self.create_chunks(chapters)
        self.save_processed_data(documents, chapters)
        return documents
def main():
    """Process the HPMOR document from scratch and print a sample chunk."""
    processor = HPMORProcessor()
    documents = processor.process(force_reprocess=True)
    print(f"\nProcessing complete! Created {len(documents)} document chunks.")
    # Show a sample chunk so the operator can eyeball the output quality.
    if documents:
        print("\nSample chunk:")
        print(f"Text: {documents[0].text[:200]}...")
        print(f"Metadata: {documents[0].metadata}")


if __name__ == "__main__":
    main()