Spaces:

alexchilton
/

dnd-rag-g

Build error

App Files Files Community

dnd-rag-g / scripts /rag /ingest_dm_guide.py

alexchilton

refactor: Organize scripts and notebooks into proper directories

44db119 6 days ago

raw

history blame contribute delete

9.76 kB

	#!/usr/bin/env python3
	"""
	DM Guide PDF Ingestion Script

	Loads the entire DM Guide PDF into ChromaDB with intelligent chunking.
	Chunks by page groups and section headers for optimal retrieval.

	Usage:
	python ingest_dm_guide.py [--clear]
	"""

	import argparse
	import sys
	import re
	from pathlib import Path
	from typing import List, Dict, Any

	# Add project to path
	project_root = Path(__file__).parent.parent.parent
	sys.path.insert(0, str(project_root))

	# Import core infrastructure
	from dnd_rag_system.core.chroma_manager import ChromaDBManager
	from dnd_rag_system.core.base_chunker import Chunk
	from dnd_rag_system.config import settings


	def extract_text_from_pdf(pdf_path: Path) -> List[Dict[str, Any]]:
	"""
	Extract text from DM Guide PDF, organized by pages.

	Returns:
	List of dicts with page_number and text
	"""
	try:
	import pdfplumber
	except ImportError:
	print("❌ pdfplumber not installed. Install with: pip install pdfplumber")
	sys.exit(1)

	print(f"📖 Reading PDF: {pdf_path}")

	pages_data = []

	try:
	with pdfplumber.open(pdf_path) as pdf:
	total_pages = len(pdf.pages)
	print(f" Total pages: {total_pages}")

	for i, page in enumerate(pdf.pages):
	page_num = i + 1

	# Extract text
	text = page.extract_text()

	if text and len(text.strip()) > 50: # Skip mostly empty pages
	pages_data.append({
	'page_number': page_num,
	'text': text.strip()
	})

	# Progress indicator
	if page_num % 50 == 0:
	print(f" Processed {page_num}/{total_pages} pages...")

	print(f"✓ Extracted text from {len(pages_data)} pages (skipped empty pages)")
	return pages_data

	except Exception as e:
	print(f"❌ Error reading PDF: {e}")
	sys.exit(1)


	def detect_section_header(text: str) -> str:
	"""
	Try to detect if this page starts with a major section header.

	Returns:
	Section name if detected, empty string otherwise
	"""
	# Common DM Guide section patterns
	lines = text.split('\n')[:5] # Check first 5 lines

	for line in lines:
	line_clean = line.strip()

	# All caps lines that are short (likely headers)
	if line_clean.isupper() and 5 < len(line_clean) < 60:
	# Skip common non-headers
	if line_clean not in ['CONTENTS', 'INDEX', 'PAGE']:
	return line_clean.title()

	# Chapter patterns
	chapter_match = re.match(r'^(Chapter\s+\d+)[:\s]*(.+?)$', line_clean, re.IGNORECASE)
	if chapter_match:
	return f"{chapter_match.group(1)}: {chapter_match.group(2)}"

	return ""


	def create_chunks_from_pages(pages_data: List[Dict[str, Any]], pages_per_chunk: int = 3) -> List[Chunk]:
	"""
	Create chunks from page data.

	Strategy:
	- Group pages into chunks (default 3 pages per chunk for ~1500-2000 tokens)
	- Detect section headers and create metadata
	- Add page numbers for reference

	Args:
	pages_data: List of page dictionaries
	pages_per_chunk: How many pages to combine per chunk

	Returns:
	List of Chunk objects
	"""
	chunks = []
	current_section = "Introduction"

	print(f"\n📦 Creating chunks ({pages_per_chunk} pages per chunk)...")

	i = 0
	while i < len(pages_data):
	# Get pages for this chunk
	chunk_pages = pages_data[i:i + pages_per_chunk]

	if not chunk_pages:
	break

	# Check if first page has a section header
	first_page_text = chunk_pages[0]['text']
	section_header = detect_section_header(first_page_text)

	if section_header:
	current_section = section_header

	# Combine text from all pages in chunk
	combined_text = "\n\n".join([
	f"[Page {p['page_number']}]\n{p['text']}"
	for p in chunk_pages
	])

	# Create metadata
	page_numbers = [p['page_number'] for p in chunk_pages]
	metadata = {
	'source': 'dm_guide',
	'section': current_section,
	'page_start': page_numbers[0],
	'page_end': page_numbers[-1],
	'content_type': 'dm_guide'
	}

	# Create tags
	tags = {'dm_guide', 'rules'}

	# Add section-based tags
	section_lower = current_section.lower()
	if 'magic item' in section_lower or 'treasure' in section_lower:
	tags.add('magic_items')
	tags.add('treasure')
	elif 'combat' in section_lower:
	tags.add('combat')
	elif 'monster' in section_lower or 'creature' in section_lower:
	tags.add('monsters')
	elif 'encounter' in section_lower:
	tags.add('encounters')

	# Create chunk with section header emphasized
	chunk_content = f"DM GUIDE - {current_section}\n\n{combined_text}"

	chunk = Chunk(
	content=chunk_content,
	chunk_type='dm_guide_section',
	metadata=metadata,
	tags=tags
	)

	chunks.append(chunk)

	# Progress
	if (len(chunks) % 20) == 0:
	print(f" Created {len(chunks)} chunks (pages {page_numbers[0]}-{page_numbers[-1]})")

	i += pages_per_chunk

	print(f"✓ Created {len(chunks)} total chunks")
	return chunks


	def detect_magic_items_in_chunk(chunk: Chunk) -> bool:
	"""
	Heuristic to detect if a chunk likely contains magic item descriptions.
	Updates chunk tags if detected.
	"""
	text_lower = chunk.content.lower()

	# Magic item indicators
	indicators = [
	'wondrous item',
	'requires attunement',
	'uncommon',
	'rare',
	'very rare',
	'legendary',
	'ring of',
	'cloak of',
	'boots of',
	'+1 ',
	'+2 ',
	'+3 ',
	'potion of',
	'scroll of'
	]

	# Count matches
	matches = sum(1 for indicator in indicators if indicator in text_lower)

	if matches >= 2: # At least 2 indicators = likely magic item content
	chunk.tags.add('magic_items')
	chunk.metadata['contains_magic_items'] = True
	return True

	return False


	def load_dm_guide(db_manager: ChromaDBManager, clear: bool = False, pages_per_chunk: int = 3):
	"""
	Load DM Guide into ChromaDB.

	Args:
	db_manager: ChromaDB manager instance
	clear: Whether to clear existing collection
	pages_per_chunk: How many pages to combine per chunk
	"""
	print("\n" + "="*70)
	print("📚 LOADING DM GUIDE")
	print("="*70)

	collection_name = 'dm_guide'

	# Clear if requested
	if clear:
	print(f"\n🗑️ Clearing existing '{collection_name}' collection...")
	db_manager.clear_collection(collection_name)

	# Check if PDF exists
	pdf_path = Path(__file__).parent / "dnd_rag_system" / "data" / "reference" / "dm_guide.pdf"

	if not pdf_path.exists():
	print(f"❌ DM Guide PDF not found: {pdf_path}")
	sys.exit(1)

	# Extract text
	pages_data = extract_text_from_pdf(pdf_path)

	if not pages_data:
	print("❌ No text extracted from PDF")
	sys.exit(1)

	# Create chunks
	chunks = create_chunks_from_pages(pages_data, pages_per_chunk)

	# Enhanced: Detect magic items in chunks
	print("\n🔍 Analyzing chunks for magic items...")
	magic_item_chunks = 0
	for chunk in chunks:
	if detect_magic_items_in_chunk(chunk):
	magic_item_chunks += 1
	print(f"✓ Detected {magic_item_chunks} chunks containing magic items")

	# Add to ChromaDB
	if chunks:
	print(f"\n💾 Adding {len(chunks)} chunks to ChromaDB...")
	db_manager.add_chunks(collection_name, chunks)
	print(f"✅ Successfully loaded {len(chunks)} chunks into '{collection_name}' collection")
	else:
	print("❌ No chunks created")
	sys.exit(1)

	return len(chunks)


	def main():
	"""Main function."""
	parser = argparse.ArgumentParser(description='Ingest DM Guide PDF into ChromaDB')
	parser.add_argument('--clear', action='store_true', help='Clear existing dm_guide collection')
	parser.add_argument('--pages-per-chunk', type=int, default=3,
	help='Pages per chunk (default: 3, ~1500-2000 tokens)')
	args = parser.parse_args()

	print("\n" + "="*70)
	print("🎲 DM GUIDE INGESTION")
	print("="*70)

	# Initialize ChromaDB
	print("\n🔧 Initializing ChromaDB...")
	db_manager = ChromaDBManager()

	# Load DM Guide
	chunk_count = load_dm_guide(
	db_manager,
	clear=args.clear,
	pages_per_chunk=args.pages_per_chunk
	)

	# Show stats
	print("\n" + "="*70)
	print("📊 INGESTION SUMMARY")
	print("="*70)
	print(f" Total chunks: {chunk_count}")
	print(f" Pages per chunk: {args.pages_per_chunk}")

	# Collection stats
	print("\n📈 Collection Statistics:")
	stats = db_manager.get_collection_stats('dm_guide')
	print(f" dm_guide: {stats.get('total_documents', 0)} documents")

	if stats.get('chunk_types'):
	print("\n Chunk types:")
	for chunk_type, count in stats['chunk_types'].items():
	print(f" {chunk_type}: {count}")

	print("\n🎉 DM Guide ingestion complete!")
	print(f" Database: {db_manager.persist_dir}")

	print("\n💡 Next steps:")
	print(" - Test search: python query_rag.py")
	print(" - Query example: 'Ring of Protection'")
	print(" - Query example: 'magic items for wizards'")


	if __name__ == '__main__':
	main()