dnd-rag-g / scripts /rag /ingest_dm_guide.py
alexchilton's picture
refactor: Organize scripts and notebooks into proper directories
44db119
#!/usr/bin/env python3
"""
DM Guide PDF Ingestion Script

Loads the entire DM Guide PDF into ChromaDB with intelligent chunking.
Chunks by page groups and section headers for optimal retrieval.

Usage:
    python ingest_dm_guide.py [--clear] [--pages-per-chunk N]
"""
import argparse
import sys
import re
from pathlib import Path
from typing import List, Dict, Any
# Add project to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
# Import core infrastructure
from dnd_rag_system.core.chroma_manager import ChromaDBManager
from dnd_rag_system.core.base_chunker import Chunk
from dnd_rag_system.config import settings
def extract_text_from_pdf(pdf_path: Path) -> List[Dict[str, Any]]:
    """
    Extract text from DM Guide PDF, organized by pages.

    Pages whose stripped text is 50 characters or shorter are treated as
    mostly empty and left out, so page numbers in the result can have gaps.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        List of dicts with ``page_number`` (1-based) and ``text`` (stripped),
        in document order.

    Raises:
        SystemExit: With status 1 if pdfplumber is not installed or if any
            error occurs while opening or reading the PDF.
    """
    # Imported here rather than at module level so a missing dependency
    # produces an install hint instead of a raw traceback.
    try:
        import pdfplumber
    except ImportError:
        print("❌ pdfplumber not installed. Install with: pip install pdfplumber")
        sys.exit(1)

    print(f"📖 Reading PDF: {pdf_path}")
    pages_data = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            print(f" Total pages: {total_pages}")
            for page_num, page in enumerate(pdf.pages, start=1):
                # Extract text; the truthiness check below also guards
                # against a None/empty result.
                text = page.extract_text()
                if text and len(text.strip()) > 50:  # Skip mostly empty pages
                    pages_data.append({
                        'page_number': page_num,
                        'text': text.strip(),
                    })
                # Progress indicator
                if page_num % 50 == 0:
                    print(f" Processed {page_num}/{total_pages} pages...")
        print(f"✓ Extracted text from {len(pages_data)} pages (skipped empty pages)")
        return pages_data
    except Exception as e:
        # Top-level script boundary: report the problem and stop.
        print(f"❌ Error reading PDF: {e}")
        sys.exit(1)
def detect_section_header(text: str) -> str:
    """
    Try to detect if this page starts with a major section header.

    Only the first five lines of ``text`` are inspected. Each line is
    checked, in order, against two patterns:

    * A short all-caps line (6 to 59 characters) is taken as a header and
      returned title-cased, unless it is a known non-header such as
      ``CONTENTS``.
    * A line beginning with ``Chapter <number>`` (case-insensitive) is
      returned as ``"Chapter N: Title"``, or as just ``"Chapter N"`` when
      nothing but colons/whitespace follows the number.

    Args:
        text: Full text of one page.

    Returns:
        Section name if detected, empty string otherwise
    """
    non_headers = {'CONTENTS', 'INDEX', 'PAGE'}

    for line in text.split('\n')[:5]:  # Check first 5 lines
        line_clean = line.strip()

        # All caps lines that are short (likely headers)
        if line_clean.isupper() and 5 < len(line_clean) < 60:
            # Skip common non-headers
            if line_clean not in non_headers:
                return line_clean.title()

        # Chapter patterns. The (?!\d) lookahead stops the number from being
        # split by backtracking ("Chapter 12" must not become "Chapter 1" +
        # title "2"), and the title group is optional so a bare
        # "Chapter 12" or "Chapter 7:" is still recognised.
        chapter_match = re.match(
            r'^(Chapter\s+\d+)(?!\d)[:\s]*(.*)$', line_clean, re.IGNORECASE
        )
        if chapter_match:
            label = chapter_match.group(1)
            title = chapter_match.group(2).strip()
            return f"{label}: {title}" if title else label

    return ""
def create_chunks_from_pages(pages_data: List[Dict[str, Any]], pages_per_chunk: int = 3) -> List[Chunk]:
    """
    Create chunks from page data.

    Strategy:
    - Group pages into chunks (default 3 pages per chunk for ~1500-2000 tokens)
    - Detect section headers and create metadata
    - Add page numbers for reference

    A chunk is labelled with the section in effect on its first page. A
    header found on a later page of the chunk does not relabel that chunk,
    but it becomes the current section for the chunks that follow, so
    sections starting mid-chunk are no longer missed.

    Args:
        pages_data: List of page dictionaries with ``page_number`` and
            ``text`` keys
        pages_per_chunk: How many pages to combine per chunk (must be >= 1)

    Returns:
        List of Chunk objects

    Raises:
        ValueError: If ``pages_per_chunk`` is smaller than 1.
    """
    if pages_per_chunk < 1:
        # Zero would produce no chunks at all and a negative value would
        # slice the page list backwards; reject both explicitly.
        raise ValueError(f"pages_per_chunk must be >= 1, got {pages_per_chunk}")

    chunks = []
    current_section = "Introduction"
    print(f"\n📦 Creating chunks ({pages_per_chunk} pages per chunk)...")

    for start in range(0, len(pages_data), pages_per_chunk):
        # Get pages for this chunk
        chunk_pages = pages_data[start:start + pages_per_chunk]

        # A header on the first page relabels this chunk
        section_header = detect_section_header(chunk_pages[0]['text'])
        if section_header:
            current_section = section_header
        chunk_section = current_section

        # A header on a later page applies to the chunks that follow
        for later_page in chunk_pages[1:]:
            later_header = detect_section_header(later_page['text'])
            if later_header:
                current_section = later_header

        # Combine text from all pages in chunk
        combined_text = "\n\n".join(
            f"[Page {p['page_number']}]\n{p['text']}"
            for p in chunk_pages
        )

        # Create metadata
        page_numbers = [p['page_number'] for p in chunk_pages]
        metadata = {
            'source': 'dm_guide',
            'section': chunk_section,
            'page_start': page_numbers[0],
            'page_end': page_numbers[-1],
            'content_type': 'dm_guide'
        }

        # Create tags; only the first matching theme below adds extra tags
        tags = {'dm_guide', 'rules'}
        section_lower = chunk_section.lower()
        if 'magic item' in section_lower or 'treasure' in section_lower:
            tags.add('magic_items')
            tags.add('treasure')
        elif 'combat' in section_lower:
            tags.add('combat')
        elif 'monster' in section_lower or 'creature' in section_lower:
            tags.add('monsters')
        elif 'encounter' in section_lower:
            tags.add('encounters')

        # Create chunk with section header emphasized
        chunk_content = f"DM GUIDE - {chunk_section}\n\n{combined_text}"
        chunks.append(Chunk(
            content=chunk_content,
            chunk_type='dm_guide_section',
            metadata=metadata,
            tags=tags
        ))

        # Progress
        if (len(chunks) % 20) == 0:
            print(f" Created {len(chunks)} chunks (pages {page_numbers[0]}-{page_numbers[-1]})")

    print(f"✓ Created {len(chunks)} total chunks")
    return chunks
def detect_magic_items_in_chunk(chunk: Chunk) -> bool:
    """
    Decide heuristically whether a chunk looks like magic item content.

    The chunk's lower-cased content is searched for a fixed list of
    indicator phrases. When two or more distinct phrases occur, the chunk
    is modified in place: the ``magic_items`` tag is added and
    ``metadata['contains_magic_items']`` is set to True.

    Args:
        chunk: Chunk to inspect (and update on a positive result).

    Returns:
        True if the chunk was flagged, False otherwise.
    """
    haystack = chunk.content.lower()

    # Phrases treated as signs of magic item descriptions
    indicator_phrases = (
        'wondrous item',
        'requires attunement',
        'uncommon',
        'rare',
        'very rare',
        'legendary',
        'ring of',
        'cloak of',
        'boots of',
        '+1 ',
        '+2 ',
        '+3 ',
        'potion of',
        'scroll of',
    )

    # Each phrase counts once, however often it occurs
    hit_count = 0
    for phrase in indicator_phrases:
        if phrase in haystack:
            hit_count += 1

    # Fewer than 2 indicators: not considered magic item content
    if hit_count < 2:
        return False

    chunk.tags.add('magic_items')
    chunk.metadata['contains_magic_items'] = True
    return True
def load_dm_guide(db_manager: ChromaDBManager, clear: bool = False, pages_per_chunk: int = 3) -> int:
    """
    Load DM Guide into ChromaDB.

    The PDF is looked up at ``dnd_rag_system/data/reference/dm_guide.pdf``
    under the repository root (three levels above this script, the same
    root that is put on ``sys.path`` for the ``dnd_rag_system`` imports).
    If it is not there, the same relative path next to this script is tried
    as a fallback.

    When ``clear`` is set, the existing collection is only cleared once the
    replacement chunks have been built, so a missing or unreadable PDF does
    not wipe previously ingested data.

    Args:
        db_manager: ChromaDB manager instance; ``clear_collection`` and
            ``add_chunks`` are called on it
        clear: Whether to clear existing collection
        pages_per_chunk: How many pages to combine per chunk

    Returns:
        Number of chunks added to the collection.

    Raises:
        SystemExit: With status 1 if the PDF is missing, no text could be
            extracted, or no chunks were created.
    """
    print("\n" + "="*70)
    print("📚 LOADING DM GUIDE")
    print("="*70)
    collection_name = 'dm_guide'

    # Check if PDF exists: repo root first, script directory as fallback
    relative_pdf = Path("dnd_rag_system") / "data" / "reference" / "dm_guide.pdf"
    script_dir = Path(__file__).resolve().parent
    pdf_path = script_dir.parent.parent / relative_pdf
    if not pdf_path.exists():
        legacy_path = script_dir / relative_pdf
        if legacy_path.exists():
            pdf_path = legacy_path
    if not pdf_path.exists():
        print(f"❌ DM Guide PDF not found: {pdf_path}")
        sys.exit(1)

    # Extract text
    pages_data = extract_text_from_pdf(pdf_path)
    if not pages_data:
        print("❌ No text extracted from PDF")
        sys.exit(1)

    # Create chunks
    chunks = create_chunks_from_pages(pages_data, pages_per_chunk)

    # Enhanced: Detect magic items in chunks (flagged chunks are updated in place)
    print("\n🔍 Analyzing chunks for magic items...")
    magic_item_chunks = 0
    for chunk in chunks:
        if detect_magic_items_in_chunk(chunk):
            magic_item_chunks += 1
    print(f"✓ Detected {magic_item_chunks} chunks containing magic items")

    if not chunks:
        print("❌ No chunks created")
        sys.exit(1)

    # Clear if requested -- deliberately deferred until new data is ready
    if clear:
        print(f"\n🗑️ Clearing existing '{collection_name}' collection...")
        db_manager.clear_collection(collection_name)

    # Add to ChromaDB
    print(f"\n💾 Adding {len(chunks)} chunks to ChromaDB...")
    db_manager.add_chunks(collection_name, chunks)
    print(f"✅ Successfully loaded {len(chunks)} chunks into '{collection_name}' collection")
    return len(chunks)
def _positive_int(value: str) -> int:
    """argparse ``type=`` callable: parse ``value`` as an integer >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be >= 1, got {number}")
    return number


def main():
    """
    Command-line entry point: ingest the DM Guide and print a summary.

    Parses ``--clear`` and ``--pages-per-chunk``, creates a
    ``ChromaDBManager``, runs ``load_dm_guide`` and then prints the chunk
    count and the statistics reported for the ``dm_guide`` collection.

    ``--pages-per-chunk`` must be a positive integer; other values are
    rejected by argparse (exit status 2) before any work is done.
    """
    parser = argparse.ArgumentParser(description='Ingest DM Guide PDF into ChromaDB')
    parser.add_argument('--clear', action='store_true', help='Clear existing dm_guide collection')
    parser.add_argument('--pages-per-chunk', type=_positive_int, default=3,
                        help='Pages per chunk (default: 3, ~1500-2000 tokens)')
    args = parser.parse_args()

    print("\n" + "="*70)
    print("🎲 DM GUIDE INGESTION")
    print("="*70)

    # Initialize ChromaDB
    print("\n🔧 Initializing ChromaDB...")
    db_manager = ChromaDBManager()

    # Load DM Guide
    chunk_count = load_dm_guide(
        db_manager,
        clear=args.clear,
        pages_per_chunk=args.pages_per_chunk
    )

    # Show stats
    print("\n" + "="*70)
    print("📊 INGESTION SUMMARY")
    print("="*70)
    print(f" Total chunks: {chunk_count}")
    print(f" Pages per chunk: {args.pages_per_chunk}")

    # Collection stats
    print("\n📈 Collection Statistics:")
    stats = db_manager.get_collection_stats('dm_guide')
    print(f" dm_guide: {stats.get('total_documents', 0)} documents")
    if stats.get('chunk_types'):
        print("\n Chunk types:")
        for chunk_type, count in stats['chunk_types'].items():
            print(f" {chunk_type}: {count}")

    print("\n🎉 DM Guide ingestion complete!")
    print(f" Database: {db_manager.persist_dir}")
    print("\n💡 Next steps:")
    print(" - Test search: python query_rag.py")
    print(" - Query example: 'Ring of Protection'")
    print(" - Query example: 'magic items for wizards'")
# Run the ingestion only when this file is executed as a script, so that
# importing the module does not start it.
if __name__ == '__main__':
    main()