File size: 9,758 Bytes
021570c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44db119
021570c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7504776
021570c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
#!/usr/bin/env python3
"""
DM Guide PDF Ingestion Script

Loads the entire DM Guide PDF into ChromaDB with intelligent chunking.
Chunks by page groups and section headers for optimal retrieval.

Usage:
    python ingest_dm_guide.py [--clear] [--pages-per-chunk N]
"""

import argparse
import sys
import re
from pathlib import Path
from typing import List, Dict, Any

# Add project to path
# `project_root` is the directory two levels above the directory that
# contains this file. It is placed at the front of sys.path so that it is
# searched first by the `dnd_rag_system` imports that follow.
# NOTE(review): load_dm_guide() locates the PDF relative to
# Path(__file__).parent, a different base directory - confirm that both
# match where this script actually lives.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

# Import core infrastructure
from dnd_rag_system.core.chroma_manager import ChromaDBManager
from dnd_rag_system.core.base_chunker import Chunk
from dnd_rag_system.config import settings


def extract_text_from_pdf(pdf_path: Path) -> List[Dict[str, Any]]:
    """
    Read the DM Guide PDF and collect the text of every non-trivial page.

    Pages that yield no text, or whose stripped text is 50 characters or
    fewer, are left out. The process exits with status 1 when pdfplumber
    cannot be imported or when anything raises while reading the PDF.

    Args:
        pdf_path: Location of the PDF to read

    Returns:
        List of dicts, one per kept page, with keys 'page_number'
        (1-based) and 'text' (the stripped page text)
    """
    # Imported here so a missing dependency is reported with install advice
    try:
        import pdfplumber
    except ImportError:
        print("❌ pdfplumber not installed. Install with: pip install pdfplumber")
        sys.exit(1)

    print(f"πŸ“– Reading PDF: {pdf_path}")

    kept_pages = []

    try:
        with pdfplumber.open(pdf_path) as document:
            page_count = len(document.pages)
            print(f"   Total pages: {page_count}")

            for number, page in enumerate(document.pages, start=1):
                raw_text = page.extract_text()
                stripped = raw_text.strip() if raw_text else ""

                # Skip mostly empty pages
                if len(stripped) > 50:
                    kept_pages.append({'page_number': number, 'text': stripped})

                # Progress indicator every 50 pages
                if number % 50 == 0:
                    print(f"   Processed {number}/{page_count} pages...")

        print(f"βœ“ Extracted text from {len(kept_pages)} pages (skipped empty pages)")
        return kept_pages

    except Exception as e:
        print(f"❌ Error reading PDF: {e}")
        sys.exit(1)


def detect_section_header(text: str) -> str:
    """
    Try to detect if this page starts with a major section header.

    Only the first five lines of the page are inspected. Two shapes are
    recognised, checked line by line:

    - a short all-caps line (6 to 59 characters) other than 'CONTENTS',
      'INDEX' or 'PAGE', which is returned in title case;
    - a line starting with "Chapter <number>" (any letter case), which is
      returned as "Chapter N: <title>", or as just "Chapter N" when
      nothing follows the number.

    Args:
        text: Full text of one page

    Returns:
        Section name if detected, empty string otherwise
    """
    for line in text.split('\n')[:5]:  # Check first 5 lines
        line_clean = line.strip()

        # All caps lines that are short (likely headers), minus common non-headers
        if (line_clean.isupper() and 5 < len(line_clean) < 60
                and line_clean not in ('CONTENTS', 'INDEX', 'PAGE')):
            # str.title() upper-cases the letter after an apostrophe
            # ("MASTER'S" -> "Master'S"), so capitalize word by word and
            # treat an apostrophe (straight or curly) as part of the word.
            return re.sub(
                r"[^\W\d_]+(?:['\u2019][^\W\d_]+)?",
                lambda word: word.group(0).capitalize(),
                line_clean,
            )

        # Chapter patterns. The (?!\d) lookahead stops the number from being
        # split, so a bare "Chapter 12" is not reported as "Chapter 1: 2".
        chapter_match = re.match(r'^(Chapter\s+\d+)(?!\d)[:\s]*(.*)$', line_clean, re.IGNORECASE)
        if chapter_match:
            label, title = chapter_match.groups()
            return f"{label}: {title}" if title else label

    return ""


def create_chunks_from_pages(pages_data: List[Dict[str, Any]], pages_per_chunk: int = 3) -> List[Chunk]:
    """
    Create chunks from page data.

    Strategy:
    - Group consecutive pages into chunks (default 3 pages per chunk for
      ~1500-2000 tokens)
    - Detect a section header on the first page of each group; when none
      is found the section of the previous group is kept (the very first
      section defaults to "Introduction")
    - Record the first and last page number of each group as metadata
    - Add tags derived from the section name

    Args:
        pages_data: List of page dictionaries with 'page_number' and 'text'
        pages_per_chunk: How many pages to combine per chunk (must be >= 1)

    Returns:
        List of Chunk objects

    Raises:
        ValueError: If pages_per_chunk is smaller than 1
    """
    # A zero step produces no chunks and a negative one silently builds a
    # single bogus chunk while dropping pages, so reject both up front.
    if pages_per_chunk < 1:
        raise ValueError(f"pages_per_chunk must be at least 1, got {pages_per_chunk}")

    chunks = []
    current_section = "Introduction"

    print(f"\nπŸ“¦ Creating chunks ({pages_per_chunk} pages per chunk)...")

    for start in range(0, len(pages_data), pages_per_chunk):
        # Get pages for this chunk (never empty: start < len(pages_data))
        chunk_pages = pages_data[start:start + pages_per_chunk]

        # Only the first page of the group is checked for a section header
        section_header = detect_section_header(chunk_pages[0]['text'])
        if section_header:
            current_section = section_header

        # Combine text from all pages in chunk, each prefixed by its page marker
        combined_text = "\n\n".join(
            f"[Page {p['page_number']}]\n{p['text']}"
            for p in chunk_pages
        )

        # Create metadata
        page_numbers = [p['page_number'] for p in chunk_pages]
        metadata = {
            'source': 'dm_guide',
            'section': current_section,
            'page_start': page_numbers[0],
            'page_end': page_numbers[-1],
            'content_type': 'dm_guide'
        }

        # Create tags
        tags = {'dm_guide', 'rules'}

        # Add section-based tags (first matching category wins)
        section_lower = current_section.lower()
        if 'magic item' in section_lower or 'treasure' in section_lower:
            tags.add('magic_items')
            tags.add('treasure')
        elif 'combat' in section_lower:
            tags.add('combat')
        elif 'monster' in section_lower or 'creature' in section_lower:
            tags.add('monsters')
        elif 'encounter' in section_lower:
            tags.add('encounters')

        # Create chunk with section header emphasized
        chunk_content = f"DM GUIDE - {current_section}\n\n{combined_text}"

        chunks.append(Chunk(
            content=chunk_content,
            chunk_type='dm_guide_section',
            metadata=metadata,
            tags=tags
        ))

        # Progress
        if (len(chunks) % 20) == 0:
            print(f"   Created {len(chunks)} chunks (pages {page_numbers[0]}-{page_numbers[-1]})")

    print(f"βœ“ Created {len(chunks)} total chunks")
    return chunks


def detect_magic_items_in_chunk(chunk: Chunk) -> bool:
    """
    Heuristic to detect if a chunk likely contains magic item descriptions.

    The chunk content is lower-cased and searched for a fixed list of
    indicator phrases (plain substring search). When at least two distinct
    indicators are present, 'magic_items' is added to chunk.tags and
    chunk.metadata['contains_magic_items'] is set to True.

    Args:
        chunk: Chunk to inspect; its tags and metadata are updated in place
            when magic item content is detected

    Returns:
        True if the chunk was flagged, False otherwise
    """
    text_lower = chunk.content.lower()

    # Magic item indicators
    indicators = (
        'wondrous item',
        'requires attunement',
        'uncommon',
        'rare',
        'very rare',
        'legendary',
        'ring of',
        'cloak of',
        'boots of',
        '+1 ',
        '+2 ',
        '+3 ',
        'potion of',
        'scroll of',
    )

    matched = {indicator for indicator in indicators if indicator in text_lower}

    # Every 'very rare' also contains 'rare'. Count 'rare' as its own
    # indicator only when it occurs somewhere outside a 'very rare' phrase;
    # otherwise that single phrase would reach the threshold by itself.
    if 'very rare' in matched and 'rare' not in text_lower.replace('very rare', ' '):
        matched.discard('rare')

    if len(matched) >= 2:  # At least 2 indicators = likely magic item content
        chunk.tags.add('magic_items')
        chunk.metadata['contains_magic_items'] = True
        return True

    return False


def load_dm_guide(db_manager: ChromaDBManager, clear: bool = False, pages_per_chunk: int = 3):
    """
    Load DM Guide into ChromaDB.

    The PDF is located, read and chunked first. Only once chunks exist is
    the existing collection cleared (when requested) and the new chunks
    added, so a missing or unreadable PDF never wipes existing data.
    The process exits with status 1 when the PDF is missing, when no text
    can be extracted, or when no chunks are produced.

    Args:
        db_manager: ChromaDB manager instance
        clear: Whether to clear existing collection
        pages_per_chunk: How many pages to combine per chunk

    Returns:
        Number of chunks added to the collection
    """
    print("\n" + "="*70)
    print("πŸ“š LOADING DM GUIDE")
    print("="*70)

    collection_name = 'dm_guide'

    # Check if PDF exists (resolved relative to this file's directory)
    pdf_path = Path(__file__).parent / "dnd_rag_system" / "data" / "reference" / "dm_guide.pdf"

    if not pdf_path.exists():
        print(f"❌ DM Guide PDF not found: {pdf_path}")
        sys.exit(1)

    # Extract text
    pages_data = extract_text_from_pdf(pdf_path)

    if not pages_data:
        print("❌ No text extracted from PDF")
        sys.exit(1)

    # Create chunks
    chunks = create_chunks_from_pages(pages_data, pages_per_chunk)

    # Enhanced: Detect magic items in chunks
    print("\nπŸ” Analyzing chunks for magic items...")
    magic_item_chunks = sum(1 for chunk in chunks if detect_magic_items_in_chunk(chunk))
    print(f"βœ“ Detected {magic_item_chunks} chunks containing magic items")

    if not chunks:
        print("❌ No chunks created")
        sys.exit(1)

    # Clear if requested - deliberately deferred until there is replacement
    # data, so every failure path above leaves the old collection intact.
    if clear:
        print(f"\nπŸ—‘οΈ  Clearing existing '{collection_name}' collection...")
        db_manager.clear_collection(collection_name)

    # Add to ChromaDB
    print(f"\nπŸ’Ύ Adding {len(chunks)} chunks to ChromaDB...")
    db_manager.add_chunks(collection_name, chunks)
    print(f"βœ… Successfully loaded {len(chunks)} chunks into '{collection_name}' collection")

    return len(chunks)


def main():
    """
    Command-line entry point.

    Parses --clear and --pages-per-chunk, rejects a non-positive page
    count before any database work starts, runs the ingestion through
    load_dm_guide() and prints a summary with the collection statistics
    reported by the ChromaDB manager.
    """
    parser = argparse.ArgumentParser(description='Ingest DM Guide PDF into ChromaDB')
    parser.add_argument('--clear', action='store_true', help='Clear existing dm_guide collection')
    parser.add_argument('--pages-per-chunk', type=int, default=3,
                       help='Pages per chunk (default: 3, ~1500-2000 tokens)')
    args = parser.parse_args()

    # Zero or negative group sizes cannot produce sensible chunks; fail with
    # a usage error (exit status 2) before ChromaDB is initialised.
    if args.pages_per_chunk < 1:
        parser.error("--pages-per-chunk must be at least 1")

    print("\n" + "="*70)
    print("🎲 DM GUIDE INGESTION")
    print("="*70)

    # Initialize ChromaDB
    print("\nπŸ”§ Initializing ChromaDB...")
    db_manager = ChromaDBManager()

    # Load DM Guide
    chunk_count = load_dm_guide(
        db_manager,
        clear=args.clear,
        pages_per_chunk=args.pages_per_chunk
    )

    # Show stats
    print("\n" + "="*70)
    print("πŸ“Š INGESTION SUMMARY")
    print("="*70)
    print(f"  Total chunks: {chunk_count}")
    print(f"  Pages per chunk: {args.pages_per_chunk}")

    # Collection stats
    print("\nπŸ“ˆ Collection Statistics:")
    stats = db_manager.get_collection_stats('dm_guide')
    print(f"  dm_guide: {stats.get('total_documents', 0)} documents")

    if stats.get('chunk_types'):
        print("\n  Chunk types:")
        for chunk_type, count in stats['chunk_types'].items():
            print(f"    {chunk_type}: {count}")

    print("\nπŸŽ‰ DM Guide ingestion complete!")
    print(f"   Database: {db_manager.persist_dir}")

    print("\nπŸ’‘ Next steps:")
    print("   - Test search: python query_rag.py")
    print("   - Query example: 'Ring of Protection'")
    print("   - Query example: 'magic items for wizards'")


if __name__ == '__main__':
    main()