File size: 956 Bytes
f499d4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
from ingestion.pdf_loader import PDFLoader
from preprocessing.chunker import SemanticChunker


# Smoke-test script: pick one PDF from data/raw/pdfs, load it, chunk it,
# and print a short preview of the first few chunks.

# Resolve paths relative to this file (two directory levels up) rather than
# the current working directory, so the script works from any launch location
# and on Windows.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
PDF_DIR = os.path.join(PROJECT_ROOT, "data", "raw", "pdfs")

# Discover PDFs dynamically. os.listdir() returns entries in arbitrary order,
# so sort them to make the "first" PDF the same on every run and platform.
pdf_files = sorted(
    name for name in os.listdir(PDF_DIR)
    if name.lower().endswith(".pdf")  # case-insensitive: also matches ".PDF"
)

if not pdf_files:
    raise RuntimeError(f"No PDFs found in {PDF_DIR}")

PDF_PATH = os.path.join(PDF_DIR, pdf_files[0])

print(f"Using PDF: {PDF_PATH}")

# Load + chunk. The loader result is indexed by "pages" and "doc_id" below.
loader = PDFLoader(PDF_PATH)
pdf_data = loader.load()

chunker = SemanticChunker()
chunks = chunker.chunk(pdf_data["pages"], pdf_data["doc_id"])

print(f"\nTotal chunks: {len(chunks)}\n")

# Preview at most the first five chunks, numbered from 1.
for i, c in enumerate(chunks[:5], 1):
    print(f"--- Chunk {i} ---")
    # Separate start and end page explicitly; without a separator, pages
    # 1 and 2 would print as "12".
    print(f"Pages: {c['page_start']}-{c['page_end']}")
    print(f"Tokens: {c['token_count']}")
    # Cap the text preview at 800 characters to keep the output readable.
    print(c["text"][:800])