Spaces:
Sleeping
Sleeping
import os

from ingestion.pdf_loader import PDFLoader
from preprocessing.chunker import SemanticChunker

# Smoke-test script: pick one PDF from the project's raw-data directory, load
# it, chunk it, and print a short preview of the first few chunks.

# Project-robust path handling (Windows-safe): the project root is taken to be
# the parent of the directory containing this file, and every path is built
# with os.path.join instead of hard-coded separators.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
PDF_DIR = os.path.join(PROJECT_ROOT, "data", "raw", "pdfs")

# Discover PDFs dynamically (extension match is case-insensitive).
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the "first PDF" choice below reproducible across runs and machines.
pdf_files = sorted(
    name for name in os.listdir(PDF_DIR)
    if name.lower().endswith(".pdf")
)

if not pdf_files:
    raise RuntimeError(f"No PDFs found in {PDF_DIR}")

PDF_PATH = os.path.join(PDF_DIR, pdf_files[0])
print(f"Using PDF: {PDF_PATH}")

# Load + chunk. The loader result is read through its "pages" and "doc_id"
# keys, which are passed straight to the chunker.
loader = PDFLoader(PDF_PATH)
pdf_data = loader.load()

chunker = SemanticChunker()
chunks = chunker.chunk(pdf_data["pages"], pdf_data["doc_id"])

print(f"\nTotal chunks: {len(chunks)}\n")

# Preview at most the first five chunks; the text is capped at 800 characters
# to keep the console output readable.
for i, c in enumerate(chunks[:5], 1):
    print(f"--- Chunk {i} ---")
    print(f"Pages: {c['page_start']}–{c['page_end']}")
    print(f"Tokens: {c['token_count']}")
    print(c["text"][:800])