"""Chunk a PDF's text+tables into metadata records and attach extracted images.

Pipeline: extract text/tables per page -> chunk (recursive or sentence mode)
-> attach per-page figure images and full-page snapshots -> dump to JSON.
"""

import os
import json

import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter

from pdf_utils import (
    extract_text_with_tables,
    extract_images_pymupdf,
    extract_images_with_captions,
    extract_full_page_images,
)

# Get project root dynamically (two levels up from this file).
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
DATA_DIR = os.path.join(BASE_DIR, "Artifacts")
RAW_PDF_DIR = os.path.join(DATA_DIR, "raw_pdf")
PROCESSED_TEXT_DIR = os.path.join(DATA_DIR, "processed_text")
EMBEDDINGS_DIR = os.path.join(DATA_DIR, "embeddings")
PAGE_IMAGES_DIR = os.path.join(DATA_DIR, "page_images")
IMAGE_PATH = os.path.join(DATA_DIR, "images")
IMAGE_CAPTIONS_DIR = os.path.join(DATA_DIR, "image_with_captions")

pdf_path = os.path.join(RAW_PDF_DIR, "medical_book.pdf")
OUTPUT_JSON_PATH = os.path.join(PROCESSED_TEXT_DIR, "chunks_metadata.json")

os.makedirs(PROCESSED_TEXT_DIR, exist_ok=True)


# ---------------- CHUNKING ----------------
def _chunk_record(page_num, index, content, pdf_path):
    """Build one chunk-metadata dict in the shared output schema."""
    return {
        "chunk_id": f"{page_num}_{index}",
        "page_num": page_num,
        "content": content.strip(),
        "pdf_file": os.path.basename(pdf_path),
        "images": [],
    }


def _ensure_punkt():
    """Download the NLTK punkt tokenizer if it is not already available."""
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)


def chunk_combined_content(pages_data, pdf_path, chunk_size=800, overlap=50,
                           mode="recursive", min_chunk_len=50):
    """
    Chunk combined text (text + tables) into smaller parts.

    Args:
        pages_data (list): Extracted pages with combined text/tables; each
            item must have "page_num" and "content" keys.
        pdf_path (str): Path to the PDF file (only its basename is recorded).
        chunk_size (int): Size of each chunk (recursive mode only).
        overlap (int): Overlap between chunks (recursive mode only).
        mode (str): "recursive" or "sentence".
        min_chunk_len (int): In sentence mode, sentences are merged until the
            buffer reaches at least this many characters (was hard-coded 50).

    Returns:
        list: List of chunk metadata dictionaries.

    Raises:
        ValueError: If ``mode`` is neither "recursive" nor "sentence".
    """
    formatted_chunks = []

    if mode == "recursive":
        # Character-based splitting; chunk_size/overlap apply here only.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=overlap
        )
        for page in pages_data:
            for i, chunk in enumerate(splitter.split_text(page["content"])):
                formatted_chunks.append(
                    _chunk_record(page["page_num"], i, chunk, pdf_path)
                )

    elif mode == "sentence":
        # Sentence splitting; short sentences are merged into one chunk.
        # NOTE: chunk_size/overlap are intentionally ignored in this mode.
        _ensure_punkt()
        for page in pages_data:
            sentences = nltk.sent_tokenize(page["content"])
            buffer = ""
            for i, sentence in enumerate(sentences):
                if len(buffer) + len(sentence) < min_chunk_len:
                    # Keep accumulating until the buffer is long enough.
                    buffer += " " + sentence
                else:
                    if buffer:
                        formatted_chunks.append(
                            _chunk_record(page["page_num"], i, buffer, pdf_path)
                        )
                    buffer = sentence
            if buffer:
                # Flush the trailing buffer; len(sentences) can never collide
                # with any enumerate index used above.
                formatted_chunks.append(
                    _chunk_record(
                        page["page_num"], len(sentences), buffer, pdf_path
                    )
                )

    else:
        raise ValueError("Invalid mode. Use 'recursive' or 'sentence'.")

    return formatted_chunks


# ---------------- MERGE TEXT + IMAGES ----------------
def merge_text_and_images_with_captions(chunks, image_map, page_snapshot_map):
    """
    Add extracted images and page snapshots to chunks (mutates in place).

    Args:
        chunks (list): Chunk dicts produced by ``chunk_combined_content``.
        image_map (dict): page_num -> list of inline figure image paths.
        page_snapshot_map (dict): page_num -> full-page snapshot path (or None).

    Returns:
        list: The same ``chunks`` list, with "images" and "page_snapshot" set.
    """
    for chunk in chunks:
        page_num = chunk["page_num"]

        # Normalize Windows separators so stored paths are portable.
        chunk["images"] = [
            path.replace("\\", "/") for path in image_map.get(page_num, [])
        ]

        # Normalize the snapshot path the same way (was left un-normalized).
        snapshot = page_snapshot_map.get(page_num)
        chunk["page_snapshot"] = (
            snapshot.replace("\\", "/") if snapshot else snapshot
        )

    return chunks


# ---------------- SAVE JSON ----------------
def save_chunks_to_json(final_data, output_path=OUTPUT_JSON_PATH):
    """
    Save the final chunked data with tables + images to JSON.

    Args:
        final_data (list): Chunk dicts to serialize.
        output_path (str): Destination JSON file path.
    """
    # Make sure the destination directory exists even for a custom path.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_data, f, ensure_ascii=False, indent=4)
    print(f"Saved chunks metadata to: {output_path}")


if __name__ == "__main__":
    # Use the absolute, project-rooted path computed above instead of a
    # CWD-relative hard-coded string (which broke runs from other dirs).
    combined_pages, logs = extract_text_with_tables(pdf_path)

    text_chunks = chunk_combined_content(
        combined_pages, pdf_path, chunk_size=600, mode="sentence"
    )

    image_map = extract_images_pymupdf(pdf_path, IMAGE_PATH)
    page_snapshot_map = extract_full_page_images(pdf_path)

    final_data = merge_text_and_images_with_captions(
        text_chunks, image_map, page_snapshot_map
    )

    save_chunks_to_json(final_data, OUTPUT_JSON_PATH)