File size: 5,339 Bytes
1713970
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import json
import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pdf_utils import (
    extract_text_with_tables,
    extract_images_pymupdf,
    extract_images_with_captions,
    extract_full_page_images
)

# Project root, resolved from this file's location: two directories above
# the folder that contains this module.
BASE_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)

# Root folder for all artifacts referenced by this module.
DATA_DIR = os.path.join(BASE_DIR, "Artifacts")

# Source PDF and the folder it lives in.
RAW_PDF_DIR = os.path.join(DATA_DIR, "raw_pdf")
pdf_path = os.path.join(RAW_PDF_DIR, "medical_book.pdf")

# Chunk metadata (JSON) location.
PROCESSED_TEXT_DIR = os.path.join(DATA_DIR, "processed_text")
OUTPUT_JSON_PATH = os.path.join(PROCESSED_TEXT_DIR, "chunks_metadata.json")

# Remaining artifact sub-folders.
EMBEDDINGS_DIR = os.path.join(DATA_DIR, "embeddings")
PAGE_IMAGES_DIR = os.path.join(DATA_DIR, "page_images")
IMAGE_PATH = os.path.join(DATA_DIR, "images")
IMAGE_CAPTIONS_DIR = os.path.join(DATA_DIR, "image_with_captions")

# Only the processed-text folder is created eagerly, at import time.
os.makedirs(PROCESSED_TEXT_DIR, exist_ok=True)

# ---------------- CHUNKING ----------------
def chunk_combined_content(pages_data, pdf_path, chunk_size=800, overlap=50, mode="recursive"):
    """
    Split each page's combined text (text + tables) into chunk records.

    Every record is a dict with the keys ``chunk_id``, ``page_num``,
    ``content``, ``pdf_file`` (base name of ``pdf_path``) and ``images``
    (always an empty list here).

    Args:
        pages_data (list): Page dicts; each must provide ``"page_num"`` and
            ``"content"``.
        pdf_path (str): Path to the PDF file; only its base name is stored.
        chunk_size (int): Chunk size handed to the recursive splitter
            (unused in "sentence" mode).
        overlap (int): Chunk overlap handed to the recursive splitter
            (unused in "sentence" mode).
        mode (str): "recursive" or "sentence".

    Returns:
        list: List of chunk metadata dictionaries, in page order.

    Raises:
        ValueError: If ``mode`` is neither "recursive" nor "sentence".
    """
    if mode not in ("recursive", "sentence"):
        raise ValueError("Invalid mode. Use 'recursive' or 'sentence'.")

    records = []

    def emit(page, suffix, text):
        # Single place where a chunk record is assembled and collected.
        records.append({
            "chunk_id": f"{page['page_num']}_{suffix}",
            "page_num": page["page_num"],
            "content": text.strip(),
            "pdf_file": os.path.basename(pdf_path),
            "images": []
        })

    if mode == "recursive":
        # Character-based splitting; chunk ids are "<page>_<running index>".
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
        for page in pages_data:
            pieces = splitter.split_text(page["content"])
            for idx, piece in enumerate(pieces):
                emit(page, idx, piece)
        return records

    # Sentence mode: sentences are accumulated while the pending text plus
    # the next sentence stays under 50 characters; otherwise the pending
    # text is flushed and the current sentence starts a new accumulation.
    for page in pages_data:
        sentences = nltk.sent_tokenize(page["content"])
        pending = ""
        for idx, sentence in enumerate(sentences):
            if len(pending) + len(sentence) >= 50:
                if pending:
                    # The id suffix is the index of the sentence that
                    # triggered the flush, so suffixes may skip numbers.
                    emit(page, idx, pending)
                pending = sentence
            else:
                pending += " " + sentence
        if pending:
            # Whatever is left at the end of the page becomes the last chunk.
            emit(page, len(sentences), pending)

    return records

# ---------------- MERGE TEXT + IMAGES ----------------
def merge_text_and_images_with_captions(chunks, image_map, page_snapshot_map):
    """
    Attach per-page image paths and the page snapshot to every chunk.

    Each chunk dict is updated in place: ``"images"`` is replaced by the
    paths registered for the chunk's page in ``image_map`` (backslashes
    converted to forward slashes; empty list when the page has no entry),
    and ``"page_snapshot"`` is set to the page's entry in
    ``page_snapshot_map`` (``None`` when missing). Caption merging is
    currently disabled, so no ``"captions"`` key is written.

    Returns the same ``chunks`` list that was passed in.
    """
    for record in chunks:
        page = record["page_num"]

        # Inline figure images, with path separators normalized to "/".
        normalized = []
        for raw_path in image_map.get(page, []):
            normalized.append(raw_path.replace("\\", "/"))
        record["images"] = normalized

        # Full-page snapshot for the chunk's page, if any.
        record["page_snapshot"] = page_snapshot_map.get(page)

    return chunks

# ---------------- SAVE JSON ----------------
def save_chunks_to_json(final_data, output_path=OUTPUT_JSON_PATH):
    """
    Save the final chunked data with tables + images to JSON.

    The file is written as UTF-8 with non-ASCII characters kept verbatim
    and a 4-space indent. The parent directory of ``output_path`` is
    created when it does not exist yet, so callers may pass a location
    other than the default one.

    Args:
        final_data: JSON-serializable data to write.
        output_path (str): Destination file; defaults to OUTPUT_JSON_PATH.
    """
    print(output_path)

    # Only the default folder is created at import time; make sure a custom
    # destination folder exists too. A bare file name has no parent part.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_data, f, ensure_ascii=False, indent=4)
    print(f"Saved chunks metadata to: {output_path}")

if __name__ == "__main__":
    # Run the pipeline on the module-level ``pdf_path`` (built from BASE_DIR)
    # instead of a path relative to the current working directory, so the
    # input is resolved the same way as IMAGE_PATH and OUTPUT_JSON_PATH and
    # the script can be launched from any directory.

    # Text + tables per page; ``logs`` is returned by the helper but unused.
    combined_pages, logs = extract_text_with_tables(pdf_path)

    # chunk_size has no effect in "sentence" mode; kept as in the original call.
    text_chunks = chunk_combined_content(combined_pages, pdf_path, chunk_size=600, mode="sentence")

    # Page-keyed lookups consumed by the merge step below.
    image_map = extract_images_pymupdf(pdf_path, IMAGE_PATH)
    # Caption extraction is currently disabled.
    page_snapshot_map = extract_full_page_images(pdf_path)

    final_data = merge_text_and_images_with_captions(text_chunks, image_map, page_snapshot_map)
    save_chunks_to_json(final_data, OUTPUT_JSON_PATH)