import os
import json
import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pdf_utils import (
extract_text_with_tables,
extract_images_pymupdf,
extract_images_with_captions,
extract_full_page_images
)
# Get project root dynamically ("../.." = two levels up from this file's directory)
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
# All pipeline artifacts (raw PDFs, processed text, embeddings, images) live
# under <project root>/Artifacts.
DATA_DIR = os.path.join(BASE_DIR, "Artifacts")
RAW_PDF_DIR = os.path.join(DATA_DIR, "raw_pdf")  # input PDFs
PROCESSED_TEXT_DIR = os.path.join(DATA_DIR, "processed_text")  # chunk metadata JSON
EMBEDDINGS_DIR = os.path.join(DATA_DIR, "embeddings")  # vector-store output
PAGE_IMAGES_DIR = os.path.join(DATA_DIR, "page_images")  # full-page snapshots
IMAGE_PATH = os.path.join(DATA_DIR, "images")  # inline figure images
IMAGE_CAPTIONS_DIR = os.path.join(DATA_DIR, "image_with_captions")  # images + captions
# NOTE(review): lowercase name among UPPER_SNAKE constants; external callers
# may import it, so it is kept as-is.
pdf_path = os.path.join(RAW_PDF_DIR, "medical_book.pdf")
OUTPUT_JSON_PATH = os.path.join(PROCESSED_TEXT_DIR, "chunks_metadata.json")
# Ensure the output directory exists before any write happens.
os.makedirs(PROCESSED_TEXT_DIR, exist_ok=True)
# ---------------- CHUNKING ----------------
def chunk_combined_content(pages_data, pdf_path, chunk_size=800, overlap=50,
                           mode="recursive", min_merge_chars=50):
    """
    Chunk combined page text (text + tables) into smaller parts.

    Args:
        pages_data (list): Extracted pages; each item is a dict with at least
            ``page_num`` and ``content`` keys.
        pdf_path (str): Path to the source PDF (only its basename is stored).
        chunk_size (int): Target chunk size (recursive mode only).
        overlap (int): Overlap between chunks (recursive mode only).
        mode (str): ``"recursive"`` or ``"sentence"``.
        min_merge_chars (int): Sentence mode only — consecutive sentences are
            merged while the running buffer stays below this length
            (previously a hard-coded 50).

    Returns:
        list: Chunk metadata dicts with keys ``chunk_id``, ``page_num``,
        ``content``, ``pdf_file`` and ``images`` (empty; filled in later by
        the image-merge step).

    Raises:
        ValueError: If ``mode`` is neither "recursive" nor "sentence".
    """
    if mode == "recursive":
        return _chunk_recursive(pages_data, pdf_path, chunk_size, overlap)
    if mode == "sentence":
        return _chunk_sentences(pages_data, pdf_path, min_merge_chars)
    raise ValueError("Invalid mode. Use 'recursive' or 'sentence'.")


def _make_chunk(page_num, index, text, pdf_path):
    """Build one chunk-metadata dict; images are attached in a later step."""
    return {
        "chunk_id": f"{page_num}_{index}",
        "page_num": page_num,
        "content": text.strip(),
        "pdf_file": os.path.basename(pdf_path),
        "images": [],
    }


def _chunk_recursive(pages_data, pdf_path, chunk_size, overlap):
    """Split each page with LangChain's recursive character splitter."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunks = []
    for page in pages_data:
        for i, piece in enumerate(splitter.split_text(page["content"])):
            chunks.append(_make_chunk(page["page_num"], i, piece, pdf_path))
    return chunks


def _chunk_sentences(pages_data, pdf_path, min_merge_chars):
    """Sentence-tokenize each page, merging runs of short sentences."""
    chunks = []
    for page in pages_data:
        sentences = nltk.sent_tokenize(page["content"])
        buffer = ""
        for i, sentence in enumerate(sentences):
            # Keep accumulating while the merged buffer stays short.
            if len(buffer) + len(sentence) < min_merge_chars:
                buffer += " " + sentence
            else:
                if buffer:
                    chunks.append(_make_chunk(page["page_num"], i, buffer, pdf_path))
                buffer = sentence
        if buffer:
            # Flush the trailing buffer; index len(sentences) cannot collide
            # with any loop index used above.
            chunks.append(_make_chunk(page["page_num"], len(sentences), buffer, pdf_path))
    return chunks
# ---------------- MERGE TEXT + IMAGES ----------------
def merge_text_and_images_with_captions(chunks, image_map, page_snapshot_map):
    """
    Attach per-page figure images and the full-page snapshot to each chunk.

    Mutates the chunk dicts in place and returns the same list. Pages absent
    from ``image_map`` get an empty image list; pages absent from
    ``page_snapshot_map`` get ``None`` as their snapshot.
    """
    for entry in chunks:
        page = entry["page_num"]
        raw_paths = image_map.get(page, [])
        entry.update(
            # Normalize Windows separators so stored paths are portable.
            images=[p.replace("\\", "/") for p in raw_paths],
            page_snapshot=page_snapshot_map.get(page),
        )
    return chunks
# ---------------- SAVE JSON ----------------
def save_chunks_to_json(final_data, output_path=OUTPUT_JSON_PATH):
    """
    Write the final chunk metadata (text, tables, image paths) to a JSON file.

    ``final_data`` is serialized with UTF-8 text preserved (no ASCII
    escaping) and a 4-space indent; the destination path is echoed to stdout
    before and after the write.
    """
    print(output_path)
    serialized = json.dumps(final_data, ensure_ascii=False, indent=4)
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write(serialized)
    print(f"Saved chunks metadata to: {output_path}")
if __name__ == "__main__":
    # Use the absolute module-level pdf_path (built from RAW_PDF_DIR) instead
    # of the old cwd-relative "Artifacts/raw_pdf/..." override, so the script
    # works regardless of the current working directory.
    combined_pages, logs = extract_text_with_tables(pdf_path)
    text_chunks = chunk_combined_content(combined_pages, pdf_path, chunk_size=600, mode="sentence")
    image_map = extract_images_pymupdf(pdf_path, IMAGE_PATH)
    page_snapshot_map = extract_full_page_images(pdf_path)
    final_data = merge_text_and_images_with_captions(text_chunks, image_map, page_snapshot_map)
    save_chunks_to_json(final_data, OUTPUT_JSON_PATH)