import os
import json
import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pdf_utils import (
    extract_text_with_tables,
    extract_images_pymupdf,
    extract_images_with_captions,
    extract_full_page_images
)

# nltk.sent_tokenize needs the "punkt" tokenizer models; fetch them if missing
nltk.download("punkt", quiet=True)
# Resolve the project root dynamically (two directories above this file)
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
DATA_DIR = os.path.join(BASE_DIR, "Artifacts")
RAW_PDF_DIR = os.path.join(DATA_DIR, "raw_pdf")
PROCESSED_TEXT_DIR = os.path.join(DATA_DIR, "processed_text")
EMBEDDINGS_DIR = os.path.join(DATA_DIR, "embeddings")
PAGE_IMAGES_DIR = os.path.join(DATA_DIR, "page_images")
IMAGE_PATH = os.path.join(DATA_DIR, "images")
IMAGE_CAPTIONS_DIR = os.path.join(DATA_DIR, "image_with_captions")

pdf_path = os.path.join(RAW_PDF_DIR, "medical_book.pdf")
OUTPUT_JSON_PATH = os.path.join(PROCESSED_TEXT_DIR, "chunks_metadata.json")

os.makedirs(PROCESSED_TEXT_DIR, exist_ok=True)
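# Assumed Artifacts/ layout implied by the constants above (a sketch, not verified
# against the repo: only raw_pdf/medical_book.pdf must exist up front; the other
# folders are presumably filled by this script and by pdf_utils):
#
#   Artifacts/
#       raw_pdf/medical_book.pdf      # input PDF
#       processed_text/               # chunks_metadata.json written here
#       images/                       # inline figures (extract_images_pymupdf)
#       page_images/                  # full-page snapshots (assumed)
#       image_with_captions/          # figure + caption pairs (assumed)
#       embeddings/                   # downstream vector store (assumed)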
# ---------------- CHUNKING ----------------
def chunk_combined_content(pages_data, pdf_path, chunk_size=800, overlap=50, mode="recursive"):
    """
    Chunk combined text (text + tables) into smaller parts.

    Args:
        pages_data (list): Extracted pages with combined text/tables.
        pdf_path (str): Path to the PDF file.
        chunk_size (int): Size of each chunk (recursive mode only).
        overlap (int): Overlap between chunks (recursive mode only).
        mode (str): "recursive" or "sentence".

    Returns:
        list: List of chunk metadata dictionaries.
    """
    formatted_chunks = []

    if mode == "recursive":
        # Recursive character splitter
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
        for page in pages_data:
            chunks = splitter.split_text(page["content"])
            for i, chunk in enumerate(chunks):
                formatted_chunks.append({
                    "chunk_id": f"{page['page_num']}_{i}",
                    "page_num": page["page_num"],
                    "content": chunk.strip(),
                    "pdf_file": os.path.basename(pdf_path),
                    "images": []
                })

    elif mode == "sentence":
        # Sentence splitting, merging sentences while the buffer stays short
        for page in pages_data:
            sentences = nltk.sent_tokenize(page["content"])
            buffer = ""
            for i, sentence in enumerate(sentences):
                # Keep accumulating while the combined length stays under 50 chars
                if len(buffer) + len(sentence) < 50:
                    buffer += " " + sentence
                else:
                    if buffer:
                        formatted_chunks.append({
                            "chunk_id": f"{page['page_num']}_{i}",
                            "page_num": page["page_num"],
                            "content": buffer.strip(),
                            "pdf_file": os.path.basename(pdf_path),
                            "images": []
                        })
                    buffer = sentence
            # Flush whatever is left in the buffer for this page
            if buffer:
                formatted_chunks.append({
                    "chunk_id": f"{page['page_num']}_{len(sentences)}",
                    "page_num": page["page_num"],
                    "content": buffer.strip(),
                    "pdf_file": os.path.basename(pdf_path),
                    "images": []
                })
    else:
        raise ValueError("Invalid mode. Use 'recursive' or 'sentence'.")

    return formatted_chunks
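# Minimal usage sketch for sentence mode (illustration only: assumes pages_data
# items carry "page_num" and "content" keys, as the loop above expects; the
# sample text and "demo.pdf" name are made up):
#
#   pages = [{"page_num": 1, "content": "Aspirin reduces fever. It also thins blood."}]
#   demo_chunks = chunk_combined_content(pages, "demo.pdf", mode="sentence")
#   # -> [{"chunk_id": "1_2", "page_num": 1,
#   #      "content": "Aspirin reduces fever. It also thins blood.",
#   #      "pdf_file": "demo.pdf", "images": []}]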
# ---------------- MERGE TEXT + IMAGES ----------------
def merge_text_and_images_with_captions(chunks, image_map, page_snapshot_map,
                                         # caption_map
                                         ):
    """
    Add extracted images, page snapshots, and captions to chunks.
    """
    for chunk in chunks:
        page_num = chunk["page_num"]

        # Add inline figure images
        chunk["images"] = [path.replace("\\", "/") for path in image_map.get(page_num, [])]

        # Add page snapshot
        chunk["page_snapshot"] = page_snapshot_map.get(page_num)

        # Add captions (list of {image_path, caption_text})
        # chunk["captions"] = caption_map.get(page_num, [])

    return chunks
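# Assumed shapes of the maps consumed above (inferred from the .get(page_num)
# lookups; pdf_utils is not shown here, so treat the paths as illustrative):
#
#   image_map         = {4: ["Artifacts/images/page4_img0.png"], ...}   # page -> figure paths
#   page_snapshot_map = {4: "Artifacts/page_images/page4.png", ...}     # page -> snapshot path
#   # caption_map     = {4: [{"image_path": "...", "caption_text": "..."}], ...}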
# ---------------- SAVE JSON ----------------
def save_chunks_to_json(final_data, output_path=OUTPUT_JSON_PATH):
    """
    Save the final chunked data with tables + images to JSON.
    """
    print(output_path)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_data, f, ensure_ascii=False, indent=4)
    print(f"Saved chunks metadata to: {output_path}")
if __name__ == "__main__":
    # Use the module-level pdf_path (built from the project root) so the script
    # works regardless of the current working directory
    combined_pages, logs = extract_text_with_tables(pdf_path)
    text_chunks = chunk_combined_content(combined_pages, pdf_path, chunk_size=600, mode="sentence")

    image_map = extract_images_pymupdf(pdf_path, IMAGE_PATH)
    # caption_map = extract_images_with_captions(pdf_path)
    page_snapshot_map = extract_full_page_images(pdf_path)

    final_data = merge_text_and_images_with_captions(text_chunks, image_map, page_snapshot_map,
                                                     # caption_map
                                                     )
    save_chunks_to_json(final_data, OUTPUT_JSON_PATH)