Spaces:

MohitG012
/

Medical_Bot_Agentic_AI

Sleeping

Medical_Bot_Agentic_AI / Src /rag /preprocess.py

MohitGupta41

Initial Commit

1713970 6 months ago

5.34 kB

	import os
	import json
	import nltk
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from pdf_utils import (
	extract_text_with_tables,
	extract_images_pymupdf,
	extract_images_with_captions,
	extract_full_page_images
	)

	# Anchor all paths on the directory two levels above the folder that holds
	# this file (intended to be the project root), so they do not depend on the
	# current working directory.
	BASE_DIR = os.path.abspath(
	    os.path.join(os.path.dirname(__file__), "../..")
	)

	# Every pipeline artifact lives under <BASE_DIR>/Artifacts.
	DATA_DIR = os.path.join(BASE_DIR, "Artifacts")

	# Input: the source PDF.
	RAW_PDF_DIR = os.path.join(DATA_DIR, "raw_pdf")
	pdf_path = os.path.join(RAW_PDF_DIR, "medical_book.pdf")

	# Output: chunk metadata JSON.
	PROCESSED_TEXT_DIR = os.path.join(DATA_DIR, "processed_text")
	OUTPUT_JSON_PATH = os.path.join(PROCESSED_TEXT_DIR, "chunks_metadata.json")

	# Output: embeddings and the image folders.
	EMBEDDINGS_DIR = os.path.join(DATA_DIR, "embeddings")
	PAGE_IMAGES_DIR = os.path.join(DATA_DIR, "page_images")
	IMAGE_PATH = os.path.join(DATA_DIR, "images")
	IMAGE_CAPTIONS_DIR = os.path.join(DATA_DIR, "image_with_captions")

	# Only the processed-text folder is created at import time; it must exist
	# before the chunk metadata JSON can be written.
	os.makedirs(PROCESSED_TEXT_DIR, exist_ok=True)

	# ---------------- CHUNKING ----------------
	def chunk_combined_content(pages_data, pdf_path, chunk_size=800, overlap=50,
	                           mode="recursive", merge_threshold=50):
	    """
	    Chunk combined text (text + tables) into smaller parts.

	    Args:
	        pages_data (list): Extracted pages; each item is indexed with the
	            keys "page_num" and "content".
	        pdf_path (str): Path to the PDF file; only its base name is stored.
	        chunk_size (int): Size of each chunk (only for recursive).
	        overlap (int): Overlap between chunks (only for recursive).
	        mode (str): "recursive" or "sentence".
	        merge_threshold (int): Only for sentence mode. A sentence is appended
	            to the pending chunk while the combined length stays below this
	            many characters; otherwise the pending chunk is emitted and the
	            sentence starts a new one.

	    Returns:
	        list: List of chunk metadata dictionaries with the keys "chunk_id",
	        "page_num", "content", "pdf_file" and "images".

	    Raises:
	        ValueError: If ``mode`` is neither "recursive" nor "sentence".
	    """
	    # Reject a bad mode before doing any work.
	    if mode not in ("recursive", "sentence"):
	        raise ValueError("Invalid mode. Use 'recursive' or 'sentence'.")

	    # The file name is identical for every chunk, so compute it once.
	    pdf_name = os.path.basename(pdf_path)
	    formatted_chunks = []

	    if mode == "recursive":
	        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
	        for page in pages_data:
	            page_num = page["page_num"]
	            for i, chunk in enumerate(splitter.split_text(page["content"])):
	                formatted_chunks.append(_build_chunk_record(page_num, i, chunk, pdf_name))
	        return formatted_chunks

	    # Sentence mode: split into sentences, merging short neighbours.
	    for page in pages_data:
	        page_num = page["page_num"]
	        sentences = nltk.sent_tokenize(page["content"])
	        buffer = ""
	        for i, sentence in enumerate(sentences):
	            if len(buffer) + len(sentence) < merge_threshold:
	                # The leading space added to an empty buffer is removed by
	                # strip() when the record is built.
	                buffer += " " + sentence
	                continue
	            if buffer:
	                # The id uses the index of the sentence that triggered the
	                # flush, which keeps ids unique within a page.
	                formatted_chunks.append(_build_chunk_record(page_num, i, buffer, pdf_name))
	            buffer = sentence
	        if buffer:
	            # Emit whatever is still pending at the end of the page.
	            formatted_chunks.append(
	                _build_chunk_record(page_num, len(sentences), buffer, pdf_name)
	            )

	    return formatted_chunks


	def _build_chunk_record(page_num, index, text, pdf_name):
	    """Assemble the metadata dictionary stored for one chunk."""
	    return {
	        "chunk_id": f"{page_num}_{index}",
	        "page_num": page_num,
	        "content": text.strip(),
	        "pdf_file": pdf_name,
	        "images": [],
	    }

	# ---------------- MERGE TEXT + IMAGES ----------------
	def merge_text_and_images_with_captions(chunks, image_map, page_snapshot_map):
	    """
	    Attach per-page image paths and the page snapshot to every chunk.

	    Each chunk dictionary is updated in place: "images" receives the paths
	    listed for the chunk's page in ``image_map`` (backslashes converted to
	    forward slashes, empty list when the page has no entry) and
	    "page_snapshot" receives the page's entry in ``page_snapshot_map``
	    (``None`` when missing). Caption merging is currently disabled.

	    Returns:
	        list: The same ``chunks`` list that was passed in.
	    """
	    for entry in chunks:
	        page = entry["page_num"]

	        # Inline figure images, with forward-slash separators.
	        normalized_paths = []
	        for raw_path in image_map.get(page, []):
	            normalized_paths.append(raw_path.replace("\\", "/"))
	        entry["images"] = normalized_paths

	        # Full-page snapshot for the chunk's page, if one was produced.
	        entry["page_snapshot"] = page_snapshot_map.get(page)

	    return chunks

	# ---------------- SAVE JSON ----------------
	def save_chunks_to_json(final_data, output_path=OUTPUT_JSON_PATH):
	    """
	    Save the final chunked data with tables + images to JSON.

	    Args:
	        final_data: JSON-serialisable chunk data to write.
	        output_path (str): Destination file; defaults to OUTPUT_JSON_PATH.
	            The file is overwritten if it already exists.
	    """
	    # Only the default output folder is created at import time, so make sure
	    # the parent of a caller-supplied path exists as well. A bare file name
	    # has no parent component and needs no directory.
	    parent_dir = os.path.dirname(output_path)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)

	    # ensure_ascii=False keeps non-ASCII characters readable in the file.
	    with open(output_path, "w", encoding="utf-8") as f:
	        json.dump(final_data, f, ensure_ascii=False, indent=4)
	    print(f"Saved chunks metadata to: {output_path}")

	if __name__ == "__main__":
	    # Use the module-level ``pdf_path`` (absolute, built from BASE_DIR) rather
	    # than a path relative to the current working directory, so the script
	    # reads its input from the same Artifacts tree it writes its output to.

	    # Extract per-page text + tables; the second return value is not used here.
	    combined_pages, _logs = extract_text_with_tables(pdf_path)

	    # Sentence mode does not use chunk_size/overlap, so none are passed.
	    text_chunks = chunk_combined_content(combined_pages, pdf_path, mode="sentence")

	    # Collect the image data that is merged into the chunks below.
	    image_map = extract_images_pymupdf(pdf_path, IMAGE_PATH)
	    page_snapshot_map = extract_full_page_images(pdf_path)
	    # NOTE: caption extraction (extract_images_with_captions) is currently
	    # disabled, so no captions are merged into the chunks.

	    final_data = merge_text_and_images_with_captions(text_chunks, image_map, page_snapshot_map)
	    save_chunks_to_json(final_data, OUTPUT_JSON_PATH)