Buckets:

lablab-ai-amd-developer-hackathon
/

OncoAgent

Files

xet

lablab-ai-amd-developer-hackathon/OncoAgent / rag_engine /advanced_ingestion.py

MaximoLopezChenlo

about 2 months ago

download

raw

5.3 kB

	import os
	import json
	import re
	import pymupdf4llm
	import networkx as nx
	import logging
	from typing import List, Dict, Optional

	# Configure the root logger when this module is loaded so that INFO-level
	# progress messages emitted below are not filtered out.
	logging.basicConfig(level=logging.INFO)
	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	class AdvancedOncoIngestor:
	    """
	    Advanced Ingestor for SOTA RAG.

	    - Uses pymupdf4llm for Markdown table preservation.
	    - Builds a basic Knowledge Graph (GraphRAG) using NetworkX.

	    Attributes:
	        output_dir: Directory that receives one ``<document>.json`` chunk file
	            per ingested PDF.
	        graph_path: File the knowledge graph is written to by ``save_graph``.
	        graph: Undirected ``networkx.Graph`` whose nodes are the keywords
	            below and whose edges record co-occurrence inside one chunk.
	        drugs, mutations, conditions: Lowercase keyword lists used for entity
	            detection. They are read on every call, so they may be extended
	            after construction.
	    """

	    def __init__(self, output_dir: str = "data/processed/sota_chunks", graph_path: str = "data/processed/knowledge_graph.gml"):
	        """
	        Creates the output directory and an empty knowledge graph.

	        Args:
	            output_dir: Directory for the per-document chunk JSON files.
	            graph_path: Destination file for the knowledge graph.
	        """
	        self.output_dir = output_dir
	        self.graph_path = graph_path
	        self.graph = nx.Graph()

	        # exist_ok avoids the check-then-create race; an empty string means
	        # "current directory", which needs no creation.
	        if output_dir:
	            os.makedirs(output_dir, exist_ok=True)

	        # Keywords for entity extraction (Basic regex-based GraphRAG)
	        self.drugs = ["pembrolizumab", "nivolumab", "erlotinib", "afatinib", "osimertinib", "gefitinib", "alectinib", "brigatinib", "lorlatinib", "trastuzumab", "pertuzumab", "lapatinib", "neratinib", "t-dm1", "paclitaxel", "docetaxel", "carboplatin", "cisplatin", "gemcitabine", "pemetrexed", "bevacizumab", "ramucirumab", "atezolizumab", "durvalumab"]
	        self.mutations = ["egfr", "alk", "ros1", "braf", "kras", "nras", "her2", "pd-l1", "msi-h", "dmmr", "pik3ca", "esr1", "brca1", "brca2", "ret", "met", "ntrk"]
	        self.conditions = ["nsclc", "sclc", "breast cancer", "colon cancer", "rectal cancer", "melanoma", "adenocarcinoma", "squamous cell carcinoma"]

	    @staticmethod
	    def _find_terms(terms: List[str], text_lower: str, allow_plural: bool = False) -> List[str]:
	        """
	        Returns the terms that occur in ``text_lower`` as whole tokens.

	        A plain substring test reports "met" for "metastatic"/"pemetrexed",
	        "ret" for "retrospective" and "sclc" for every "nsclc", so each term
	        must not be preceded by a letter or digit nor followed by a letter.
	        Trailing digits are still accepted so family members such as "ntrk1"
	        count as "ntrk".

	        Args:
	            terms: Lowercase keywords to look for.
	            text_lower: Text to search, already lowercased.
	            allow_plural: When True a single trailing "s" is tolerated
	                (e.g. "melanomas" for "melanoma").

	        Returns:
	            The matching terms, in the order they appear in ``terms``.
	        """
	        plural = "s?" if allow_plural else ""
	        matches = []
	        for term in terms:
	            pattern = rf"(?<![a-z0-9]){re.escape(term)}{plural}(?![a-z])"
	            if re.search(pattern, text_lower):
	                matches.append(term)
	        return matches

	    @staticmethod
	    def _split_markdown(md_text: str, source: str) -> List[Dict[str, str]]:
	        """
	        Splits Markdown into chunks, one per header-delimited section.

	        Any line starting with "#" opens a new section; text before the first
	        header is filed under "Intro". Header lines themselves are not part of
	        the chunk content. Sections whose content is empty or whitespace-only
	        (e.g. two consecutive headers) produce no chunk.

	        Args:
	            md_text: Markdown text of one document.
	            source: Value stored under the "source" key of every chunk.

	        Returns:
	            A list of ``{"header", "content", "source"}`` dictionaries in
	            document order.
	        """
	        sections = [("Intro", [])]
	        for line in md_text.split("\n"):
	            if line.startswith("#"):
	                sections.append((line.strip("# ").strip(), []))
	            else:
	                sections[-1][1].append(line)

	        chunks = []
	        for header, lines in sections:
	            content = "\n".join(lines)
	            if content.strip():
	                chunks.append({
	                    "header": header,
	                    "content": content,
	                    "source": source
	                })
	        return chunks

	    def extract_and_graph(self, pdf_path: str):
	        """
	        Converts PDF to Markdown and updates the Knowledge Graph.

	        The Markdown is split into header-delimited chunks, every chunk is fed
	        to the graph, and the chunk list is written to
	        ``<output_dir>/<pdf name without extension>.json``.

	        Args:
	            pdf_path: Path of the PDF to ingest.
	        """
	        filename = os.path.basename(pdf_path)
	        logger.info(f"⏳ Processing {filename} with SOTA Markdown extraction...")

	        # 1. Convert PDF to Markdown (preserves tables!)
	        md_text = pymupdf4llm.to_markdown(pdf_path)

	        # 2. Simple Semantic Chunking (split by headers)
	        chunks = self._split_markdown(md_text, filename)
	        for chunk in chunks:
	            self._update_graph(chunk["content"], filename)

	        # 3. Save chunks
	        # Drop only a trailing ".pdf" (any case); str.replace would also eat
	        # ".pdf" in the middle of the name and miss ".PDF".
	        stem, ext = os.path.splitext(filename)
	        if ext.lower() != ".pdf":
	            stem = filename
	        output_path = os.path.join(self.output_dir, f"{stem}.json")
	        with open(output_path, 'w', encoding='utf-8') as f:
	            json.dump(chunks, f, ensure_ascii=False, indent=4)

	        logger.info(f"✅ Saved {len(chunks)} Markdown chunks for {filename}")

	    def _update_graph(self, text: str, source: str):
	        """
	        Updates the NetworkX graph by extracting clinical entities.

	        Every drug, mutation and condition keyword found in ``text`` becomes a
	        node with a ``type`` attribute. Edges are added for each co-occurring
	        pair: drug-mutation ("targets"), drug-condition ("treats") and
	        mutation-condition ("associated_with"). Re-adding an existing edge
	        overwrites its attributes, so an edge keeps the most recent source.

	        Args:
	            text: Chunk content to scan.
	            source: Name of the document the chunk came from.
	        """
	        text_lower = text.lower()

	        found_drugs = self._find_terms(self.drugs, text_lower)
	        found_mutations = self._find_terms(self.mutations, text_lower)
	        found_conditions = self._find_terms(self.conditions, text_lower, allow_plural=True)

	        # Type every node up front. Doing it only inside the drug loop left
	        # mutation/condition nodes without a type when a chunk had no drug.
	        for d in found_drugs:
	            self.graph.add_node(d, type="drug")
	        for m in found_mutations:
	            self.graph.add_node(m, type="mutation")
	        for c in found_conditions:
	            self.graph.add_node(c, type="condition")

	        # NOTE(review): GML reserves the edge keys "source"/"target" for the
	        # endpoints and networkx's writer appears to skip a user attribute
	        # with that name, so provenance is additionally stored under
	        # "source_doc" to survive save_graph() - confirm against write_gml.
	        for d in found_drugs:
	            for m in found_mutations:
	                self.graph.add_edge(d, m, relation="targets", source=source, source_doc=source)
	            for c in found_conditions:
	                self.graph.add_edge(d, c, relation="treats", source=source, source_doc=source)

	        for m in found_mutations:
	            for c in found_conditions:
	                self.graph.add_edge(m, c, relation="associated_with", source=source, source_doc=source)

	    def save_graph(self):
	        """
	        Saves the graph to disk.

	        The graph is written in GML format to ``self.graph_path``; the parent
	        directory is created first if it does not exist.
	        """
	        parent_dir = os.path.dirname(self.graph_path)
	        if parent_dir:
	            os.makedirs(parent_dir, exist_ok=True)

	        # Save as GML for better compatibility with graph tools, or JSON for simplicity
	        nx.write_gml(self.graph, self.graph_path)
	        logger.info(f"🕸️ Knowledge Graph saved to {self.graph_path} ({len(self.graph.nodes)} nodes, {len(self.graph.edges)} edges)")

	if __name__ == "__main__":
	    # Script entry point: ingest the selected clinical guideline PDFs found
	    # anywhere under the guides directory, then persist the knowledge graph.
	    ingestor = AdvancedOncoIngestor()

	    guides_dir = "data/clinical_guides"
	    target_files = ["nscl.pdf", "breast.pdf", "colon.pdf", "hcc.pdf"]

	    if not os.path.exists(guides_dir):
	        logger.warning(f"Directory {guides_dir} not found.")
	    else:
	        for folder, _subdirs, names in os.walk(guides_dir):
	            for name in names:
	                # Only the whitelisted guides, never patient-facing versions.
	                if name not in target_files or "patient" in name.lower():
	                    continue
	                ingestor.extract_and_graph(os.path.join(folder, name))

	        # The graph is written even when no target file was found.
	        ingestor.save_graph()

Xet Storage Details

Size: 5.3 kB
Xet hash: 9720f4744cb99a398f5e0266b1fae6a90c74fa1ff1588895742598624eea84f5

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.