Spaces:

fierce74
/

Microbiome-Immunotherapy-CDS

Running

App Files Files Community

Microbiome-Immunotherapy-CDS / src /section_generators.py

fierce74

Add application file

7529164 15 days ago

raw

history blame contribute delete

19.8 kB

	"""
	Section generation functions for each report section
	"""

	import logging
	from typing import Dict, Optional, List

	from .models import get_medgemma
	from .rag import RAGRetriever
	from .prompts import build_prompt
	from . import config

	logger = logging.getLogger(__name__)


	class SectionGenerator:
	    """Generate the individual sections of the microbiome-immunotherapy report.

	    Sections 1-5 retrieve evidence through ``self.rag``, build a prompt with
	    ``build_prompt`` and ask ``self.llm`` for the section body; each of them
	    returns ``None`` when the section is omitted.  Section 0 (preamble) is
	    pure string formatting and section 6 uses no retrieved evidence.
	    Citations of every retrieved chunk are accumulated in
	    ``self.all_citations`` and exposed through ``get_all_citations``.
	    """

	    def __init__(self):
	        self.llm = get_medgemma()
	        self.rag = RAGRetriever()
	        self.all_citations = {}  # Map citation -> title across all sections

	    # ------------------------------------------------------------------
	    # Private helpers shared by several sections
	    # ------------------------------------------------------------------

	    def _track_citations(self, chunks) -> None:
	        """Record the (citation, title) pairs of ``chunks`` in ``self.all_citations``."""
	        for citation, title in self.rag.get_unique_citation_metadata(chunks):
	            self.all_citations[citation] = title

	    @staticmethod
	    def _detected_taxa(key_bacteria: Dict) -> List[tuple]:
	        """Return ``(display name, abundance)`` for each taxon with abundance > 0."""
	        return [
	            (taxon.replace("_", " ").title(), abundance)
	            for taxon, abundance in key_bacteria.items()
	            if abundance is not None and abundance > 0
	        ]

	    @staticmethod
	    def _has_scfa(metabolites: Dict) -> bool:
	        """Return True when at least one SCFA measurement is not ``None``."""
	        return any(value is not None for value in metabolites["scfa"].values())

	    @staticmethod
	    def _has_confounders(patient_data: Dict) -> bool:
	        """Return True when any confounding factor reported in section 4 is present."""
	        meds = patient_data["medications"]
	        prior = patient_data["prior_treatments"]
	        return bool(
	            meds["antibiotic_history"]["recent_antibiotics"]
	            or meds["ppi_use"]["currently_on_ppi"]
	            or prior["chemotherapy"]["received"]
	            or prior["prior_immunotherapy"]["received"]
	            # Truthiness instead of len(): "comorbidities" may be present
	            # but None, which used to raise TypeError here.
	            or patient_data.get("comorbidities")
	        )

	    @classmethod
	    def _format_metabolite_data(cls, metabolites: Dict) -> str:
	        """Render the available metabolite measurements as prompt text (section 2)."""
	        lines = []

	        if cls._has_scfa(metabolites):
	            lines.append("Short-Chain Fatty Acids:")
	            scfa = metabolites["scfa"]
	            if scfa["butyrate_uM"] is not None:
	                lines.append(f"- Butyrate: {scfa['butyrate_uM']} μM")
	            if scfa["propionate_uM"] is not None:
	                lines.append(f"- Propionate: {scfa['propionate_uM']} μM")
	            if scfa["acetate_uM"] is not None:
	                lines.append(f"- Acetate: {scfa['acetate_uM']} μM")

	        if metabolites["bile_acids_available"]:
	            lines.append("Bile Acids: Analysis available")

	        if metabolites["tryptophan_metabolites_available"]:
	            lines.append("Tryptophan Metabolites: Analysis available")

	        return "\n".join(lines)

	    @staticmethod
	    def _format_confounders(patient_data: Dict) -> str:
	        """Render the confounding factors that are present as prompt text (section 4)."""
	        meds = patient_data["medications"]
	        prior = patient_data["prior_treatments"]
	        lines = []

	        # Antibiotic history
	        antibiotics = meds["antibiotic_history"]
	        if antibiotics["recent_antibiotics"]:
	            lines.append("Antibiotic Exposure:")
	            lines.extend(
	                f"- {exp['antibiotic_name']} ({exp['antibiotic_class']}): "
	                f"{exp['start_date']} to {exp['end_date']} "
	                f"({exp['days_before_ici']} days before ICI start)"
	                for exp in antibiotics["exposures"]
	            )

	        # PPI use
	        ppi = meds["ppi_use"]
	        if ppi["currently_on_ppi"]:
	            lines.append("Proton Pump Inhibitor Use:")
	            lines.append(f"- {ppi['ppi_name']}, duration: {ppi['duration_months']} months")

	        # Prior chemotherapy
	        chemo = prior["chemotherapy"]
	        if chemo["received"]:
	            lines.append("Prior Chemotherapy:")
	            lines.append(f"- Regimens: {', '.join(chemo['regimens'])}")
	            lines.append(f"- Response: {chemo['response']}")

	        # Prior immunotherapy
	        prior_ici = prior["prior_immunotherapy"]
	        if prior_ici["received"]:
	            lines.append("Prior Immunotherapy:")
	            lines.append(f"- Drugs: {', '.join(prior_ici['drugs'])}")
	            lines.append(f"- Response: {prior_ici['response']}")

	        # Comorbidities
	        comorbidities = patient_data.get("comorbidities")
	        if comorbidities:
	            lines.append(f"Comorbidities: {', '.join(comorbidities)}")

	        return "\n".join(lines)

	    @classmethod
	    def _format_data_quality(cls, patient_data: Dict) -> str:
	        """Render sequencing/data-quality facts and missing metabolite panels (section 6)."""
	        microbiome = patient_data["microbiome"]
	        data_quality = microbiome["data_quality"]
	        metabolites = microbiome["metabolites"]

	        lines = [
	            f"Sequencing Method: {microbiome['sequencing_method']}",
	            f"Data Completeness: {data_quality['completeness']}",
	            f"Data Source: {data_quality['source']}",
	        ]

	        if data_quality.get("limitations"):
	            lines.append(f"Limitations: {', '.join(data_quality['limitations'])}")

	        # Note missing metabolite data
	        missing = []
	        if not cls._has_scfa(metabolites):
	            missing.append("Short-chain fatty acids")
	        if not metabolites["bile_acids_available"]:
	            missing.append("Bile acids")
	        if not metabolites["tryptophan_metabolites_available"]:
	            missing.append("Tryptophan metabolites")

	        if missing:
	            lines.append(f"Missing Metabolite Data: {', '.join(missing)}")

	        return "\n".join(lines)

	    # ------------------------------------------------------------------
	    # Report sections
	    # ------------------------------------------------------------------

	    def generate_preamble(self, patient_data: Dict) -> str:
	        """
	        Generate Section 0: Clinical Preamble (auto-populated, no LLM).

	        Reads the "patient", "cancer", "immunotherapy" and "microbiome"
	        entries of ``patient_data`` and returns the formatted preamble text.
	        The ACT details block is added only when the therapy type is "ACT"
	        and a non-empty "act_details" entry exists.
	        """
	        p = patient_data["patient"]
	        c = patient_data["cancer"]
	        i = patient_data["immunotherapy"]
	        m = patient_data["microbiome"]

	        # Format metastases
	        metastases_str = ", ".join(c["metastases"]) if c["metastases"] else "none"

	        # Determine therapy type (defaults to ICI when not specified)
	        therapy_type = i.get("therapy_type", "ICI")

	        # NOTE: continuation lines of the report literals below are kept
	        # exactly as they appear in the file (at its base column);
	        # re-indenting them would change the emitted report text.
	        preamble = f"""# Microbiome-Immunotherapy Clinical Report

	Patient ID: {p['id']}
	Age: {p['age']} years
	Gender: {p['gender']}

	## Clinical Context

	Cancer Diagnosis: {c['stage']} {c['type']}"""

	        if c.get('subtype'):
	            preamble += f" ({c['subtype']})"

	        preamble += f"""
	Primary Site: {c['primary_site']}
	Metastases: {metastases_str}
	Diagnosis Date: {c['diagnosis_date']}

	Tumor Biomarkers:
	- PD-L1 Expression: {c['biomarkers']['pdl1_expression']}
	- Tumor Mutational Burden (TMB): {c['biomarkers']['tmb']}
	- Microsatellite Instability (MSI): {c['biomarkers']['msi_status']}

	## Planned Immunotherapy

	Therapy Type: {therapy_type}
	Drug: {i['drug_name']} ({i['drug_class']})
	Treatment Setting: {i['treatment_setting']}
	Line of Therapy: {i['line_of_therapy']}
	Planned Start Date: {i['planned_start_date']}
	"""

	        # Add ACT-specific details if present
	        if therapy_type == "ACT" and i.get("act_details"):
	            act = i["act_details"]
	            preamble += f"""
	ACT Details:
	- ACT Type: {act.get('act_type', 'N/A')}
	- Target Antigen: {act.get('target_antigen', 'N/A')}
	- Cell Source: {act.get('cell_source', 'N/A')}
	- Preconditioning Regimen: {act.get('preconditioning_regimen', 'N/A')}
	- T-Cell Harvest Date: {act.get('t_cell_harvest_date', 'N/A')}
	- Expected CRS Risk: {act.get('expected_crs_risk', 'N/A')}
	- Expected Neurotoxicity Risk: {act.get('expected_neurotoxicity_risk', 'N/A')}
	"""

	        preamble += f"""
	## Microbiome Profile Overview

	Sample Date: {m['sample_date']}
	Sequencing Method: {m['sequencing_method']}

	This report summarizes gut microbiome findings relevant to anticipated immunotherapy response based on current evidence from peer-reviewed literature.

	---
	"""
	        return preamble

	    def generate_section_1(self, patient_data: Dict) -> Optional[str]:
	        """
	        Generate Section 1: Microbiome Diversity & Composition Profile.

	        Returns the markdown section, or ``None`` when no evidence chunks
	        were retrieved.
	        """
	        logger.info("Generating Section 1: Microbiome Diversity & Composition Profile")

	        # Retrieve evidence
	        chunks = self.rag.retrieve_for_section_1(patient_data)
	        if not chunks:
	            logger.warning("No evidence retrieved for Section 1, omitting section")
	            return None

	        self._track_citations(chunks)
	        evidence = self.rag.format_chunks_for_llm(chunks)

	        # Prepare detected taxa string
	        microbiome = patient_data["microbiome"]
	        taxa_lines = [
	            f"- {name}: {abundance}%"
	            for name, abundance in self._detected_taxa(microbiome["key_bacteria"])
	        ]
	        detected_taxa_str = "\n".join(taxa_lines) if taxa_lines else "None detected above threshold"

	        # Build prompt
	        diversity = microbiome["diversity"]
	        prompt = build_prompt(
	            "section_1",
	            patient_data,
	            evidence,
	            cancer_stage=patient_data["cancer"]["stage"],
	            shannon_index=diversity["shannon_index"],
	            simpson_index=diversity["simpson_index"],
	            observed_species=diversity["observed_species"],
	            detected_taxa=detected_taxa_str,
	        )

	        # Generate
	        content = self.llm.generate(prompt, max_new_tokens=config.SECTION_MAX_NEW_TOKENS["section_1"])

	        return f"## 1. Microbiome Diversity & Composition Profile\n\n{content}\n\n"

	    def generate_section_2(self, patient_data: Dict) -> Optional[str]:
	        """
	        Generate Section 2: Metabolite Landscape.

	        Returns ``None`` when no metabolite data is available or when no
	        evidence chunks were retrieved.
	        """
	        logger.info("Generating Section 2: Metabolite Landscape")

	        # Check if metabolite data is available
	        metabolites = patient_data["microbiome"]["metabolites"]
	        has_metabolites = (
	            self._has_scfa(metabolites)
	            or metabolites["bile_acids_available"]
	            or metabolites["tryptophan_metabolites_available"]
	        )
	        if not has_metabolites:
	            logger.info("No metabolite data available, omitting Section 2")
	            return None

	        # Retrieve evidence
	        chunks = self.rag.retrieve_for_section_2(patient_data)
	        if not chunks:
	            logger.warning("No evidence retrieved for Section 2, omitting section")
	            return None

	        self._track_citations(chunks)
	        evidence = self.rag.format_chunks_for_llm(chunks)

	        # Build prompt
	        prompt = build_prompt(
	            "section_2",
	            patient_data,
	            evidence,
	            metabolite_data=self._format_metabolite_data(metabolites),
	        )

	        # Generate
	        content = self.llm.generate(prompt, max_new_tokens=config.SECTION_MAX_NEW_TOKENS["section_2"])

	        return f"## 2. Metabolite Landscape\n\n{content}\n\n"

	    def generate_section_3(self, patient_data: Dict) -> Optional[str]:
	        """
	        Generate Section 3: Drug–Microbiome Interaction Outlook.

	        Supports both ICI and ACT therapies; the prompt fields and the
	        section title depend on the therapy type.  Returns ``None`` when no
	        evidence chunks were retrieved.
	        """
	        logger.info("Generating Section 3: Drug–Microbiome Interaction Outlook")

	        # Retrieve evidence
	        chunks = self.rag.retrieve_for_section_3(patient_data)
	        if not chunks:
	            logger.warning("No evidence retrieved for Section 3, omitting section")
	            return None

	        self._track_citations(chunks)
	        evidence = self.rag.format_chunks_for_llm(chunks)

	        microbiome = patient_data["microbiome"]
	        diversity = microbiome["diversity"]

	        # Key taxa summary: the five most abundant detected taxa
	        ranked_taxa = sorted(
	            self._detected_taxa(microbiome["key_bacteria"]),
	            key=lambda pair: pair[1],
	            reverse=True,
	        )
	        # Fall back to an explicit marker rather than handing the prompt an
	        # empty string when nothing was detected.
	        key_taxa_summary = (
	            ", ".join(f"{name} ({abundance}%)" for name, abundance in ranked_taxa[:5])
	            or "none detected"
	        )

	        # Metabolite summary
	        metabolites = microbiome["metabolites"]
	        metabolite_flags = []
	        if self._has_scfa(metabolites):
	            metabolite_flags.append("SCFAs measured")
	        if metabolites["bile_acids_available"]:
	            metabolite_flags.append("bile acids available")
	        if metabolites["tryptophan_metabolites_available"]:
	            metabolite_flags.append("tryptophan metabolites available")
	        metabolite_summary = ", ".join(metabolite_flags) if metabolite_flags else "limited metabolite data"

	        # Determine therapy type (defaults to ICI when not specified)
	        immunotherapy = patient_data["immunotherapy"]
	        therapy_type = immunotherapy.get("therapy_type", "ICI")

	        # Therapy-specific prompt fields
	        if therapy_type == "ACT":
	            # "or {}" also covers an explicit None value, which .get()'s
	            # default does not.
	            act_details = immunotherapy.get("act_details") or {}
	            therapy_fields = {
	                "act_type": act_details.get("act_type", "CAR-T"),
	                "target_antigen": act_details.get("target_antigen", "CD19"),
	                "cell_source": act_details.get("cell_source", "autologous"),
	                "crs_risk": act_details.get("expected_crs_risk", "unknown"),
	                "neurotoxicity_risk": act_details.get("expected_neurotoxicity_risk", "unknown"),
	            }
	            section_title = "3. Microbiome–ACT Interaction Outlook"
	        else:  # ICI
	            biomarkers = patient_data["cancer"]["biomarkers"]
	            therapy_fields = {
	                "pdl1": biomarkers["pdl1_expression"],
	                "tmb": biomarkers["tmb"],
	                "msi": biomarkers["msi_status"],
	            }
	            section_title = "3. Drug–Microbiome Interaction Outlook"

	        prompt = build_prompt(
	            "section_3",
	            patient_data,
	            evidence,
	            cancer_stage=patient_data["cancer"]["stage"],
	            **therapy_fields,
	            line_of_therapy=immunotherapy["line_of_therapy"],
	            shannon_index=diversity["shannon_index"],
	            simpson_index=diversity["simpson_index"],
	            key_taxa_summary=key_taxa_summary,
	            metabolite_summary=metabolite_summary,
	        )

	        # Generate
	        content = self.llm.generate(prompt, max_new_tokens=config.SECTION_MAX_NEW_TOKENS["section_3"])

	        return f"## {section_title}\n\n{content}\n\n"

	    def generate_section_4(self, patient_data: Dict) -> Optional[str]:
	        """
	        Generate Section 4: Confounding Factors.

	        Returns ``None`` when no confounding factor is present or when no
	        evidence chunks were retrieved.
	        """
	        logger.info("Generating Section 4: Confounding Factors")

	        # Check if any confounding factors are present
	        if not self._has_confounders(patient_data):
	            logger.info("No confounding factors present, omitting Section 4")
	            return None

	        # Retrieve evidence
	        chunks = self.rag.retrieve_for_section_4(patient_data)
	        if not chunks:
	            logger.warning("No evidence retrieved for Section 4, omitting section")
	            return None

	        self._track_citations(chunks)
	        evidence = self.rag.format_chunks_for_llm(chunks)

	        # Build prompt
	        prompt = build_prompt(
	            "section_4",
	            patient_data,
	            evidence,
	            confounding_data=self._format_confounders(patient_data),
	        )

	        # Generate
	        content = self.llm.generate(prompt, max_new_tokens=config.SECTION_MAX_NEW_TOKENS["section_4"])

	        return f"## 4. Confounding Factors\n\n{content}\n\n"

	    def generate_section_5(self, patient_data: Dict) -> Optional[str]:
	        """
	        Generate Section 5: Microbiota-Modulation Intervention Considerations.

	        Returns ``None`` when no intervention evidence was retrieved for any
	        intervention type.
	        """
	        logger.info("Generating Section 5: Microbiota-Modulation Intervention Considerations")

	        # Retrieve evidence for each intervention type
	        intervention_chunks = self.rag.retrieve_for_section_5(patient_data)

	        # Check if any intervention evidence was retrieved
	        if not any(intervention_chunks.values()):
	            logger.warning("No intervention evidence retrieved for Section 5, omitting section")
	            return None

	        # Track citations from all intervention types
	        for chunks in intervention_chunks.values():
	            self._track_citations(chunks)

	        # Format evidence for each intervention type
	        evidence_str = "".join([
	            "## Diet and Prebiotics Evidence\n\n",
	            self.rag.format_chunks_for_llm(intervention_chunks.get("diet", [])),
	            "\n\n## Probiotics Evidence\n\n",
	            self.rag.format_chunks_for_llm(intervention_chunks.get("probiotics", [])),
	        ])

	        # Prepare microbiome summary (first five detected taxa)
	        taxa_names = [
	            name
	            for name, _ in self._detected_taxa(patient_data["microbiome"]["key_bacteria"])
	        ]
	        # Explicit marker instead of a dangling "Detected taxa: " label.
	        taxa_text = ", ".join(taxa_names[:5]) or "none detected"
	        microbiome_summary = f"Detected taxa: {taxa_text}"

	        # Build prompt
	        prompt = build_prompt(
	            "section_5",
	            patient_data,
	            evidence_str,
	            microbiome_summary=microbiome_summary,
	        )

	        # Generate
	        content = self.llm.generate(prompt, max_new_tokens=config.SECTION_MAX_NEW_TOKENS["section_5"])

	        return f"## 5. Microbiota-Modulation Intervention Considerations\n\n{content}\n\n"

	    def generate_section_6(self, patient_data: Dict) -> str:
	        """
	        Generate Section 6: Data Quality & Interpretive Limitations.

	        Uses no RAG evidence: the prompt is built from the data-quality
	        fields only, and the fixed caveats text is placed before the
	        generated content.
	        """
	        logger.info("Generating Section 6: Data Quality & Interpretive Limitations")

	        from .prompts import SECTION_6_FIXED_CAVEATS, SECTION_6_PROMPT

	        # Build prompt (no RAG evidence needed)
	        prompt = SECTION_6_PROMPT.format(data_quality=self._format_data_quality(patient_data))

	        # Generate
	        content = self.llm.generate(prompt, max_new_tokens=config.SECTION_MAX_NEW_TOKENS["section_6"])

	        full_content = f"{SECTION_6_FIXED_CAVEATS}\n\n{content}"
	        return f"## 6. Data Quality & Interpretive Limitations\n\n{full_content}\n\n"

	    def get_all_citations(self) -> List[tuple]:
	        """Return sorted list of all unique (citation, title) tuples used in the report"""
	        return sorted(self.all_citations.items())