Spaces:

fierce74
/

Microbiome-Immunotherapy-CDS

Running

File size: 19,773 Bytes
"""
Section generation functions for each report section
"""

import logging
from typing import Dict, Optional, List

from .models import get_medgemma
from .rag import RAGRetriever
from .prompts import build_prompt
from . import config

logger = logging.getLogger(__name__)


class SectionGenerator:
    """Handles generation of individual report sections"""
    
    def __init__(self):
        """Create the LLM handle, the RAG retriever and an empty citation registry."""
        # Object returned by get_medgemma(); the section methods call .generate() on it.
        self.llm = get_medgemma()
        # Retriever used by the section methods to fetch and format evidence chunks.
        self.rag = RAGRetriever()
        # citation -> title, accumulated across every section generated by this instance
        self.all_citations = dict()
    
    def generate_preamble(self, patient_data: Dict) -> str:
        """
        Generate Section 0: Clinical Preamble (auto-populated, no LLM)
        """
        p = patient_data["patient"]
        c = patient_data["cancer"]
        i = patient_data["immunotherapy"]
        m = patient_data["microbiome"]
        
        # Format metastases
        metastases_str = ", ".join(c["metastases"]) if c["metastases"] else "none"
        
        # Determine therapy type
        therapy_type = i.get("therapy_type", "ICI")
        
        preamble = f"""# Microbiome-Immunotherapy Clinical Report

**Patient ID:** {p['id']}  
**Age:** {p['age']} years  
**Gender:** {p['gender']}

## Clinical Context

**Cancer Diagnosis:** {c['stage']} {c['type']}"""
        
        if c.get('subtype'):
            preamble += f" ({c['subtype']})"
        
        preamble += f"""  
**Primary Site:** {c['primary_site']}  
**Metastases:** {metastases_str}  
**Diagnosis Date:** {c['diagnosis_date']}

**Tumor Biomarkers:**
- PD-L1 Expression: {c['biomarkers']['pdl1_expression']}
- Tumor Mutational Burden (TMB): {c['biomarkers']['tmb']}
- Microsatellite Instability (MSI): {c['biomarkers']['msi_status']}

## Planned Immunotherapy

**Therapy Type:** {therapy_type}  
**Drug:** {i['drug_name']} ({i['drug_class']})  
**Treatment Setting:** {i['treatment_setting']}  
**Line of Therapy:** {i['line_of_therapy']}  
**Planned Start Date:** {i['planned_start_date']}
"""
        
        # Add ACT-specific details if present
        if therapy_type == "ACT" and i.get("act_details"):
            act = i["act_details"]
            preamble += f"""
**ACT Details:**
- ACT Type: {act.get('act_type', 'N/A')}
- Target Antigen: {act.get('target_antigen', 'N/A')}
- Cell Source: {act.get('cell_source', 'N/A')}
- Preconditioning Regimen: {act.get('preconditioning_regimen', 'N/A')}
- T-Cell Harvest Date: {act.get('t_cell_harvest_date', 'N/A')}
- Expected CRS Risk: {act.get('expected_crs_risk', 'N/A')}
- Expected Neurotoxicity Risk: {act.get('expected_neurotoxicity_risk', 'N/A')}
"""
        
        preamble += f"""
## Microbiome Profile Overview

**Sample Date:** {m['sample_date']}  
**Sequencing Method:** {m['sequencing_method']}

This report summarizes gut microbiome findings relevant to anticipated immunotherapy response based on current evidence from peer-reviewed literature.

---
"""
        return preamble
    
    def generate_section_1(self, patient_data: Dict) -> Optional[str]:
        """
        Generate Section 1: Microbiome Diversity & Composition Profile
        """
        logger.info("Generating Section 1: Microbiome Diversity & Composition Profile")
        
        # Retrieve evidence
        chunks = self.rag.retrieve_for_section_1(patient_data)
        
        if not chunks:
            logger.warning("No evidence retrieved for Section 1, omitting section")
            return None
        
        # Track citations
        for citation, title in self.rag.get_unique_citation_metadata(chunks):
            self.all_citations[citation] = title
        
        # Format evidence
        evidence = self.rag.format_chunks_for_llm(chunks)
        
        # Prepare detected taxa string
        key_bacteria = patient_data["microbiome"]["key_bacteria"]
        detected_taxa_lines = []
        for taxon, abundance in key_bacteria.items():
            if abundance is not None and abundance > 0:
                taxon_display = taxon.replace("_", " ").title()
                detected_taxa_lines.append(f"- {taxon_display}: {abundance}%")
        
        detected_taxa_str = "\n".join(detected_taxa_lines) if detected_taxa_lines else "None detected above threshold"
        
        # Build prompt
        diversity = patient_data["microbiome"]["diversity"]
        prompt = build_prompt(
            "section_1",
            patient_data,
            evidence,
            cancer_stage=patient_data["cancer"]["stage"],
            shannon_index=diversity["shannon_index"],
            simpson_index=diversity["simpson_index"],
            observed_species=diversity["observed_species"],
            detected_taxa=detected_taxa_str,
        )
        
        # Generate
        content = self.llm.generate(prompt, max_new_tokens=config.SECTION_MAX_NEW_TOKENS["section_1"])
        
        return f"## 1. Microbiome Diversity & Composition Profile\n\n{content}\n\n"
    
    def generate_section_2(self, patient_data: Dict) -> Optional[str]:
        """
        Generate Section 2: Metabolite Landscape
        """
        logger.info("Generating Section 2: Metabolite Landscape")
        
        # Check if metabolite data is available
        metabolites = patient_data["microbiome"]["metabolites"]
        has_scfa = any(v is not None for v in metabolites["scfa"].values())
        has_metabolites = has_scfa or metabolites["bile_acids_available"] or metabolites["tryptophan_metabolites_available"]
        
        if not has_metabolites:
            logger.info("No metabolite data available, omitting Section 2")
            return None
        
        # Retrieve evidence
        chunks = self.rag.retrieve_for_section_2(patient_data)
        
        if not chunks:
            logger.warning("No evidence retrieved for Section 2, omitting section")
            return None
        
        # Track citations
        for citation, title in self.rag.get_unique_citation_metadata(chunks):
            self.all_citations[citation] = title
        
        # Format evidence
        evidence = self.rag.format_chunks_for_llm(chunks)
        
        # Prepare metabolite data string
        metabolite_lines = []
        
        if has_scfa:
            metabolite_lines.append("**Short-Chain Fatty Acids:**")
            scfa = metabolites["scfa"]
            if scfa["butyrate_uM"] is not None:
                metabolite_lines.append(f"- Butyrate: {scfa['butyrate_uM']} μM")
            if scfa["propionate_uM"] is not None:
                metabolite_lines.append(f"- Propionate: {scfa['propionate_uM']} μM")
            if scfa["acetate_uM"] is not None:
                metabolite_lines.append(f"- Acetate: {scfa['acetate_uM']} μM")
        
        if metabolites["bile_acids_available"]:
            metabolite_lines.append("**Bile Acids:** Analysis available")
        
        if metabolites["tryptophan_metabolites_available"]:
            metabolite_lines.append("**Tryptophan Metabolites:** Analysis available")
        
        metabolite_data_str = "\n".join(metabolite_lines)
        
        # Build prompt
        prompt = build_prompt(
            "section_2",
            patient_data,
            evidence,
            metabolite_data=metabolite_data_str,
        )
        
        # Generate
        content = self.llm.generate(prompt, max_new_tokens=config.SECTION_MAX_NEW_TOKENS["section_2"])
        
        return f"## 2. Metabolite Landscape\n\n{content}\n\n"
    
    def generate_section_3(self, patient_data: Dict) -> Optional[str]:
        """
        Generate Section 3: Drug–Microbiome Interaction Outlook
        Supports both ICI and ACT therapies
        """
        logger.info("Generating Section 3: Drug–Microbiome Interaction Outlook")
        
        # Retrieve evidence
        chunks = self.rag.retrieve_for_section_3(patient_data)
        
        if not chunks:
            logger.warning("No evidence retrieved for Section 3, omitting section")
            return None
        
        # Track citations
        for citation, title in self.rag.get_unique_citation_metadata(chunks):
            self.all_citations[citation] = title
        
        # Format evidence
        evidence = self.rag.format_chunks_for_llm(chunks)
        
        # Prepare summaries
        diversity = patient_data["microbiome"]["diversity"]
        key_bacteria = patient_data["microbiome"]["key_bacteria"]
        
        # Key taxa summary
        detected_taxa = [
            (k.replace("_", " ").title(), v)
            for k, v in key_bacteria.items()
            if v is not None and v > 0
        ]
        detected_taxa.sort(key=lambda x: x[1], reverse=True)
        key_taxa_summary = ", ".join([f"{t} ({a}%)" for t, a in detected_taxa[:5]])
        
        # Metabolite summary
        metabolites = patient_data["microbiome"]["metabolites"]
        metabolite_flags = []
        if any(v is not None for v in metabolites["scfa"].values()):
            metabolite_flags.append("SCFAs measured")
        if metabolites["bile_acids_available"]:
            metabolite_flags.append("bile acids available")
        if metabolites["tryptophan_metabolites_available"]:
            metabolite_flags.append("tryptophan metabolites available")
        metabolite_summary = ", ".join(metabolite_flags) if metabolite_flags else "limited metabolite data"
        
        # Determine therapy type
        therapy_type = patient_data["immunotherapy"].get("therapy_type", "ICI")
        
        # Build prompt based on therapy type
        if therapy_type == "ACT":
            act_details = patient_data["immunotherapy"].get("act_details", {})
            prompt = build_prompt(
                "section_3",
                patient_data,
                evidence,
                cancer_stage=patient_data["cancer"]["stage"],
                act_type=act_details.get("act_type", "CAR-T"),
                target_antigen=act_details.get("target_antigen", "CD19"),
                cell_source=act_details.get("cell_source", "autologous"),
                crs_risk=act_details.get("expected_crs_risk", "unknown"),
                neurotoxicity_risk=act_details.get("expected_neurotoxicity_risk", "unknown"),
                line_of_therapy=patient_data["immunotherapy"]["line_of_therapy"],
                shannon_index=diversity["shannon_index"],
                simpson_index=diversity["simpson_index"],
                key_taxa_summary=key_taxa_summary,
                metabolite_summary=metabolite_summary,
            )
        else:  # ICI
            biomarkers = patient_data["cancer"]["biomarkers"]
            prompt = build_prompt(
                "section_3",
                patient_data,
                evidence,
                cancer_stage=patient_data["cancer"]["stage"],
                pdl1=biomarkers["pdl1_expression"],
                tmb=biomarkers["tmb"],
                msi=biomarkers["msi_status"],
                line_of_therapy=patient_data["immunotherapy"]["line_of_therapy"],
                shannon_index=diversity["shannon_index"],
                simpson_index=diversity["simpson_index"],
                key_taxa_summary=key_taxa_summary,
                metabolite_summary=metabolite_summary,
            )
        
        # Generate
        content = self.llm.generate(prompt, max_new_tokens=config.SECTION_MAX_NEW_TOKENS["section_3"])  # Longer for this section
        
        # Section title varies by therapy type
        if therapy_type == "ACT":
            section_title = "3. Microbiome–ACT Interaction Outlook"
        else:
            section_title = "3. Drug–Microbiome Interaction Outlook"
        
        return f"## {section_title}\n\n{content}\n\n"
    
    def generate_section_4(self, patient_data: Dict) -> Optional[str]:
        """
        Generate Section 4: Confounding Factors
        """
        logger.info("Generating Section 4: Confounding Factors")
        
        # Check if any confounding factors are present
        meds = patient_data["medications"]
        prior = patient_data["prior_treatments"]
        
        has_confounders = (
            meds["antibiotic_history"]["recent_antibiotics"] or
            meds["ppi_use"]["currently_on_ppi"] or
            prior["chemotherapy"]["received"] or
            prior["prior_immunotherapy"]["received"] or
            len(patient_data.get("comorbidities", [])) > 0
        )
        
        if not has_confounders:
            logger.info("No confounding factors present, omitting Section 4")
            return None
        
        # Retrieve evidence
        chunks = self.rag.retrieve_for_section_4(patient_data)
        
        if not chunks:
            logger.warning("No evidence retrieved for Section 4, omitting section")
            return None
        
        # Track citations
        for citation, title in self.rag.get_unique_citation_metadata(chunks):
            self.all_citations[citation] = title
        
        # Format evidence
        evidence = self.rag.format_chunks_for_llm(chunks)
        
        # Prepare confounding data string
        confounding_lines = []
        
        # Antibiotic history
        if meds["antibiotic_history"]["recent_antibiotics"]:
            confounding_lines.append("**Antibiotic Exposure:**")
            for exp in meds["antibiotic_history"]["exposures"]:
                confounding_lines.append(
                    f"- {exp['antibiotic_name']} ({exp['antibiotic_class']}): "
                    f"{exp['start_date']} to {exp['end_date']} "
                    f"({exp['days_before_ici']} days before ICI start)"
                )
        
        # PPI use
        if meds["ppi_use"]["currently_on_ppi"]:
            ppi = meds["ppi_use"]
            confounding_lines.append(f"**Proton Pump Inhibitor Use:**")
            confounding_lines.append(f"- {ppi['ppi_name']}, duration: {ppi['duration_months']} months")
        
        # Prior chemotherapy
        if prior["chemotherapy"]["received"]:
            chemo = prior["chemotherapy"]
            regimens_str = ", ".join(chemo["regimens"])
            confounding_lines.append(f"**Prior Chemotherapy:**")
            confounding_lines.append(f"- Regimens: {regimens_str}")
            confounding_lines.append(f"- Response: {chemo['response']}")
        
        # Prior immunotherapy
        if prior["prior_immunotherapy"]["received"]:
            prior_ici = prior["prior_immunotherapy"]
            drugs_str = ", ".join(prior_ici["drugs"])
            confounding_lines.append(f"**Prior Immunotherapy:**")
            confounding_lines.append(f"- Drugs: {drugs_str}")
            confounding_lines.append(f"- Response: {prior_ici['response']}")
        
        # Comorbidities
        if patient_data.get("comorbidities"):
            comorbidities_str = ", ".join(patient_data["comorbidities"])
            confounding_lines.append(f"**Comorbidities:** {comorbidities_str}")
        
        confounding_data_str = "\n".join(confounding_lines)
        
        # Build prompt
        prompt = build_prompt(
            "section_4",
            patient_data,
            evidence,
            confounding_data=confounding_data_str,
        )
        
        # Generate
        content = self.llm.generate(prompt, max_new_tokens=config.SECTION_MAX_NEW_TOKENS["section_4"])
        
        return f"## 4. Confounding Factors\n\n{content}\n\n"
    
    def generate_section_5(self, patient_data: Dict) -> Optional[str]:
        """
        Generate Section 5: Microbiota-Modulation Intervention Considerations
        """
        logger.info("Generating Section 5: Microbiota-Modulation Intervention Considerations")
        
        # Retrieve evidence for each intervention type
        intervention_chunks = self.rag.retrieve_for_section_5(patient_data)
        
        # Check if any intervention evidence was retrieved
        total_chunks = sum(len(chunks) for chunks in intervention_chunks.values())
        if total_chunks == 0:
            logger.warning("No intervention evidence retrieved for Section 5, omitting section")
            return None
        
        # Track citations from all intervention types
        for chunks in intervention_chunks.values():
            for citation, title in self.rag.get_unique_citation_metadata(chunks):
                self.all_citations[citation] = title
        
        # Format evidence for each intervention type
        evidence_str = "## Diet and Prebiotics Evidence\n\n"
        evidence_str += self.rag.format_chunks_for_llm(intervention_chunks.get("diet", []))
        evidence_str += "\n\n## Probiotics Evidence\n\n"
        evidence_str += self.rag.format_chunks_for_llm(intervention_chunks.get("probiotics", []))
        
        # Prepare microbiome summary
        key_bacteria = patient_data["microbiome"]["key_bacteria"]
        detected_taxa = [
            k.replace("_", " ").title()
            for k, v in key_bacteria.items()
            if v is not None and v > 0
        ]
        microbiome_summary = f"Detected taxa: {', '.join(detected_taxa[:5])}"
        
        # Build prompt
        prompt = build_prompt(
            "section_5",
            patient_data,
            evidence_str,
            microbiome_summary=microbiome_summary,
        )
        
        # Generate
        content = self.llm.generate(prompt, max_new_tokens=config.SECTION_MAX_NEW_TOKENS["section_5"])
        
        return f"## 5. Microbiota-Modulation Intervention Considerations\n\n{content}\n\n"
    
    def generate_section_6(self, patient_data: Dict) -> str:
        """
        Generate Section 6: Data Quality & Interpretive Limitations

        No retrieval happens here: the prompt is filled only from the
        microbiome data-quality fields, and the generated text is placed
        after the fixed caveats imported from the prompts module.

        Returns:
            The Markdown section (always produced).
        """
        logger.info("Generating Section 6: Data Quality & Interpretive Limitations")

        quality = patient_data["microbiome"]["data_quality"]
        metabolite_info = patient_data["microbiome"]["metabolites"]

        summary_lines = [
            f"**Sequencing Method:** {patient_data['microbiome']['sequencing_method']}",
            f"**Data Completeness:** {quality['completeness']}",
            f"**Data Source:** {quality['source']}",
        ]

        # Limitations are optional; the line is emitted only when non-empty.
        if quality.get("limitations"):
            summary_lines.append(f"**Limitations:** {', '.join(quality['limitations'])}")

        # Pair each metabolite family with whether the record has data for it.
        availability = (
            (
                "Short-chain fatty acids",
                any(level is not None for level in metabolite_info["scfa"].values()),
            ),
            ("Bile acids", metabolite_info["bile_acids_available"]),
            ("Tryptophan metabolites", metabolite_info["tryptophan_metabolites_available"]),
        )
        absent = [label for label, present in availability if not present]
        if absent:
            summary_lines.append(f"**Missing Metabolite Data:** {', '.join(absent)}")

        # The template is formatted directly rather than through build_prompt,
        # because there is no RAG evidence for this section.
        from .prompts import SECTION_6_PROMPT
        section_prompt = SECTION_6_PROMPT.format(data_quality="\n".join(summary_lines))

        token_budget = config.SECTION_MAX_NEW_TOKENS["section_6"]
        body = self.llm.generate(section_prompt, max_new_tokens=token_budget)

        from .prompts import SECTION_6_FIXED_CAVEATS
        return (
            "## 6. Data Quality & Interpretive Limitations\n\n"
            f"{SECTION_6_FIXED_CAVEATS}\n\n{body}\n\n"
        )
    
    def get_all_citations(self) -> List[tuple]:
        """Return sorted list of all unique (citation, title) tuples used in the report"""
        return sorted(list(self.all_citations.items()))