ahuggingface01 commited on
Commit
a5b6ba6
·
verified ·
1 Parent(s): 4f75e01

Upload 8 files

Browse files
main.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import argparse
4
+ import os
5
+ import math
6
+
7
+ # Imports from your specific file structure
8
+ from src.preprocess.wtts_builder import WTTSBuilder
9
+ from src.utils.data_loader import DataLoader
10
+
11
# RAG pipeline imports (lazy — only used when --use_rag is set)
# Feature probe: these modules pull in heavy optional dependencies
# (sentence-transformers / faiss, per the install hint printed in main()),
# so a failed import simply disables the RAG code path instead of crashing.
RAG_AVAILABLE = False
try:
    from src.rag.embedder import WTTSEmbedder
    from src.rag.rag_pipeline import RAGCRFExtractor
    RAG_AVAILABLE = True
except ImportError:
    pass
19
+
20
+ import google.generativeai as genai
21
+
22
# --- CONFIG ---
# SECURITY FIX: a literal Google API key was previously committed here; any
# key that has been in version control must be treated as compromised and
# rotated. Read the key from the environment instead — the `--api_key` CLI
# flag (which defaults to this value) still overrides it.
API_KEY = os.environ.get("GOOGLE_API_KEY", "")
24
+
25
# --- PROMPTS ---

# Pass 1 prompt: condenses the WTTS tuple stream into a chronological
# "skeleton" while preserving the [S_xx] sentence IDs needed as evidence
# pointers in pass 2. Placeholder: {wtts_string}.
SKELETON_PROMPT = """
You are a Clinical Data Specialist.
Convert the Weighted Time Series (WTTS) below into a "Clinical Chronology Skeleton".

INPUT (WTTS):
{wtts_string}

INSTRUCTIONS:
1. Create a strict chronological timeline (Admission to Discharge).
2. IMPORTANT: You MUST retain the [S_xx] ID for every event you list.
3. Filter out "Routine" (Weight 0.1) events unless they indicate a status change.
4. Keep exact values (e.g., "BP 90/60", "Temp 102.5").

OUTPUT FORMAT:
[Date] [S_xx]: Event details
[Date] [S_xx]: Event details
...
"""

# Pass 2 prompt: closed-vocabulary CRF item extraction against the skeleton.
# Placeholders: {skeleton}, {chunk_schema_json}. The doubled braces ({{ }})
# are literal braces escaped for str.format().
EXTRACTION_PROMPT = """
You are a Clinical Coding Expert.
Review the Patient Skeleton and the Valid Options for the requested items.

PATIENT SKELETON:
{skeleton}

TASK:
For each Clinical Item listed below, determine the value AND the supporting Sentence ID.
1. **Value**: Must come strictly from the "Valid Options" provided.
2. **Evidence**: Must be the specific [S_xx] ID from the skeleton that proves the value.

ITEMS TO EXTRACT & THEIR OPTIONS:
{chunk_schema_json}

OUTPUT FORMAT (JSON Object):
{{
    "item_name": {{
        "value": "Selected Option",
        "evidence": "S_xx",
        "reasoning": "Brief explanation"
    }},
    ...
}}
"""
71
+
72
def chunk_data(data, size):
    """Yield consecutive slices of *data*, each holding at most *size* items."""
    bounds = range(0, len(data), size)
    for lo in bounds:
        hi = lo + size
        yield data[lo:hi]
76
+
77
+
78
async def generate_async(prompt, model, max_retries=3, initial_delay=1):
    """Call Gemini via the google-generativeai SDK without blocking the event loop.

    The synchronous ``model.generate_content`` call is pushed onto the default
    thread-pool executor. The response text is parsed as JSON; an invalid-JSON
    response consumes a retry attempt. Transient failures (HTTP 429 / 500 in
    the error text) are retried with exponential backoff; any other exception
    is returned immediately.

    Args:
        prompt: Prompt string sent to the model.
        model: ``genai.GenerativeModel`` instance (anything exposing
            ``generate_content``).
        max_retries: Total attempts before giving up.
        initial_delay: Backoff base in seconds (delay = initial_delay * 2**attempt).

    Returns:
        dict: Parsed JSON response on success, else ``{"error": <message>}``.
    """
    # FIX: get_running_loop() is the correct call inside a coroutine;
    # get_event_loop() is deprecated in this context since Python 3.10.
    loop = asyncio.get_running_loop()
    for attempt in range(max_retries):
        try:
            response = await loop.run_in_executor(
                None,
                lambda: model.generate_content(
                    contents=prompt,
                    generation_config=genai.GenerationConfig(
                        response_mime_type="application/json"
                    ),
                ),
            )
            try:
                return json.loads(response.text)
            except json.JSONDecodeError:
                # Bad JSON burns one attempt but retries immediately.
                print("Generated content is not valid JSON. Retrying...")
                continue

        except Exception as e:
            error_message = str(e)
            # Only rate-limit (429) and server (500) errors are retryable.
            if "429" in error_message or "500" in error_message:
                if attempt < max_retries - 1:
                    delay = initial_delay * (2 ** attempt)  # exponential backoff
                    print(f"Rate limit / server error. Retrying in {delay}s...")
                    await asyncio.sleep(delay)
                else:
                    print("Max retries reached.")
                    return {"error": f"Max retries reached - {error_message}"}
            else:
                print(f"Error in generate_async: {error_message}")
                return {"error": error_message}

    return {"error": "Failed to generate valid JSON after multiple attempts"}
114
+
115
+
116
async def process_patient(model, builder, patient_data, target_items, valid_options, semaphore):
    """Run the two-pass (skeleton -> extraction) pipeline for one patient."""
    async with semaphore:
        pid = str(patient_data.get('document_id') or patient_data.get('patient_id')
                  or patient_data.get('hadm_id') or 'unknown')

        try:
            # Phase 1: serialize the record into a WTTS timeline string.
            timeline = builder.build_wtts_string(patient_data)

            # Phase 2 (pass 1): condense the timeline into a chronology skeleton.
            skeleton_resp = await generate_async(
                SKELETON_PROMPT.format(wtts_string=timeline), model
            )
            if isinstance(skeleton_resp, dict):
                skeleton_text = json.dumps(skeleton_resp)
            else:
                skeleton_text = str(skeleton_resp)

            # Phase 3 (pass 2): extract CRF items in groups of 10 against it.
            predictions = {}
            for group in list(chunk_data(target_items, 10)):
                schema = {
                    name: valid_options.get(name, ["Yes", "No", "Unknown"])
                    for name in group
                }
                group_resp = await generate_async(
                    EXTRACTION_PROMPT.format(
                        skeleton=skeleton_text,
                        chunk_schema_json=json.dumps(schema)
                    ),
                    model,
                )

                if isinstance(group_resp, dict):
                    if 'error' in group_resp:
                        print(f" [WARN] LLM error for {pid}, chunk {group[:3]}...: {group_resp['error']}")
                    else:
                        predictions.update(group_resp)

            return {
                "patient_id": pid,
                "skeleton_debug": skeleton_text[:500] + "...",
                "predictions": predictions
            }

        except Exception as e:
            print(f"Error processing {pid}: {e}")
            return None
166
+
167
+
168
+ # ---------------------------------------------------------------------------
169
+ # EVALUATION -- Accuracy & F1 Scoring
170
+ # ---------------------------------------------------------------------------
171
+
172
+ def _normalise(value):
173
+ """Lowercase + strip for fair comparison."""
174
+ if value is None:
175
+ return ""
176
+ return str(value).strip().lower()
177
+
178
+
179
def evaluate_predictions(results, gt_path):
    """
    Compare pipeline results against dev_gt.jsonl.
    Prints accuracy, macro-F1, per-item breakdown, and sample errors.
    Returns (overall_dict, per_item_dict).
    """
    # --- Load GT ---
    # gt: document_id -> {item name -> ground-truth value}
    gt = {}
    with open(gt_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            doc_id = str(rec['document_id'])
            gt[doc_id] = {a['item']: a['ground_truth'] for a in rec.get('annotations', [])}

    # --- Build prediction lookup ---
    # preds: patient_id -> {item name -> predicted value as plain string}.
    # Dict predictions ({"value":..., "evidence":...}) are reduced to "value".
    preds = {}
    for r in results:
        doc_id = str(r.get('patient_id', 'unknown'))
        items = {}
        for item_name, item_val in r.get('predictions', {}).items():
            if isinstance(item_val, dict):
                items[item_name] = item_val.get('value', str(item_val))
            else:
                items[item_name] = str(item_val)
        preds[doc_id] = items

    # --- Collect all unique items ---
    # The item universe comes from ground truth only; extra predicted items
    # are ignored.
    all_items = set()
    for doc_items in gt.values():
        all_items.update(doc_items.keys())

    # --- Score ---
    item_stats = {item: {'tp': 0, 'fp': 0, 'fn': 0, 'total': 0, 'correct': 0}
                  for item in all_items}
    total_comparisons = 0
    total_correct = 0
    matched_patients = 0
    errors = []

    for doc_id, gt_items in gt.items():
        pred_items = preds.get(doc_id, {})
        if pred_items:
            matched_patients += 1

        for item_name, gt_val in gt_items.items():
            gt_norm = _normalise(gt_val)
            pred_val = pred_items.get(item_name)
            pred_norm = _normalise(pred_val) if pred_val is not None else ""

            total_comparisons += 1
            item_stats[item_name]['total'] += 1

            # NOTE(review): counting scheme is degenerate — a match is both
            # "correct" and a TP, any miss is an FN, and a non-empty wrong
            # prediction additionally counts as an FP. Recall therefore
            # equals accuracy per item; confirm this is the intended metric.
            if gt_norm == pred_norm:
                total_correct += 1
                item_stats[item_name]['correct'] += 1
                item_stats[item_name]['tp'] += 1
            else:
                item_stats[item_name]['fn'] += 1
                if pred_norm:
                    item_stats[item_name]['fp'] += 1
                errors.append((doc_id, item_name, gt_val,
                               pred_val if pred_val is not None else '<MISSING>'))

    accuracy = total_correct / total_comparisons if total_comparisons > 0 else 0.0

    # --- Per-item P/R/F1 ---
    f1s = []
    per_item = {}
    for item_name in sorted(all_items):
        s = item_stats[item_name]
        tp, fp, fn = s['tp'], s['fp'], s['fn']
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0
        item_acc = s['correct'] / s['total'] if s['total'] > 0 else 0.0
        per_item[item_name] = {'accuracy': item_acc, 'precision': prec,
                               'recall': rec, 'f1': f1, 'total': s['total']}
        f1s.append(f1)

    # Macro-F1: unweighted mean over items (items with zero samples excluded
    # implicitly because only GT items enter all_items).
    macro_f1 = sum(f1s) / len(f1s) if f1s else 0.0

    # --- Print report ---
    print("\n" + "=" * 70)
    print(" CL4Health CRF Filling -- Evaluation Report")
    print("=" * 70)
    print(f"\n GT Patients: {len(gt)}")
    print(f" Pred Patients: {len(preds)}")
    print(f" Matched Patients: {matched_patients}")
    print(f"\n Total Comparisons: {total_comparisons}")
    print(f" Total Correct: {total_correct}")
    print(f"\n {'Accuracy':>20s}: {accuracy:.4f}")
    print(f" {'Macro F1':>20s}: {macro_f1:.4f}")

    # Top / bottom items
    sorted_items = sorted(per_item.items(), key=lambda x: x[1]['f1'], reverse=True)
    n_show = min(15, len(sorted_items))

    print(f"\n Top {n_show} Items by F1:")
    print(f" {'Item':<45s} {'Acc':>6s} {'P':>6s} {'R':>6s} {'F1':>6s}")
    print(f" {'-'*45} {'---':>6s} {'---':>6s} {'---':>6s} {'---':>6s}")
    for name, s in sorted_items[:n_show]:
        print(f" {name:<45s} {s['accuracy']:>6.2f} {s['precision']:>6.2f} {s['recall']:>6.2f} {s['f1']:>6.2f}")

    print(f"\n Bottom {n_show} Items by F1:")
    print(f" {'Item':<45s} {'Acc':>6s} {'P':>6s} {'R':>6s} {'F1':>6s}")
    print(f" {'-'*45} {'---':>6s} {'---':>6s} {'---':>6s} {'---':>6s}")
    for name, s in sorted_items[-n_show:]:
        print(f" {name:<45s} {s['accuracy']:>6.2f} {s['precision']:>6.2f} {s['recall']:>6.2f} {s['f1']:>6.2f}")

    # Sample errors
    if errors:
        n_err = min(15, len(errors))
        print(f"\n Sample Mismatches ({n_err} of {len(errors)}):")
        print(f" {'DocID':<12s} {'Item':<40s} {'GT':<20s} {'Pred':<20s}")
        print(f" {'-'*12} {'-'*40} {'-'*20} {'-'*20}")
        for doc_id, item, gt_v, pred_v in errors[:n_err]:
            print(f" {doc_id:<12s} {item:<40s} {str(gt_v):<20s} {str(pred_v):<20s}")

    print("=" * 70)

    return {'accuracy': round(accuracy, 4), 'macro_f1': round(macro_f1, 4)}, per_item
303
+
304
+
305
+ # ---------------------------------------------------------------------------
306
+ # MAIN
307
+ # ---------------------------------------------------------------------------
308
+
309
async def main():
    """CLI entry point: load data, run the two-pass (or RAG) pipeline for every
    patient, save predictions as JSON, and optionally score them against GT.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--api_key", default=API_KEY,
                        help="Google AI Studio API key")
    parser.add_argument("--model_name", default="gemini-1.5-pro",
                        help="Gemini model name")
    # NOTE(review): absolute Windows paths as defaults make this script
    # non-portable — consider relative defaults or making these required.
    parser.add_argument("--data_folders", nargs="+",
                        default=[
                            r"C:\Users\sai78\Desktop\Clinical_CRF_filling\data\raw\dyspnea-clinical-notes",
                            r"C:\Users\sai78\Desktop\Clinical_CRF_filling\data\raw\dyspnea-crf-development",
                        ],
                        help="Directories containing .parquet shards (searched recursively)")
    parser.add_argument("--gt_file",
                        default=r"C:\Users\sai78\Desktop\Clinical_CRF_filling\data\raw\dev_gt.jsonl")
    parser.add_argument("--options_folder",
                        default=r"C:\Users\sai78\Desktop\Clinical_CRF_filling\data\raw\dyspnea-valid-options\dyspnea-valid-options\data")
    parser.add_argument("--output_file",
                        default="data/processed/materialized_ehr/submission.json")
    parser.add_argument("--skip_eval", action="store_true",
                        help="Skip evaluation after generating predictions")
    parser.add_argument("--concurrency", type=int, default=5,
                        help="Max concurrent LLM calls (free tier: keep at 5)")
    # --- RAG options ---
    parser.add_argument("--use_rag", action="store_true",
                        help="Use RAG-guided extraction (retrieves relevant tuples per CRF item)")
    parser.add_argument("--rag_top_k", type=int, default=15,
                        help="Number of WTTS tuples to retrieve per CRF item group (RAG mode)")
    parser.add_argument("--rag_model", type=str, default="all-MiniLM-L6-v2",
                        help="SentenceTransformer model for embeddings (swap to clinical model on GPU)")
    parser.add_argument("--rag_device", type=str, default="cpu",
                        help="Device for embedding model: 'cpu' or 'cuda'")
    args = parser.parse_args()

    # 1. Setup — Configure Gemini API
    genai.configure(api_key=args.api_key)
    model = genai.GenerativeModel(args.model_name)
    print(f"Using model: {args.model_name} (Google AI Studio)")

    # Limit concurrency (free tier = 15 RPM, so keep low)
    semaphore = asyncio.Semaphore(args.concurrency)

    # 2. Load Data
    loader = DataLoader(data_folders=args.data_folders, gt_path=args.gt_file)

    target_items = loader.get_target_schema()
    valid_options = loader.load_valid_options(args.options_folder)

    merged_data = loader.load_and_merge()

    if not merged_data:
        print("No data found. Exiting.")
        return

    # 3. Process
    builder = WTTSBuilder()
    print(f"Starting pipeline for {len(merged_data)} patients...")
    print(f"Schema: {len(target_items)} items per patient.")

    if args.use_rag:
        # --- RAG Pipeline ---
        # RAG_AVAILABLE is False when the optional imports at module top failed.
        if not RAG_AVAILABLE:
            print("ERROR: RAG dependencies not installed. Run:")
            print(" pip install sentence-transformers faiss-cpu")
            return

        print(f"\n [RAG MODE] Embedding model: {args.rag_model}")
        print(f" [RAG MODE] Device: {args.rag_device}")
        print(f" [RAG MODE] Top-k: {args.rag_top_k}\n")

        embedder = WTTSEmbedder(model_name=args.rag_model, device=args.rag_device)
        extractor = RAGCRFExtractor(
            embedder=embedder,
            generate_fn=generate_async,
            top_k=args.rag_top_k,
        )

        tasks = [
            extractor.extract_patient(
                p, builder, target_items, valid_options, semaphore, model
            )
            for p in merged_data
        ]
    else:
        # --- Original Two-Pass Pipeline ---
        tasks = [
            process_patient(model, builder, p, target_items, valid_options, semaphore)
            for p in merged_data
        ]

    # Failed patients return None and are dropped here.
    results = await asyncio.gather(*tasks)
    results = [r for r in results if r is not None]

    # 4. Save
    os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
    with open(args.output_file, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nDone! {len(results)} results saved to {args.output_file}")

    # 5. Evaluate against GT
    if not args.skip_eval:
        print("\nRunning evaluation against ground truth...")
        overall, _ = evaluate_predictions(results, args.gt_file)
        print(f"\n >>> Final Accuracy: {overall['accuracy']:.4f} | Macro F1: {overall['macro_f1']:.4f}")
413
+
414
+
415
# Script entry point: run the async pipeline to completion.
if __name__ == "__main__":
    asyncio.run(main())
predictor.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import time
3
+ import asyncio
4
+ import glob
5
+ import argparse
6
+ from argparse import ArgumentTypeError
7
+ from pathlib import Path
8
+ from tqdm import tqdm
9
+ import google.generativeai as genai
10
+ from typing import Dict, Optional
11
+
12
+ import src.utils.utils as utils
13
+
14
# Default Gemini generation parameters.
# NOTE(review): this module-level dict is not referenced by generate_async(),
# which builds its own inline generation_config — confirm whether this is
# still needed or is dead configuration.
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}
19
+
20
def merge_jsonl_files(input_folder, output_file):
    """Concatenate every *.jsonl file in *input_folder* into *output_file*,
    keeping only the first record seen for each person_id."""
    seen_ids = set()
    with open(output_file, 'w') as sink:
        for path in glob.glob(f"{input_folder}/*.jsonl"):
            with open(path, 'r') as source:
                for raw_line in source:
                    pid = str(json.loads(raw_line).get('person_id', ''))
                    if pid in seen_ids:
                        continue
                    seen_ids.add(pid)
                    sink.write(raw_line)
    print(f"Merged {len(seen_ids)} unique records into {output_file}")
32
+
33
def get_context_path_safe(ehr_context):
    """Map an ehr_context value to a filesystem-safe path component."""
    looks_numeric = isinstance(ehr_context, int) or (
        isinstance(ehr_context, str) and ehr_context.isdigit()
    )
    if looks_numeric:
        return f"context_{ehr_context}"
    return str(ehr_context)
38
+
39
def validate_ehr_context(value):
    """argparse type-validator for --ehr_context.

    Accepts either predefined strings or positive integers.

    Returns:
        - String values unchanged ("full", "last_five")
        - Integer values as integers
        - For path construction, numeric values are converted to
          "context_{value}" format elsewhere (see get_context_path_safe).

    Raises:
        ArgumentTypeError: for non-numeric strings other than the two
            literals, and for non-positive integers.
    """
    if value in ["full", "last_five"]:
        return value
    try:
        context_length = int(value)
    except ValueError:
        # BUGFIX: the two implicit string fragments previously concatenated
        # as "...'last_five'or a positive..." — a separating space was missing.
        raise ArgumentTypeError(
            f"Invalid ehr_context value: {value}. Must be either 'full', 'last_five' "
            "or a positive integer for context length."
        )
    # Raised outside the try so it cannot be confused with the parse failure
    # (ArgumentTypeError is not a ValueError, so behavior is unchanged).
    if context_length <= 0:
        raise ArgumentTypeError(f"Context length must be positive, got {value}")
    return context_length
60
+
61
+
62
async def generate_async(prompt, model, max_retries=3, initial_delay=1):
    """Call Gemini and return its response parsed as JSON.

    The blocking SDK call runs on the default thread-pool executor so the
    event loop stays responsive. Invalid-JSON responses consume a retry;
    transient HTTP 429/500 errors are retried with exponential backoff,
    while any other error is returned immediately.

    Args:
        prompt: Prompt text passed to ``model.generate_content``.
        model: A ``genai.GenerativeModel`` (anything exposing generate_content).
        max_retries: Total number of attempts.
        initial_delay: Backoff base in seconds (delay doubles each attempt).

    Returns:
        dict: Parsed JSON on success, else ``{"error": <message>}``.
    """
    # FIX: get_running_loop() replaces get_event_loop(), which is deprecated
    # inside a coroutine since Python 3.10.
    loop = asyncio.get_running_loop()
    for attempt in range(max_retries):
        try:
            # Wrap the synchronous SDK call so it doesn't block the event loop
            response = await loop.run_in_executor(
                None,
                lambda: model.generate_content(
                    contents=prompt,
                    generation_config={"response_mime_type": "application/json"},
                    stream=False
                )
            )
            try:
                return json.loads(response.text)
            except json.JSONDecodeError:
                print("Generated content is not valid JSON. Retrying...")
                continue

        except Exception as e:
            error_message = str(e)
            # FIX: also retry rate-limit (429) errors, matching the retry
            # policy of main.py's generate_async; previously only 500s were
            # retried and a 429 aborted the whole call immediately.
            if "429" in error_message or "500" in error_message:
                if attempt < max_retries - 1:
                    delay = initial_delay * (2 ** attempt)  # Exponential backoff
                    print(f"Encountered transient error. Retrying in {delay} seconds...")
                    await asyncio.sleep(delay)
                else:
                    print("Max retries reached. Unable to generate content.")
                    return {"error": f"Max retries reached - {error_message}"}
            else:
                print(f"Error in generate_async: {error_message}")
                return {"error": error_message}

    return {"error": "Failed to generate valid JSON content after multiple attempts"}
97
+
98
+
99
async def process_single_timeline(timeline, args, model, semaphore, windowed_jsonl_file):
    """Generate LLM responses for every context window of one patient timeline.

    Args:
        timeline: One merged-timeline record (dict with at least 'person_id').
        args: Parsed CLI args (uses prompt_method, prompt_template, ehr_context).
        model: Gemini model passed through to generate_async.
        semaphore: asyncio.Semaphore bounding concurrent LLM calls.
        windowed_jsonl_file: Destination file handed to utils.process_ehr_context.

    Returns:
        dict with person_id / visit_occurrence_id / responses / specialty,
        or None on failure (the error is printed, not raised).
    """
    person_id = timeline.get('person_id', 'unknown')
    print(f"Processing timeline for person_id: {person_id}")

    try:
        context_windows = utils.process_ehr_context(
            timeline,
            args.ehr_context,
            windowed_jsonl_file
        )
        responses = []
        # BUGFIX: initialise before the loop. These were previously bound
        # only inside the loop body, so an empty context_windows list caused
        # a NameError at the return statement below (swallowed into a None
        # result by the except clause).
        specialty = None
        visit_occurrence_id = None

        def _window_record(window, response):
            # Response payload shared by both prompt methods (was duplicated).
            return {
                "window_index": window.get("window_index", 0),
                "response": response,
                "window_token_count": window.get("window_token_count"),
                "window_percent_full": window.get("window_percent_full"),
                "start_date": window.get("start_date"),
                "end_date": window.get("end_date")
            }

        for window in context_windows:
            if args.prompt_method == "general":
                prompt = utils.create_prompt_from_timeline(
                    window,
                    args.prompt_template
                )
                specialty = None
                visit_occurrence_id = None
            elif args.prompt_method == "persona":
                # Persona mode also yields the specialty / visit this window
                # belongs to; the last window's values are returned.
                prompt, specialty, visit_occurrence_id = utils.create_persona_prompt_from_timeline(
                    window,
                    args.prompt_template,
                    person_id
                )
            else:
                raise ValueError(f"Unknown prompt method: {args.prompt_method}")

            # Hold the semaphore only for the duration of the LLM call.
            async with semaphore:
                response = await generate_async(prompt, model)
            responses.append(_window_record(window, response))

        print(f"Completed processing for person_id: {person_id} with {len(responses)} context windows")
        return {
            "person_id": person_id,
            "visit_occurrence_id": visit_occurrence_id,
            "responses": responses,
            "specialty": specialty
        }
    except Exception as e:
        print(f"Error processing timeline for person_id {person_id}: {str(e)}")
        return None
158
+
159
async def process_timelines(merged_jsonl_file, args, windowed_jsonl_file):
    """Configure Gemini, then run process_single_timeline over each timeline
    in *merged_jsonl_file* (optionally capped at args.max_samples).

    Returns:
        list: the non-None per-timeline result dicts.
    """
    genai.configure(api_key=args.api_key)
    model = genai.GenerativeModel(args.model_name)
    semaphore = asyncio.Semaphore(args.max_concurrent_calls)

    with open(merged_jsonl_file, 'r') as f:
        timelines = [json.loads(line) for line in f]
    # Cap the workload when --max_samples is given (falsy means "all").
    total_timelines = min(len(timelines), args.max_samples) if args.max_samples else len(timelines)
    print(f"Found {total_timelines} total timelines")

    results = []
    processed_count = 0
    # NOTE(review): timelines are awaited one at a time, so the semaphore
    # only limits the per-window calls inside a single timeline — there is
    # no cross-patient concurrency here.
    for timeline in tqdm(timelines[:total_timelines], total=total_timelines, desc="Processing timelines"):
        try:
            result = await process_single_timeline(timeline, args, model, semaphore, windowed_jsonl_file)
            if result is not None:
                results.append(result)
                processed_count += 1
                # Periodic progress line every 10 successful timelines.
                if processed_count % 10 == 0 or processed_count == total_timelines:
                    print(f"Processed {processed_count}/{total_timelines} samples")
        except Exception as e:
            print(f"Error processing timeline: {e}")

    return results
183
+
184
def save_single_response(resp_data: Dict, output_file: Path, person_id: str, template_name: Optional[str] = None):
    """Write one response payload to *output_file* as pretty-printed JSON,
    logging success or failure (errors are reported, never raised)."""
    suffix = f", template: {template_name}" if template_name else ""
    try:
        with open(output_file, 'w') as f:
            json.dump(resp_data, f, indent=2)
        print(f"Saved response for person_id: {person_id}" + suffix)
    except Exception as e:
        print(f"Error saving response for person_id: {person_id}" + suffix)
        print(f"Error details: {str(e)}")
194
+
195
def save_responses(responses, output_folder, prompt_method):
    """
    Persist every window response, together with its window metadata and
    date range, as one JSON file per (person, window) under *output_folder*.
    """
    for entry in responses:
        person_id = entry["person_id"]

        for win in entry["responses"]:
            # File name encodes person, (persona) specialty, and window index.
            if prompt_method == "general":
                output_file = Path(output_folder) / f"{person_id}_{win['window_index']}.json"
            elif prompt_method == "persona":
                specialty = entry.get("specialty", "unknown")
                output_file = Path(output_folder) / f"{person_id}_{specialty}_{win['window_index']}.json"

            metadata = {
                "person_id": person_id,
                "window_index": win["window_index"],
                "window_token_count": win["window_token_count"],
                "window_percent_full": win["window_percent_full"],
                "start_date": win["start_date"],
                "end_date": win["end_date"],
                "response_data": win["response"]
            }

            output_file.parent.mkdir(parents=True, exist_ok=True)
            save_single_response(metadata, output_file, person_id)

            print(f"Saved response for person_id: {person_id}, "
                  f"window: {win['window_index']}, "
                  f"dates: {win['start_date']} to {win['end_date']}")

    print(f"Saved responses from {len(responses)} timelines to {output_folder}")
227
+
228
+
229
async def main(args):
    """Pipeline driver: merge timelines, generate responses per window,
    save them to a model/method/context-keyed folder tree, report timing."""
    start_time = time.time()

    merged_jsonl_file = Path(args.materialized_ehr_folder) / f"merged_timelines.jsonl"
    windowed_dir = Path(args.materialized_ehr_folder) / "windowed"
    windowed_dir.mkdir(parents=True, exist_ok=True)
    windowed_jsonl_file = windowed_dir / f"windowed_timelines_{args.prompt_method}_{args.ehr_context}.jsonl"

    # Deduplicate per-patient JSONL shards into one merged file.
    merge_jsonl_files(args.materialized_ehr_folder, merged_jsonl_file)

    print("Processing timelines...")
    responses = await process_timelines(merged_jsonl_file, args, windowed_jsonl_file)
    print(f"Processed {len(responses)} samples")

    if args.max_samples:
        print(f"Limited to {args.max_samples} samples for testing")

    if responses:
        # Output layout: <output>/<model>/<method>/<context>/<template-stem>/
        context_path = get_context_path_safe(args.ehr_context)
        output_folder = Path(args.output_folder) / args.model_name / args.prompt_method / context_path / Path(args.prompt_template).stem
        save_responses(responses, output_folder, args.prompt_method)

        total_time = time.time() - start_time
        print(f"Total time taken: {total_time:.2f} seconds")
        print(f"Average time per sample: {total_time / len(responses):.2f} seconds")
    else:
        print("No responses were generated. Check the logs for more information.")
256
+
257
+
258
# CLI entry point: parse arguments and run the async pipeline.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate synthetic instruction-response pairs from materialized patient timelines using async processing.")
    parser.add_argument("--materialized_ehr_folder", type=str, help="Folder containing materialized EHR JSONL files")
    parser.add_argument("--prompt_template", type=str)
    parser.add_argument("--prompt_method", type=str, default="persona", help="Prompt method: general, persona")
    parser.add_argument("--ehr_context", type=validate_ehr_context, default=16384,
                        help="EHR context to use: 'full', 'last_five', or a positive integer for context length (e.g., 8192)")
    parser.add_argument("--output_folder", type=str)
    parser.add_argument("--project_id", type=str)
    parser.add_argument("--location", type=str, default="us-central1", help="Google Cloud location")
    parser.add_argument("--model_name", type=str)
    parser.add_argument("--max_samples", type=int)
    # BUGFIX: this value is fed directly to asyncio.Semaphore(), which needs
    # an int. Without type=int argparse handed over a string, and with no
    # default the value was None — either way Semaphore() raised TypeError.
    parser.add_argument("--max_concurrent_calls", type=int, default=5,
                        help="Maximum number of concurrent LLM calls")
    parser.add_argument("--csv_file", type=str)
    parser.add_argument("--dataset_id", type=str)
    parser.add_argument("--table_id", type=str)
    args = parser.parse_args()

    asyncio.run(main(args))
preprocess/__init__.py ADDED
File without changes
preprocess/wtts_builder.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import os
4
+ import pandas as pd
5
+ import argparse
6
+ from datetime import datetime
7
+ from src.utils.data_loader import DataLoader
8
+
9
class WTTSBuilder:
    """Builds a Weighted Time-annotated Tuple Series (WTTS) string from a
    patient's clinical notes.

    Each sentence becomes one tuple `[S_i] ("timestamp", "sentence", P_j, W)`
    where P_j is the note's normalized position within the admission (0..1)
    and W is a keyword-derived importance weight: 1.0 critical, 0.5 chronic
    (also the default), 0.1 routine.
    """

    def __init__(self):
        # --- WEIGHTING RULES (W) ---
        # Pattern groups checked in priority order:
        # critical (1.0) > chronic (0.5) > routine (0.1).
        self.critical_patterns = [
            r'respiratory failure', r'seizure', r'cardiac arrest', r'intubat',
            r'abnormal', r'critical', r'hemorrhage', r'positive',
            r'emergency', r'acute', r'hypoxia', r'flagged', r'icu',
            r'dyspnea', r'shortness of breath', r'sob', r'mrc grade', r'nyha'
        ]
        self.chronic_patterns = [
            r'history of', r'chronic', r'stable', r'continued',
            r'maintained', r'diagnosed with', r'previous', r'known'
        ]
        self.routine_patterns = [
            r'routine', r'normal', r'negative', r'unremarkable',
            r'no acute', r'clear', r'regular diet', r'resting'
        ]

    def _get_normalized_time(self, event_time_str, admit_str, disch_str):
        """Return P_j in [0.0, 1.0]: fraction of the stay elapsed at event time.

        Falls back to 0.5 (mid-stay) when any timestamp cannot be parsed,
        and clamps to 1.0 when the stay has non-positive duration.
        """
        try:
            e_dt = pd.to_datetime(event_time_str)
            a_dt = pd.to_datetime(admit_str)
            d_dt = pd.to_datetime(disch_str)

            total_duration = (d_dt - a_dt).total_seconds()
            elapsed = (e_dt - a_dt).total_seconds()

            if total_duration <= 0:
                return 1.0
            return round(max(0.0, min(1.0, elapsed / total_duration)), 2)
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Exception still covers all
            # parsing/arithmetic failures.
            return 0.5

    def _get_weight(self, text):
        """Return importance weight W for a sentence (critical > chronic > routine)."""
        t = text.lower()
        if any(re.search(p, t) for p in self.critical_patterns):
            return 1.0
        if any(re.search(p, t) for p in self.chronic_patterns):
            return 0.5
        if any(re.search(p, t) for p in self.routine_patterns):
            return 0.1
        return 0.5  # unmatched sentences default to medium weight

    def _extract_sentences_with_ids(self, text, start_index):
        """Split a note into sentences and assign globally unique S_xx IDs.

        Newlines/tabs are flattened first so each tuple stays on one output
        line, and de-identification brackets ([** ... **]) are stripped.

        Returns:
            tuple: (list of (sid, sentence) pairs, next free sentence index).
        """
        # 1. Replace newlines/tabs with spaces to keep tuple on one line
        text = text.replace('\n', ' ').replace('\r', '').replace('\t', ' ')

        # 2. Remove de-id brackets
        text = re.sub(r'\[\*\*.*?\*\*\]', '', text)

        # 3. Split by sentence boundaries (lookbehinds avoid splitting on
        #    common abbreviation shapes like "e.g." or "Dr.")
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)

        results = []
        current_idx = start_index
        for s in sentences:
            clean_s = s.strip()
            # Ignore very short/empty fragments
            if len(clean_s) > 5:
                sid = f"S_{current_idx}"
                results.append((sid, clean_s))
                current_idx += 1
        return results, current_idx

    def build_wtts_string(self, patient_data):
        """Serialize one patient's notes into a single pipe-delimited WTTS string.

        Args:
            patient_data: dict with optional 'admission_time', 'discharge_time'
                and a 'notes' list of {'timestamp', 'text'} dicts.
        """
        tuples = []
        admit = patient_data.get('admission_time')
        disch = patient_data.get('discharge_time')

        # Chronological order so S_xx IDs increase with time.
        sorted_notes = sorted(patient_data.get('notes', []), key=lambda x: x['timestamp'])

        global_sent_idx = 0

        for note in sorted_notes:
            raw_ts = note['timestamp']
            p_j = self._get_normalized_time(raw_ts, admit, disch)

            events, global_sent_idx = self._extract_sentences_with_ids(note['text'], global_sent_idx)

            for (sid, event) in events:
                w = self._get_weight(event)
                # Format: [ID] ("Date", "Event", P_j, W)
                tuples.append(f'[{sid}] ("{raw_ts}", "{event}", {p_j}, {w})')

        return " | ".join(tuples)
95
+
96
# --- EXECUTION LOGIC ---
# Standalone preprocessing entry point: materialize one WTTS timeline file
# per patient so downstream scripts can consume them as JSONL.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process raw clinical notes into WTTS format.")
    # Set defaults for easier running
    parser.add_argument("--input_dirs", nargs="+",
                        default=[
                            "data/raw/dyspnea-clinical-notes",
                            "data/raw/dyspnea-crf-development",
                        ],
                        help="Directories containing .parquet shards (searched recursively)")
    parser.add_argument("--gt_file", type=str,
                        default="data/raw/dev_gt.jsonl",
                        help="Path to the ground truth JSONL file.")
    parser.add_argument("--output_dir", type=str,
                        default="data/processed/materialized_ehr",
                        help="Path to store processed JSONL files.")

    args = parser.parse_args()

    loader = DataLoader(data_folders=args.input_dirs, gt_path=args.gt_file)
    builder = WTTSBuilder()

    patients = loader.load_and_merge()
    os.makedirs(args.output_dir, exist_ok=True)

    if not patients:
        print("No patients found! Check paths.")
    else:
        print(f"Materializing timelines for {len(patients)} patients...")
        for p in patients:
            wtts_output = builder.build_wtts_string(p)

            # FIX: Prioritize document_id to match DataLoader logic
            pid = str(p.get('document_id') or p.get('patient_id') or p.get('hadm_id'))

            # One single-record JSONL file per patient, keyed as "person_id"
            # to match the merge step in predictor.py.
            output_path = os.path.join(args.output_dir, f"{pid}.jsonl")
            with open(output_path, 'w') as f:
                json.dump({"person_id": pid, "text": wtts_output}, f)

        print(f"Successfully stored outputs in: {args.output_dir}")
rag/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # RAG-guided CRF extraction module
2
+ from src.rag.embedder import WTTSEmbedder
3
+ from src.rag.retriever import WTTSRetriever
4
+ from src.rag.rag_pipeline import RAGCRFExtractor
rag/embedder.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ WTTS Tuple Embedder — Embeds clinical event tuples and CRF queries
3
+ into vector space using SentenceTransformers.
4
+
5
+ Swap the model_name to a clinical model (e.g., MedCPT) when GPU is available.
6
+ """
7
+
8
+ import re
9
+ import numpy as np
10
+ from typing import List, Dict, Optional
11
+ from sentence_transformers import SentenceTransformer
12
+
13
+
14
class WTTSEmbedder:
    """Embeds WTTS tuples and CRF item queries into dense vectors."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", device: str = "cpu"):
        """
        Args:
            model_name: SentenceTransformer model ID.
                        CPU default: 'all-MiniLM-L6-v2' (384-dim, fast)
                        GPU clinical: 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb'
                        or 'medicalai/ClinicalBERT'
            device: 'cpu' or 'cuda'
        """
        print(f"Loading embedding model: {model_name} on {device}...")
        self.model = SentenceTransformer(model_name, device=device)
        self.device = device
        # Cached so callers (and embed_tuples on empty input) can shape arrays
        # without touching the model again.
        self.embedding_dim = self.model.get_sentence_embedding_dimension()
        print(f" Embedding dimension: {self.embedding_dim}")

    # ------------------------------------------------------------------
    # Parse WTTS string → structured list of tuple dicts
    # ------------------------------------------------------------------
    def parse_wtts_string(self, wtts_string: str) -> List[Dict]:
        """
        Parses the WTTS pipe-delimited string back into structured dicts.

        Input format: [S_0] ("2026-01-01", "event text", 0.5, 1.0) | [S_1] (...)
        Output: [
            {"sid": "S_0", "timestamp": "2026-01-01", "event": "event text", "p_j": 0.5, "weight": 1.0},
            ...
        ]

        Entries that do not match the expected pattern are silently skipped.
        """
        tuples = []
        # Split by pipe separator
        raw_entries = wtts_string.split(" | ")

        # Pattern to extract: [S_xx] ("timestamp", "event", P_j, W)
        # NOTE: assumes neither timestamp nor event text contains a double quote.
        pattern = re.compile(
            r'\[(?P<sid>S_\d+)\]\s*\('
            r'"(?P<timestamp>[^"]*)",\s*'
            r'"(?P<event>[^"]*)",\s*'
            r'(?P<p_j>[\d.]+),\s*'
            r'(?P<weight>[\d.]+)\)'
        )

        for entry in raw_entries:
            entry = entry.strip()
            if not entry:
                continue
            match = pattern.search(entry)
            if match:
                tuples.append({
                    "sid": match.group("sid"),
                    "timestamp": match.group("timestamp"),
                    "event": match.group("event"),
                    "p_j": float(match.group("p_j")),
                    "weight": float(match.group("weight")),
                })

        return tuples

    # ------------------------------------------------------------------
    # Embed tuple event texts
    # ------------------------------------------------------------------
    def embed_tuples(self, tuples: List[Dict]) -> np.ndarray:
        """
        Embed the event text from each WTTS tuple.

        Args:
            tuples: List of parsed tuple dicts (from parse_wtts_string)

        Returns:
            np.ndarray of shape (n_tuples, embedding_dim), float32.
            Empty input yields a (0, embedding_dim) matrix.
        """
        if not tuples:
            # FIX: return a correctly-shaped empty matrix instead of np.array([])
            # (shape (0,)), so consumers reading .shape[1] (e.g. FAISS index
            # construction) do not crash on an empty timeline.
            return np.zeros((0, self.embedding_dim), dtype=np.float32)

        texts = [t["event"] for t in tuples]
        embeddings = self.model.encode(
            texts,
            show_progress_bar=False,
            normalize_embeddings=True,  # L2-normalize for cosine similarity via dot product
            batch_size=64,
        )
        return np.array(embeddings, dtype=np.float32)

    # ------------------------------------------------------------------
    # Embed a CRF item query
    # ------------------------------------------------------------------
    def embed_query(self, crf_item: str, valid_options: Optional[List[str]] = None) -> np.ndarray:
        """
        Create an embedding for a CRF item query.
        Combines the item name with its valid options to create a richer query.

        Args:
            crf_item: e.g., "mrc_grade" or "administration of bronchodilators"
            valid_options: e.g., ["y", "n", "unknown"]

        Returns:
            np.ndarray of shape (embedding_dim,)
        """
        # Build a descriptive query string
        query_parts = [crf_item.replace("_", " ")]

        if valid_options:
            # Add option context to help embedding understand what we're looking for
            opts_str = ", ".join(str(o) for o in valid_options[:10])  # limit to avoid huge queries
            query_parts.append(f"options: {opts_str}")

        query_text = " | ".join(query_parts)

        embedding = self.model.encode(
            [query_text],
            show_progress_bar=False,
            normalize_embeddings=True,
        )
        return np.array(embedding[0], dtype=np.float32)

    # ------------------------------------------------------------------
    # Batch embed multiple CRF queries at once
    # ------------------------------------------------------------------
    def embed_queries_batch(
        self,
        crf_items: List[str],
        valid_options_map: Dict[str, List[str]]
    ) -> Dict[str, np.ndarray]:
        """
        Embed all CRF items in one batch for efficiency.

        Returns:
            Dict mapping crf_item name → embedding vector
        """
        # FIX: guard the degenerate case instead of calling model.encode([]).
        if not crf_items:
            return {}

        query_texts = []
        item_names = []

        for item in crf_items:
            item_names.append(item)
            # Same query construction as embed_query, kept in sync.
            parts = [item.replace("_", " ")]
            opts = valid_options_map.get(item, [])
            if opts:
                opts_str = ", ".join(str(o) for o in opts[:10])
                parts.append(f"options: {opts_str}")
            query_texts.append(" | ".join(parts))

        embeddings = self.model.encode(
            query_texts,
            show_progress_bar=False,
            normalize_embeddings=True,
            batch_size=64,
        )

        return {
            name: np.array(emb, dtype=np.float32)
            for name, emb in zip(item_names, embeddings)
        }
rag/rag_pipeline.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG CRF Extractor — Orchestrates the full RAG-guided pipeline:
3
+ 1. Build WTTS tuples from patient data
4
+ 2. Embed all tuples into FAISS index
5
+ 3. For each CRF item: retrieve relevant tuples → LLM extraction
6
+ 4. Return predictions
7
+
8
+ This replaces the two-pass (Skeleton → Extraction) approach in main.py.
9
+ """
10
+
11
+ import asyncio
12
+ import json
13
+ from typing import Dict, List, Optional, Any
14
+
15
+ from src.rag.embedder import WTTSEmbedder
16
+ from src.rag.retriever import WTTSRetriever
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # RAG-optimized prompt — shorter, focused on retrieved evidence only
21
+ # ---------------------------------------------------------------------------
22
+
23
+ RAG_EXTRACTION_PROMPT = """\
24
+ You are a Clinical Coding Expert.
25
+
26
+ RETRIEVED CLINICAL EVIDENCE (sorted chronologically, most relevant events for these items):
27
+ {retrieved_evidence}
28
+
29
+ TASK:
30
+ For each Clinical Item below, determine the value based ONLY on the evidence above.
31
+ 1. **Value**: Must come strictly from the "Valid Options".
32
+ 2. **Evidence**: Cite the [S_xx] ID that supports your choice.
33
+ 3. If no evidence supports any option, choose "unknown".
34
+
35
+ ITEMS TO EXTRACT & THEIR OPTIONS:
36
+ {chunk_schema_json}
37
+
38
+ OUTPUT FORMAT (JSON Object):
39
+ {{
40
+ "item_name": {{
41
+ "value": "Selected Option",
42
+ "evidence": "S_xx",
43
+ "reasoning": "Brief explanation"
44
+ }},
45
+ ...
46
+ }}
47
+ """
48
+
49
+
50
class RAGCRFExtractor:
    """
    Orchestrates RAG-guided CRF extraction for clinical notes.
    Replaces the two-pass (Skeleton → Extraction) pipeline with
    per-item retrieval for focused, temporally-ordered evidence.
    """

    def __init__(
        self,
        embedder: "WTTSEmbedder",  # string forward ref: avoids eager evaluation at class-creation time
        generate_fn,
        top_k: int = 15,
        weight_boost: float = 0.3,
        items_per_chunk: int = 5,
    ):
        """
        Args:
            embedder: WTTSEmbedder instance (shared across patients)
            generate_fn: Async function to call LLM (the generate_async from main.py)
            top_k: Number of tuples to retrieve per CRF item group
            weight_boost: Re-ranking boost for critical events
            items_per_chunk: How many CRF items to group per LLM call.
                             Grouped items share the same retrieved evidence pool.
        """
        self.embedder = embedder
        self.generate_fn = generate_fn
        self.top_k = top_k
        self.weight_boost = weight_boost
        # FIX: clamp to >= 1 — a zero or negative chunk size would make
        # range(start, stop, step) raise ValueError in _group_crf_items.
        self.items_per_chunk = max(1, items_per_chunk)

    # ------------------------------------------------------------------
    # Group CRF items by semantic similarity for batched retrieval
    # ------------------------------------------------------------------
    def _group_crf_items(
        self,
        target_items: List[str],
        valid_options: Dict[str, List[str]],
    ) -> List[List[str]]:
        """
        Group CRF items into chunks. Items in the same chunk will share
        a combined retrieval query, so similar items get grouped together.

        For now: simple sequential chunking (items_per_chunk at a time);
        valid_options is unused but kept in the signature for a future
        similarity-based grouping strategy.
        """
        chunks = []
        for i in range(0, len(target_items), self.items_per_chunk):
            chunk = target_items[i : i + self.items_per_chunk]
            chunks.append(chunk)
        return chunks

    # ------------------------------------------------------------------
    # Build combined query for a group of CRF items
    # ------------------------------------------------------------------
    def _build_group_query(
        self,
        items: List[str],
        valid_options: Dict[str, List[str]],
    ):
        """
        Create a combined query embedding for a group of CRF items.
        Averages the individual item query embeddings.

        Raises:
            ValueError: if `items` is empty (averaging nothing would
            otherwise silently produce NaNs).
        """
        query_embeddings = self.embedder.embed_queries_batch(items, valid_options)

        # Average the embeddings for a combined query
        import numpy as np
        all_embs = list(query_embeddings.values())
        # FIX: fail loudly instead of letting np.mean over an empty list
        # propagate NaN into the FAISS search.
        if not all_embs:
            raise ValueError("Cannot build a query embedding for an empty item group.")
        combined = np.mean(all_embs, axis=0).astype(np.float32)

        # Re-normalize after averaging (mean of unit vectors is not unit length)
        norm = np.linalg.norm(combined)
        if norm > 0:
            combined = combined / norm

        return combined

    # ------------------------------------------------------------------
    # Main extraction method — full RAG pipeline for one patient
    # ------------------------------------------------------------------
    async def extract_patient(
        self,
        patient_data: Dict,
        builder,  # WTTSBuilder instance
        target_items: List[str],
        valid_options: Dict[str, List[str]],
        semaphore: asyncio.Semaphore,
        model: Any = None,  # Gemini model (passed to generate_fn)
    ) -> Optional[Dict]:
        """
        Full RAG pipeline for a single patient:
        1. Build WTTS string
        2. Parse & embed tuples → FAISS index
        3. For each CRF item group: retrieve → prompt → extract
        4. Return predictions

        Args:
            patient_data: Merged patient dict from DataLoader
            builder: WTTSBuilder instance
            target_items: List of CRF item names to extract
            valid_options: Dict mapping item name → list of valid values
            semaphore: Concurrency limiter for LLM calls
            model: Gemini model instance

        Returns:
            Dict with patient_id, predictions, and debug info,
            or None when the patient yields no usable tuples or errors out.
        """
        # Same id-priority as main.py: document_id first, matching DataLoader.
        pid = str(
            patient_data.get('document_id')
            or patient_data.get('patient_id')
            or patient_data.get('hadm_id')
            or 'unknown'
        )

        try:
            # --- Step 1: Build WTTS tuples ---
            wtts_string = builder.build_wtts_string(patient_data)

            if not wtts_string.strip():
                print(f"  [{pid}] No WTTS tuples generated, skipping.")
                return None

            # --- Step 2: Parse and embed tuples ---
            tuples = self.embedder.parse_wtts_string(wtts_string)

            if not tuples:
                print(f"  [{pid}] Failed to parse WTTS tuples, skipping.")
                return None

            tuple_embeddings = self.embedder.embed_tuples(tuples)

            # --- Step 3: Build FAISS index for this patient ---
            retriever = WTTSRetriever(self.embedder)
            retriever.build_index(tuples, tuple_embeddings)

            print(f"  [{pid}] Indexed {len(tuples)} tuples. "
                  f"Retrieving for {len(target_items)} CRF items...")

            # --- Step 4: Group CRF items and extract ---
            item_groups = self._group_crf_items(target_items, valid_options)
            final_predictions = {}

            for group_items in item_groups:
                # Build combined query for this group
                group_query = self._build_group_query(group_items, valid_options)

                # Retrieve relevant tuples
                retrieved = retriever.retrieve_and_rerank(
                    group_query,
                    top_k=self.top_k,
                    weight_boost=self.weight_boost,
                )

                # Format for LLM
                evidence_str = WTTSRetriever.format_retrieved_tuples(retrieved)

                chunk_schema = {
                    item: valid_options.get(item, ["y", "n", "unknown"])
                    for item in group_items
                }

                prompt = RAG_EXTRACTION_PROMPT.format(
                    retrieved_evidence=evidence_str,
                    chunk_schema_json=json.dumps(chunk_schema, indent=2),
                )

                # Call LLM with concurrency control
                async with semaphore:
                    response = await self.generate_fn(prompt, model)

                # NOTE(review): non-dict responses are silently dropped here;
                # generate_fn is assumed to return a dict (possibly {"error": ...}).
                if isinstance(response, dict):
                    if "error" in response:
                        print(f"  [{pid}] LLM error for items {group_items[:2]}...: "
                              f"{response['error']}")
                    else:
                        final_predictions.update(response)

            # --- Step 5: Return results ---
            return {
                "patient_id": pid,
                "predictions": final_predictions,
                "rag_debug": {
                    "total_tuples": len(tuples),
                    "top_k": self.top_k,
                    "item_groups": len(item_groups),
                },
            }

        except Exception as e:
            # Boundary handler: one bad patient must not kill the batch run.
            print(f"  [{pid}] RAG extraction error: {e}")
            import traceback
            traceback.print_exc()
            return None
rag/retriever.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ WTTS Retriever — Builds per-patient FAISS index and retrieves
3
+ relevant tuples per CRF item with weight-based re-ranking
4
+ and P_j temporal sorting.
5
+ """
6
+
7
+ import numpy as np
8
+ import faiss
9
+ from typing import List, Dict, Optional, Tuple
10
+ from src.rag.embedder import WTTSEmbedder
11
+
12
+
13
class WTTSRetriever:
    """Per-patient FAISS index for retrieving relevant WTTS tuples."""

    def __init__(self, embedder: "WTTSEmbedder"):
        self.embedder = embedder
        # String forward ref: attribute annotations on complex targets are
        # evaluated at runtime, so keep faiss out of the eager path.
        self.index: Optional["faiss.IndexFlatIP"] = None  # Inner product (cosine on normalized vecs)
        self.tuples: List[Dict] = []
        self.embeddings: Optional[np.ndarray] = None

    # ------------------------------------------------------------------
    # Build index for one patient's tuples
    # ------------------------------------------------------------------
    def build_index(self, tuples: List[Dict], embeddings: np.ndarray):
        """
        Build a FAISS index from pre-computed tuple embeddings.

        Args:
            tuples: Parsed WTTS tuple dicts
            embeddings: np.ndarray of shape (n_tuples, embedding_dim)
        """
        self.tuples = tuples

        if len(tuples) == 0:
            self.embeddings = embeddings
            self.index = None
            return

        # FIX: FAISS requires C-contiguous float32 input; coerce defensively
        # in case a caller passes float64 or a non-contiguous slice.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        self.embeddings = embeddings

        dim = embeddings.shape[1]
        # Use Inner Product (IP) since embeddings are L2-normalized
        # This makes IP equivalent to cosine similarity
        self.index = faiss.IndexFlatIP(dim)
        self.index.add(embeddings)

    # ------------------------------------------------------------------
    # Raw retrieval (top-k by cosine similarity)
    # ------------------------------------------------------------------
    def retrieve(self, query_embedding: np.ndarray, top_k: int = 15) -> List[Dict]:
        """
        Retrieve top-k most similar tuples to the query.

        Args:
            query_embedding: 1D vector of shape (embedding_dim,)
            top_k: Number of tuples to retrieve

        Returns:
            List of tuple dicts with added 'similarity_score' field
        """
        if self.index is None or len(self.tuples) == 0:
            return []

        # Clamp top_k to available tuples
        top_k = min(top_k, len(self.tuples))

        # FAISS expects 2D input
        query = query_embedding.reshape(1, -1).astype(np.float32)
        scores, indices = self.index.search(query, top_k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx == -1:  # FAISS returns -1 for missing results
                continue
            result = self.tuples[idx].copy()
            result["similarity_score"] = float(score)
            results.append(result)

        return results

    # ------------------------------------------------------------------
    # Retrieve + re-rank by weight + sort by P_j
    # ------------------------------------------------------------------
    def retrieve_and_rerank(
        self,
        query_embedding: np.ndarray,
        top_k: int = 15,
        weight_boost: float = 0.3,
        fetch_multiplier: int = 3,
    ) -> List[Dict]:
        """
        Retrieve, re-rank using weight W, then sort by P_j for temporal order.

        Strategy:
        1. Over-fetch (top_k * fetch_multiplier) candidates from FAISS
        2. Re-score: final_score = similarity + weight_boost * W
        3. Take top_k by final_score
        4. Sort the final set by P_j (ascending) to preserve temporal order

        Args:
            query_embedding: 1D vector
            top_k: Final number of tuples to return
            weight_boost: How much to boost critical events (W=1.0 gets +0.3)
            fetch_multiplier: How many extra candidates to fetch for re-ranking

        Returns:
            List of tuple dicts sorted by P_j (temporal order),
            each with 'similarity_score', 'rerank_score' fields
        """
        if self.index is None or len(self.tuples) == 0:
            return []

        # Step 1: Over-fetch candidates
        fetch_k = min(top_k * fetch_multiplier, len(self.tuples))
        candidates = self.retrieve(query_embedding, top_k=fetch_k)

        # Step 2: Re-rank with weight boost
        for candidate in candidates:
            sim = candidate["similarity_score"]
            w = candidate.get("weight", 0.5)
            candidate["rerank_score"] = sim + (weight_boost * w)

        # Step 3: Take top_k by re-rank score
        candidates.sort(key=lambda x: x["rerank_score"], reverse=True)
        top_candidates = candidates[:top_k]

        # Step 4: Sort by P_j (temporal order) — THIS is what preserves continuity
        top_candidates.sort(key=lambda x: x.get("p_j", 0.5))

        return top_candidates

    # ------------------------------------------------------------------
    # Format retrieved tuples back into a readable string for the LLM
    # ------------------------------------------------------------------
    @staticmethod
    def format_retrieved_tuples(tuples: List[Dict]) -> str:
        """
        Format retrieved tuples into a clean string for the LLM prompt.
        Sorted by P_j (temporal order) by this point.

        Output format (FIX: example now matches the actual labels emitted):
            [S_14] (EARLY | CRITICAL) [2026-01-01] "Patient reports increasing dyspnea"
            [S_80] (MID | CRITICAL) [2026-01-03] "SpO2 dropped to 85%, intubated"
            [S_155] (LATE | MODERATE) [2026-01-09] "Stable on room air at discharge"
        """
        if not tuples:
            return "(No relevant clinical events found)"

        lines = []
        for t in tuples:
            p_j = t.get("p_j", 0.5)
            w = t.get("weight", 0.5)

            # Temporal phase label (buckets over normalized admission→discharge time)
            if p_j <= 0.15:
                phase = "ADMISSION"
            elif p_j <= 0.35:
                phase = "EARLY"
            elif p_j <= 0.65:
                phase = "MID"
            elif p_j <= 0.85:
                phase = "LATE"
            else:
                phase = "DISCHARGE"

            # Weight label
            if w >= 0.8:
                w_label = "CRITICAL"
            elif w >= 0.4:
                w_label = "MODERATE"
            else:
                w_label = "ROUTINE"

            sid = t.get("sid", "S_?")
            event = t.get("event", "")
            ts = t.get("timestamp", "")

            lines.append(f'[{sid}] ({phase} | {w_label}) [{ts}] "{event}"')

        return "\n".join(lines)