Upload 6 files
Browse files- src/data_processing.py +201 -0
- src/evaluation_utils.py +67 -0
- src/evaluator.py +113 -0
- src/retriever.py +172 -0
- src/simulator.py +247 -0
- src/synthetic_generator.py +125 -0
src/data_processing.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from docx import Document
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
+
from datasets import Dataset
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
|
| 11 |
+
def read_docx(file_path):
    """Return the full text of a .docx file, one line per paragraph.

    Any failure (missing file, corrupt document, etc.) is reported to stdout
    and an empty string is returned instead of raising.
    """
    try:
        doc = Document(file_path)
        paragraph_texts = [para.text for para in doc.paragraphs]
        return '\n'.join(paragraph_texts)
    except Exception as e:
        # Degrade gracefully: callers treat "" as "no content".
        print(f"Error reading {file_path}: {e}")
        return ""
|
| 19 |
+
|
| 20 |
+
def extract_qa_pairs(text):
    """Extract alternating **Examiner:**/**Examinee:** turns from *text*.

    Returns a list of {"question": ..., "answer": ...} dicts, both sides
    whitespace-stripped, in document order.
    """
    qa_regex = re.compile(
        r"\*\*Examiner:\*\*(.*?)\n\n\*\*Examinee:\*\*(.*?)(?=\n\n\*\*Examiner:\*\*|$)",
        re.DOTALL,
    )
    pairs = []
    for question, answer in qa_regex.findall(text):
        pairs.append({"question": question.strip(), "answer": answer.strip()})
    return pairs
|
| 24 |
+
|
| 25 |
+
def parse_filename(filename):
    """Parse the case ID and clinical topic out of a BTK-style filename.

    Example: "BTK_-_77A___Burn.docx" -> ("77A", "Burn").

    Args:
        filename: Bare filename (extension is stripped before matching).

    Returns:
        (case_id, topic) tuple; ("Unknown", "Unknown") when the name does
        not match the expected BTK pattern.
    """
    base = os.path.splitext(filename)[0]
    match = re.match(r"BTK_-_(\d+[A-Z]?)___(.+)", base)
    if match:
        case_id = match.group(1)
        topic = match.group(2).replace("_", " ").strip()
    else:
        # Fix: include the offending filename in the warning (previously the
        # message printed a literal placeholder, making the file untraceable).
        print(f"Warning: Could not parse filename format: {filename}")
        case_id, topic = "Unknown", "Unknown"
    return case_id, topic
|
| 38 |
+
|
| 39 |
+
def process_all_cases(folder_path):
    """Read every .docx case file in *folder_path* into a long-format DataFrame.

    Each output row is one Q&A turn with columns:
    case_id, clinical_presentation, turn_id, question, answer.

    Args:
        folder_path: Directory containing the BTK .docx transcripts.

    Returns:
        pandas.DataFrame; empty when the folder is missing or yields no rows.
    """
    rows = []
    if not os.path.isdir(folder_path):
        print(f"Error: Folder not found at {folder_path}")
        return pd.DataFrame(rows)

    print(f"Processing case files from: {folder_path}")
    # sorted() makes row order deterministic (os.listdir order is arbitrary).
    for filename in sorted(os.listdir(folder_path)):
        if filename.lower().endswith('.docx') and not filename.startswith('~'):  # Skip Word lock/temp files
            file_path = os.path.join(folder_path, filename)
            text = read_docx(file_path)
            if text:
                qa_pairs = extract_qa_pairs(text)
                case_id, presentation = parse_filename(filename)
                if not qa_pairs:
                    # Fix: name the file (previously printed a literal placeholder).
                    print(f"Warning: No Q&A pairs extracted from {filename}")
                for i, pair in enumerate(qa_pairs):
                    rows.append({
                        "case_id": case_id,
                        "clinical_presentation": presentation,
                        "turn_id": i + 1,  # 1-based exam turn number
                        "question": pair["question"],
                        "answer": pair["answer"]
                    })
            else:
                # Fix: name the file (previously printed a literal placeholder).
                print(f"Warning: Empty content for file {filename}")

    if not rows:
        print("Warning: No data rows were generated. Check input files and formats.")

    return pd.DataFrame(rows)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# --- ClinicalCaseProcessor Class ---
|
| 74 |
+
class ClinicalCaseProcessor:
    """Handles preprocessing of clinical cases for the RAG system using sentence-transformers.

    Turns a long-format Q&A DataFrame (one row per exam turn) into a Hugging Face
    dataset with one row per case plus an embedding of a short case summary,
    saved to disk for later retrieval by ClinicalCaseRetriever.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Load the sentence-transformers encoder once and move it to GPU when available.
        print(f"Initializing ClinicalCaseProcessor with model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        self.model.to(self.device)

    def preprocess_data(self, input_data, output_path="./processed_clinical_cases", batch_size=16):
        """
        Convert raw case data (DataFrame or path to CSV) into a vectorized Hugging Face dataset.

        Args:
            input_data: DataFrame or path to CSV file with clinical cases.
            output_path: Where to save the processed Hugging Face dataset.
            batch_size: Batch size for embedding generation.

        Returns:
            datasets.Dataset: The processed dataset with embeddings, or None on any failure
            (invalid input, missing columns, embedding/conversion/save errors).
        """
        # --- Load data: accept either an in-memory DataFrame or a CSV path ---
        if isinstance(input_data, pd.DataFrame):
            df = input_data
            print("Using provided DataFrame.")
        elif isinstance(input_data, str) and os.path.exists(input_data):
            try:
                df = pd.read_csv(input_data)
                print(f"Data loaded from CSV: {input_data}")
            except Exception as e:
                print(f"Error loading CSV {input_data}: {e}")
                return None
        else:
            print(f"Error: Invalid input_data type or path does not exist: {input_data}")
            return None

        if df.empty:
            print("Error: Input DataFrame is empty. Cannot process.")
            return None

        print(f"Raw data shape: {df.shape}")

        # Validate necessary columns before any processing.
        required_cols = ['case_id', 'clinical_presentation', 'turn_id', 'question', 'answer']
        if not all(col in df.columns for col in required_cols):
            print(f"Error: DataFrame missing required columns. Found: {df.columns}. Required: {required_cols}")
            return None

        # Group by case_id to get all Q&A pairs for each case.
        # dropna=False keeps rows whose keys are NaN; they are mapped to "Unknown" below.
        grouped = df.groupby(['case_id', 'clinical_presentation'], dropna=False)

        # Collapse the long format into one record per case.
        case_data = []
        print("Grouping data by case...")
        for (case_id, presentation), group in tqdm(grouped, desc="Processing Cases"):
            # Sort by turn_id so questions/answers stay in exam order.
            group = group.sort_values('turn_id')

            # Extract questions and answers in order.
            questions = group['question'].tolist()
            answers = group['answer'].tolist()

            # Handle potential NaN/None in presentation if groupby didn't drop them.
            presentation_str = str(presentation) if pd.notna(presentation) else "Unknown Presentation"

            case_data.append({
                'case_id': str(case_id) if pd.notna(case_id) else "Unknown ID",
                'clinical_presentation': presentation_str,
                'questions': questions,
                'answers': answers
            })

        if not case_data:
            print("Error: No cases could be processed after grouping. Check input data integrity.")
            return None

        processed_df = pd.DataFrame(case_data)
        print(f"Processed data into {len(processed_df)} unique cases.")

        # Create a searchable summary of each case (handle empty question lists).
        # Only the presentation plus the first question is embedded — not the
        # full transcript — so retrieval matches on case topic, not content.
        processed_df['case_summary'] = processed_df.apply(
            lambda x: f"Clinical case: {x['clinical_presentation']}. First question: {x['questions'][0] if x['questions'] else 'No questions available'}",
            axis=1
        )

        # --- Generate embeddings using sentence-transformers, in batches ---
        texts_to_embed = processed_df['case_summary'].tolist()
        all_embeddings = []

        print(f"Generating embeddings for {len(texts_to_embed)} case summaries...")
        try:
            for i in tqdm(range(0, len(texts_to_embed), batch_size), desc="Embedding Batches"):
                batch_texts = texts_to_embed[i:i+batch_size]
                # Generate embeddings for the batch.
                batch_embeddings = self.model.encode(batch_texts, convert_to_numpy=True, device=self.device, show_progress_bar=False)
                all_embeddings.append(batch_embeddings)

            # Combine all batch embeddings into one (n_cases, dim) matrix.
            if not all_embeddings:
                print("Error: No embeddings were generated.")
                return None
            final_embeddings = np.vstack(all_embeddings)
            print(f"Generated embeddings with shape: {final_embeddings.shape}")

        except Exception as e:
            print(f"Error during embedding generation: {e}")
            return None

        # --- Convert to HF Dataset and attach embeddings ---
        try:
            dataset = Dataset.from_pandas(processed_df)
            # Ensure embeddings column is compatible (list of lists).
            dataset = dataset.add_column('embeddings', final_embeddings.tolist())
        except Exception as e:
            print(f"Error converting to Hugging Face Dataset or adding embeddings: {e}")
            return None

        # --- Persist to disk for later loading by the retriever ---
        try:
            os.makedirs(output_path, exist_ok=True)  # Ensure directory exists
            dataset.save_to_disk(output_path)
            print(f"Processed dataset saved successfully to {output_path}")
        except Exception as e:
            print(f"Error saving dataset to disk at {output_path}: {e}")
            return None  # Return None if saving failed

        return dataset
|
src/evaluation_utils.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from sklearn.metrics import ndcg_score
|
| 3 |
+
from src.retriever import ClinicalCaseRetriever, DummyRetriever
|
| 4 |
+
|
| 5 |
+
def retrieval_metrics(retriever_instance: ClinicalCaseRetriever, queries: list[str], gold_ids: list[str], k: int = 5) -> dict | None:
    """
    Score a retriever against a labelled query set.

    Args:
        retriever_instance: An initialized ClinicalCaseRetriever instance.
        queries: A list of query strings.
        gold_ids: The expected 'case_id' string for each query (parallel list).
        k: How many top results to consider for Hit@k and NDCG@k.

    Returns:
        A dictionary with averaged Hit@k, MRR, and NDCG@k scores.
    """
    hits = []
    reciprocal_ranks = []
    ndcgs = []
    print(f"\nCalculating retrieval metrics for {len(queries)} queries (k={k})...")

    for q_idx, (q, gold) in enumerate(zip(queries, gold_ids)):
        print(f"\nProcessing query {q_idx+1}/{len(queries)}: '{q}' (Expected ID: '{gold}')")
        retrieved_cases, scores = retriever_instance.retrieve_relevant_case(q, top_k=k, return_scores=True)

        # Missing 'case_id' keys degrade to 'N/A' rather than raising.
        retrieved_ids = [c.get('case_id', 'N/A') for c in retrieved_cases]
        print(f"Retrieved IDs: {retrieved_ids}")
        print(f"Retrieved Scores: {[round(s, 4) for s in scores]}")

        is_hit = int(gold in retrieved_ids)
        hits.append(is_hit)

        # Reciprocal rank: 1/position of the gold id, 0 when it was not retrieved.
        rank = retrieved_ids.index(gold) + 1 if is_hit else 0
        reciprocal_ranks.append(1.0 / rank if rank else 0.0)

        # Binary relevance vector aligned with the retrieved ranking.
        true_relevance = np.asarray([[1.0 if gid == gold else 0.0 for gid in retrieved_ids]])
        predicted_scores = np.asarray([scores])

        current_ndcg = 0.0
        if true_relevance.shape[1] > 0:
            # Clamp k so it never exceeds the number of retrieved items.
            ndcg_k = min(k, true_relevance.shape[1])
            current_ndcg = ndcg_score(true_relevance, predicted_scores, k=ndcg_k)
        ndcgs.append(current_ndcg)

        print(f"Hit: {is_hit}, Rank: {rank if rank > 0 else 'N/A'}, NDCG@{k}: {current_ndcg:.4f}")

    # Average each metric over all queries (0.0 for an empty query set).
    avg_hit = np.mean(hits) if hits else 0.0
    avg_mrr = np.mean(reciprocal_ranks) if reciprocal_ranks else 0.0
    avg_ndcg = np.mean(ndcgs) if ndcgs else 0.0

    print(f"\n--- Overall Retrieval Results (k={k}) --- ")
    print(f"Average Hit@{k}: {avg_hit:.4f}")
    print(f"Average MRR: {avg_mrr:.4f}")
    print(f"Average NDCG@{k}: {avg_ndcg:.4f}")

    return {f"Hit@{k}": avg_hit,
            f"MRR": avg_mrr,
            f"NDCG@{k}": avg_ndcg}
|
src/evaluator.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 3 |
+
|
| 4 |
+
class AnswerEvaluator:
    """Evaluates user answers against expected answers using an LLM.

    Loads a Hugging Face causal LM once at construction and uses it to grade
    a resident's free-text answer against the dataset's model answer.
    """

    def __init__(self, model_id="meta-llama/Llama-3.2-3B-Instruct"):
        # Load tokenizer + model once; device_map="auto" lets HF place/shard the
        # model (GPU when available). Any load failure is re-raised so callers fail fast.
        print(f"Initializing AnswerEvaluator with model: {model_id}")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_id)
            if self.tokenizer.pad_token is None:
                # Llama-family tokenizers ship without a pad token; reuse EOS for padding.
                self.tokenizer.pad_token = self.tokenizer.eos_token
                print("Set pad_token to eos_token")

            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float16,  # half precision: smaller footprint, inference-only
                device_map="auto"
            )
            self.model.eval()
            self.device = self.model.device
            print(f"AnswerEvaluator model loaded successfully on device: {self.device}")

        except Exception as e:
            print(f"Error initializing AnswerEvaluator model {model_id}: {e}")
            raise


    def evaluate_answer(self, user_answer, expected_answer, clinical_context=None):
        """
        Compare user answer to expected answer and provide feedback.

        Args:
            user_answer: Examinee's response
            expected_answer: Model answer from the dataset
            clinical_context: Optional clinical context to consider

        Returns:
            Feedback string beginning with "ASSESSMENT: ..." on success, or an
            error message string when generation fails.
        """
        # Optional context is prepended to the rubric section of the prompt.
        context_str = f"Clinical context: {clinical_context}\n\n" if clinical_context else ""

        # NOTE(review): the <s>[INST] ... [/INST]</s> markers below are the
        # Mistral/Llama-2 instruction format, but the default model is
        # Llama-3.2-Instruct, which uses a different chat template — confirm
        # the model handles this prompt style (tokenizer.apply_chat_template
        # would be the template-safe alternative).
        prompt = f"""<s>[INST] You are acting as an expert examiner for the American Board of Surgery (ABS) oral board exam. You are evaluating a general surgery resident’s answer to a clinical question. \n
Compare the answer provided by the residents to the correct expected answer, which I will provide you with. \n
Use the grading rubric below to assess their response:

[RUBRIC]
- Correct: Resident includes all major points and clinical reasoning aligns closely with the expected answer.
- Partially Correct: Resident includes some key points but omits others, or reasoning is partially flawed.
- Incorrect: Resident misses most key points or demonstrates incorrect reasoning.

{context_str}Here is the model answer that contains the key points expected from the resident:
{expected_answer}

Now, here is the resident’s actual response:
{user_answer}

Evaluate the resident’s response based **only** on the expected answer above. Do not rely on external knowledge or previous responses.

Focus your evaluation on:
1. Which key points were mentioned vs. missed
2. The accuracy and clarity of the clinical reasoning
3. Any major omissions or misunderstandings

Start your output with:
ASSESSMENT: [Correct / Partially Correct / Incorrect]
Then write 1–2 clear, specific sentences explaining how the resident’s response compares to the expected answer.

[EXAMPLE 1]
Expected answer:
"The differential diagnosis includes acute appendicitis, mesenteric adenitis, gastroenteritis, UTI, and testicular torsion."

Resident’s response:
"My top concern is appendicitis, but I’d also consider things like gastroenteritis or maybe even kidney stones."

ASSESSMENT: Partially Correct
The resident mentioned appendicitis and gastroenteritis but missed several other expected differentials like UTI, testicular torsion, and mesenteric adenitis.

[EXAMPLE 2]
Expected answer:
"Initial labs should include CBC, CMP, lipase, and abdominal ultrasound to assess for gallstones."

Resident’s response:
"I’d start with a full workup including CBC, liver enzymes, lipase, and an abdominal ultrasound."

ASSESSMENT: Correct
The resident included all key labs and the correct imaging modality. Their reasoning aligns well with the expected answer.

[/INST]</s>"""

        try:
            # NOTE(review): max_length=1024 truncates the *prompt*, which is
            # already long before the answers are inserted — a long resident
            # answer may be silently cut off. Verify the limit is sufficient.
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(self.device)  # Added truncation

            with torch.no_grad():
                # Generate feedback using the model (low temperature for near-deterministic grading).
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=150,
                    do_sample=True,
                    temperature=0.2,
                    pad_token_id=self.tokenizer.eos_token_id  # Ensure pad token ID is set
                )

            # Strip the echoed prompt: keep only tokens generated after the input.
            prompt_length_tokens = inputs.input_ids.shape[1]
            generated_ids = outputs[0][prompt_length_tokens:]

            feedback = self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            return feedback

        except Exception as e:
            print(f"Error during LLM evaluation: {e}")
            return "Error: Could not generate feedback."
|
src/retriever.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
from sentence_transformers import SentenceTransformer
|
| 4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
+
from datasets import load_from_disk, Dataset
|
| 6 |
+
import os
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
class ClinicalCaseRetriever:
    """Retrieves relevant clinical cases based on user input using sentence-transformers embeddings.

    Loads the dataset produced by ClinicalCaseProcessor (which must contain an
    'embeddings' column) and ranks cases by cosine similarity to a query.
    """

    def __init__(self, dataset_path='./processed_clinical_cases', model_name="all-MiniLM-L6-v2"):
        # Accept either a ready-made Dataset object or a save_to_disk() directory.
        print(f"Initializing ClinicalCaseRetriever with model: {model_name}")
        if isinstance(dataset_path, Dataset):
            self.dataset = dataset_path
            print("Using provided Hugging Face Dataset object.")
        elif isinstance(dataset_path, str) and os.path.isdir(dataset_path):
            try:
                self.dataset = load_from_disk(dataset_path)
                print(f"Dataset loaded successfully from disk: {dataset_path}")
            except Exception as e:
                print(f"Error loading dataset from disk {dataset_path}: {e}")
                raise ValueError("Failed to load dataset.") from e
        else:
            raise ValueError(f"Invalid dataset_path: Must be a Dataset object or a valid directory path. Got: {dataset_path}")

        if 'embeddings' not in self.dataset.column_names:
            raise ValueError("Dataset must contain an 'embeddings' column.")

        print(f"Dataset features: {self.dataset.features}")
        print(f"Number of cases in dataset: {len(self.dataset)}")


        # Query encoder: must be the same model the dataset was embedded with,
        # otherwise query and case vectors live in different spaces.
        self.model = SentenceTransformer(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        self.model.to(self.device)

        try:
            # Ensure embeddings are loaded as a NumPy array (n_cases, dim).
            self.case_embeddings = np.array(self.dataset['embeddings'])
            if self.case_embeddings.ndim != 2:
                raise ValueError(f"Embeddings array must be 2-dimensional. Got shape: {self.case_embeddings.shape}")
            print(f"Loaded {len(self.dataset)} cases with embeddings of shape {self.case_embeddings.shape}")
        except Exception as e:
            print(f"Error processing embeddings from dataset: {e}")
            raise ValueError("Failed to load or process embeddings.") from e


    def get_available_cases(self, n=5):
        """Return a random sample of up to *n* (index, clinical_presentation) pairs for user selection."""
        num_cases = len(self.dataset)
        if num_cases == 0:
            return []
        sample_size = min(n, num_cases)
        indices = np.random.choice(num_cases, sample_size, replace=False)
        # Ensure indices are int for slicing dataset (np ints are not accepted everywhere).
        return [(int(i), self.dataset[int(i)]['clinical_presentation']) for i in indices]

    def encode_query(self, query):
        """Generate an embedding (1, dim) for a query string, or None on failure."""
        # Prefix the raw query so it resembles the stored case summaries.
        search_query = f"Clinical case about {query}"
        print(f"Encoding query: '{search_query}'")

        # Generate embedding using sentence-transformers.
        try:
            query_embedding = self.model.encode([search_query], convert_to_numpy=True, device=self.device, show_progress_bar=False)
            return query_embedding
        except Exception as e:
            print(f"Error encoding query '{query}': {e}")
            return None  # Or raise an error

    def retrieve_relevant_case(self, query, top_k=1, return_scores=False):
        """Find the most relevant clinical case(s) given a query.

        Args:
            query: Free-text topic description.
            top_k: Number of cases to return (clamped to the dataset size).
            return_scores: Selects the return shape — note the asymmetry:
                True  -> (cases, scores) as two parallel lists;
                False -> a single list of (case_dict, score) tuples.

        Returns:
            See return_scores above; empty list(s) on any failure.
        """
        if not isinstance(top_k, int) or top_k < 1:
            print("Warning: top_k must be a positive integer. Defaulting to 1.")
            top_k = 1

        # Get query embedding; bail out with the shape-appropriate empty result on failure.
        query_embedding = self.encode_query(query)
        if query_embedding is None:
            return [] if not return_scores else ([], [])

        # Calculate similarity scores between the query and every case.
        try:
            similarities = cosine_similarity(query_embedding, self.case_embeddings)[0]  # Get the single row of similarities
        except Exception as e:
            print(f"Error calculating cosine similarity: {e}")
            return [] if not return_scores else ([], [])

        # Get indices of top-k most similar cases.
        # Ensure we don't request more indices than available cases.
        k_actual = min(top_k, len(similarities))
        if k_actual == 0:  # Should not happen if dataset loaded, but safe check
            return [] if not return_scores else ([], [])

        # Use partitioning for efficiency if k is much smaller than N, or argsort otherwise.
        # Using argsort is generally simpler and fine for moderate N.
        top_indices = np.argsort(similarities)[-k_actual:][::-1].astype(int)  # Get top k indices, sorted descending

        top_scores = similarities[top_indices].tolist()  # Get scores for these indices

        # Return the most relevant case(s).
        try:
            # Retrieve cases safely using integer indices.
            retrieved_cases = [self.dataset[int(idx)] for idx in top_indices]
        except IndexError as e:
            print(f"Error retrieving cases using indices {top_indices}: {e}")
            return [] if not return_scores else ([], [])
        except Exception as e:
            print(f"Unexpected error retrieving cases: {e}")
            return [] if not return_scores else ([], [])


        results_with_scores = list(zip(retrieved_cases, top_scores))
        print(f"Retrieved {len(results_with_scores)} cases with similarity scores:")
        for case, score in results_with_scores:
            # Safely access presentation, provide default if missing.
            presentation = case.get('clinical_presentation', 'Unknown Presentation')
            print(f"- {presentation}: {score:.4f}")

        if return_scores:
            return retrieved_cases, top_scores
        else:
            # Return list of tuples (case_dict, score).
            return results_with_scores
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class DummyRetriever:
    """A simple retriever that bypasses RAG, taking a pre-formatted DataFrame.

    Mirrors ClinicalCaseRetriever's retrieve_relevant_case() interface so the two
    are interchangeable (e.g. for retrieval_metrics in evaluation_utils, which
    calls retrieve_relevant_case(..., return_scores=True)).
    """

    def __init__(self, df):
        """Build one case dict per unique clinical_presentation in *df*.

        Args:
            df: DataFrame with columns 'clinical_presentation', 'turn_id',
                'question', 'answer' (and optionally 'case_id'). Invalid or
                empty input yields an empty dataset with a warning.
        """
        self.dataset = []
        if not isinstance(df, pd.DataFrame) or df.empty:
            print("Warning: DummyRetriever initialized with empty or invalid DataFrame.")
            return

        # Expects df to be pre-processed with columns:
        # 'clinical_presentation', 'turn_id', 'question', 'answer'
        required_cols = ['clinical_presentation', 'turn_id', 'question', 'answer']
        if not all(col in df.columns for col in required_cols):
            print(f"Warning: DummyRetriever DataFrame missing required columns. Need: {required_cols}")
            return

        grouped = df.groupby('clinical_presentation')
        print(f"DummyRetriever processing {len(grouped)} unique presentations.")
        for i, (scenario, group) in enumerate(grouped):
            group_sorted = group.sort_values('turn_id')  # keep Q&A turns in exam order

            case_dict = {
                "case_id": group_sorted['case_id'].iloc[0] if 'case_id' in group_sorted.columns else f"dummy_{i}",
                "clinical_presentation": scenario,
                "questions": group_sorted["question"].tolist(),
                "answers": group_sorted["answer"].tolist()
            }
            self.dataset.append(case_dict)
        print(f"DummyRetriever initialized with {len(self.dataset)} cases.")

    def retrieve_relevant_case(self, scenario_query, top_k=1, return_scores=False):
        """
        Finds the case matching the query string exactly.

        Ignores 'top_k' but mimics ClinicalCaseRetriever's return structure.

        Fix: accepts the return_scores keyword (default False preserves the old
        behavior); previously callers passing return_scores=True — as
        retrieval_metrics does — raised TypeError.

        Args:
            scenario_query: Exact 'clinical_presentation' string to match.
            top_k: Accepted for interface compatibility; unused.
            return_scores: True -> (cases, scores) parallel lists;
                           False -> list of (case_dict, score) tuples.

        Returns:
            [(case_dict, 1.0)] / ([case_dict], [1.0]) on a match;
            [] / ([], []) when nothing matches.
        """
        print(f"DummyRetriever searching for exact match: '{scenario_query}'")
        for case_dict in self.dataset:
            if case_dict["clinical_presentation"] == scenario_query:
                print(f"DummyRetriever found match: {case_dict['clinical_presentation']}")
                if return_scores:
                    return [case_dict], [1.0]
                return [(case_dict, 1.0)]

        print(f"DummyRetriever: No exact match found for '{scenario_query}'")
        if return_scores:
            return [], []
        return []
|
src/simulator.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pandas as pd
|
| 3 |
+
# Assuming retriever and evaluator classes are in these files:
|
| 4 |
+
from .retriever import ClinicalCaseRetriever, DummyRetriever
|
| 5 |
+
from .evaluator import AnswerEvaluator
|
| 6 |
+
|
| 7 |
+
class OralExamSimulator:
    """Main class that coordinates the oral board exam simulation.

    Holds the active case, tracks which question comes next, and records a
    chronological session history of system events, examiner questions,
    resident answers, and evaluator feedback.
    """

    def __init__(self, retriever, evaluator):
        """
        Args:
            retriever: ClinicalCaseRetriever or DummyRetriever supplying cases.
            evaluator: AnswerEvaluator used to grade resident answers.

        Raises:
            TypeError: If either collaborator is of the wrong type.
        """
        if not isinstance(retriever, (ClinicalCaseRetriever, DummyRetriever)):
            raise TypeError("Retriever must be an instance of ClinicalCaseRetriever or DummyRetriever")
        if not isinstance(evaluator, AnswerEvaluator):
            raise TypeError("Evaluator must be an instance of AnswerEvaluator")

        self.retriever = retriever
        self.evaluator = evaluator
        self.current_case = None        # dict with parallel 'questions'/'answers' lists once a case starts
        self.current_question_idx = 0   # index of the next question to ask
        self.session_history = []       # chronological list of {"role": ..., "content": ...} entries

    def _select_case(self, clinical_query, case_idx):
        """Resolve a case either by direct index or by RAG-based retrieval.

        Returns:
            tuple: (case_dict, similarity_score) on success, or a dict with an
            "error" key describing the failure.
        """
        if case_idx is not None:
            try:
                # Direct case selection by index; validate bounds first.
                if 0 <= int(case_idx) < len(self.retriever.dataset):
                    case = self.retriever.dataset[int(case_idx)]
                    print(f"Selected case by index {case_idx}: {case.get('clinical_presentation', 'Unknown Presentation')}")
                    # Direct selection implies perfect 'match'.
                    return case, 1.0
                else:
                    print(f"Error: Invalid case index {case_idx}. Must be between 0 and {len(self.retriever.dataset)-1}.")
                    return {"error": f"Invalid case index: {case_idx}"}
            except Exception as e:
                print(f"Error selecting case by index {case_idx}: {e}")
                return {"error": f"Failed to select case by index: {e}"}

        if clinical_query:
            try:
                # retrieve_relevant_case returns a list of (case_dict, score) tuples.
                retrieved_results = self.retriever.retrieve_relevant_case(clinical_query, top_k=1)
                if retrieved_results:
                    case, score = retrieved_results[0]
                    print(f"Retrieved case via query ('{clinical_query}') with score {score:.4f}: {case.get('clinical_presentation', 'Unknown Presentation')}")
                    return case, score
                print(f"Error: No case found for query: '{clinical_query}'")
                return {"error": f"No relevant case found for query: {clinical_query}"}
            except Exception as e:
                print(f"Error retrieving case for query '{clinical_query}': {e}")
                return {"error": f"Failed to retrieve case by query: {e}"}

        # No selection method provided.
        print("Error: Must provide either a clinical query or a case index.")
        return {"error": "Please provide either a clinical query or case index."}

    def start_new_case(self, clinical_query=None, case_idx=None):
        """
        Initialize a new exam case based on query or direct selection.

        Args:
            clinical_query (str, optional): Text description of the desired case topic.
            case_idx (int, optional): Direct index of the case to use from the retriever's dataset.

        Returns:
            dict: Contains case info and first question, or an error message.
        """
        print("-" * 50)
        print(f"Attempting to start new case | Query: '{clinical_query}' | Index: {case_idx}")

        # Reset state for the new case.
        self.current_case = None
        self.current_question_idx = 0
        self.session_history = []

        selection = self._select_case(clinical_query, case_idx)
        if isinstance(selection, dict):
            # Selection failed; propagate the error dict unchanged.
            return selection
        self.current_case, similarity_score = selection

        # Validate case structure: parallel, non-empty question/answer lists.
        questions = self.current_case.get('questions')
        answers = self.current_case.get('answers')
        if not isinstance(questions, list) or not isinstance(answers, list) \
                or len(questions) != len(answers):
            print(f"Error: Invalid case structure for case ID {self.current_case.get('case_id', 'N/A')}. Mismatched or missing Q/A lists.")
            return {"error": "Selected case has invalid format."}

        if not questions:
            print(f"Warning: Selected case ID {self.current_case.get('case_id', 'N/A')} has no questions.")
            return {"error": "Selected case contains no questions."}

        # Start a new session record.
        self.session_history.append({
            "role": "system",
            "content": f"Clinical scenario started: {self.current_case.get('clinical_presentation', 'Unknown Presentation')} (Case ID: {self.current_case.get('case_id', 'N/A')})"
        })

        # Ask the first question and log it.
        first_question = questions[0]
        self.session_history.append({
            "role": "examiner",
            "content": first_question
        })

        print(f"Case successfully started. Total questions: {len(questions)}")
        print("-" * 50)

        return {
            "case_id": self.current_case.get('case_id', 'unknown'),
            "clinical_presentation": self.current_case.get('clinical_presentation', 'Unknown'),
            "similarity_score": similarity_score,  # 1.0 for direct selection, retrieval score otherwise
            "current_question": first_question,
            "question_number": 1,
            "total_questions": len(questions)
        }

    def process_user_response(self, response):
        """
        Process the user's answer, get feedback, and return the next question or completion status.

        Args:
            response (str): User's answer text.

        Returns:
            dict: Contains feedback, expected answer, completion status, total question
            count, and next question (or session summary) — or an error message.
        """
        if self.current_case is None:
            print("Error: No active case.")
            return {"error": "No active case. Please start a new case first."}

        questions = self.current_case['questions']
        if self.current_question_idx >= len(questions):
            print("Error: Attempting to process response when case is already complete.")
            return {"error": "Case already completed."}

        print("-" * 50)
        current_q_num = self.current_question_idx + 1
        total_q = len(questions)
        print(f"Processing response for Question {current_q_num}/{total_q}")
        print(f"User Response: {response}")

        # Save the user's response to history.
        self.session_history.append({
            "role": "resident",
            "content": response
        })

        # Expected answer for the question just answered.
        expected_answer = self.current_case['answers'][self.current_question_idx]
        print(f"Expected Answer: {expected_answer}")

        # Evaluate the answer against the model answer.
        feedback = self.evaluator.evaluate_answer(
            response,
            expected_answer,
            clinical_context=f"Regarding the case '{self.current_case.get('clinical_presentation', 'N/A')}'"
        )
        print(f"Generated Feedback: {feedback}")

        self.session_history.append({
            "role": "feedback",
            "content": feedback
        })

        # Advance to the next question and check for completion.
        self.current_question_idx += 1
        is_complete = self.current_question_idx >= total_q

        result = {
            "feedback": feedback,
            "expected_answer": expected_answer,
            "is_complete": is_complete,
            "question_number": self.current_question_idx,
            # FIX: previously only present on incomplete turns, which made the
            # result schema inconsistent between turns.
            "total_questions": total_q,
        }

        if not is_complete:
            next_question = questions[self.current_question_idx]
            result["next_question"] = next_question

            self.session_history.append({
                "role": "examiner",
                "content": next_question
            })
            # FIX: log the number of the *next* question (idx+1), not the count
            # of questions answered so far.
            print(f"Next question ({self.current_question_idx + 1}/{total_q}): {next_question}")
        else:
            print("Case completed.")
            # Summary is generated before the closing system entry, so the
            # "End of clinical scenario." line is not part of the summary log.
            result["session_summary"] = self.generate_session_summary()
            self.session_history.append({
                "role": "system",
                "content": "End of clinical scenario."
            })

        print("-" * 50)
        return result

    def generate_session_summary(self):
        """Generate a summary dictionary of the completed session."""
        if not self.current_case or not self.session_history:
            return {"error": "No active or completed session to summarize."}

        # Simple summary structure.
        return {
            "case_id": self.current_case.get('case_id', 'N/A'),
            "case": self.current_case.get('clinical_presentation', 'Unknown'),
            "total_questions_in_case": len(self.current_case.get('questions', [])),
            "interaction_history": self.session_history  # full chronological log
        }

    def save_session(self, filepath):
        """Save the current session summary (plus a timestamp) to a JSON file."""
        summary = self.generate_session_summary()
        if "error" in summary:
            print(f"Error generating summary for saving: {summary['error']}")
            return {"error": "No session to save"}

        try:
            # Add a timestamp to the saved data.
            summary["timestamp"] = pd.Timestamp.now().isoformat()

            # Ensure the target directory exists.
            # FIX: os.path.dirname is "" for a bare filename, and
            # os.makedirs("") raises — only create when a directory is given.
            directory = os.path.dirname(filepath)
            if directory:
                os.makedirs(directory, exist_ok=True)

            with open(filepath, 'w') as f:
                json.dump(summary, f, indent=2)
            print(f"Session saved successfully to {filepath}")
            return {"status": "Session saved successfully"}
        except Exception as e:
            print(f"Error saving session to {filepath}: {e}")
            return {"error": f"Failed to save session: {e}"}
|
src/synthetic_generator.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
import re
|
| 5 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 6 |
+
|
| 7 |
+
def generate_synthetic_case(clinical_query, model_id="meta-llama/Llama-3.2-3B-Instruct", max_tokens=800):
    """Generate a synthetic clinical case with examiner questions and expected answers.

    Args:
        clinical_query: Topic the synthetic case should cover.
        model_id: Hugging Face model identifier used for generation.
        max_tokens: Cap on the number of newly generated tokens.

    Returns:
        str | None: Raw generated text, or None when model loading or
        generation fails.
    """
    print(f"Generating synthetic case for '{clinical_query}' using {model_id}...")
    tokenizer = None
    model = None
    try:
        # Load tokenizer and model, letting HF place weights on available devices.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        model.eval()
        device = model.device
        # Some checkpoints ship without a pad token; fall back to EOS.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

    except Exception as e:
        print(f"Error initializing generator model {model_id}: {e}")
        return None

    prompt = f"""<s>[INST] You are a board-certified general surgeon simulating a clinical oral board exam.
Create a synthetic case on the topic: "{clinical_query}".
Start by describing the initial clinical presentation in 1–2 sentences.
Then generate a list of 5–8 examiner questions (Q1, Q2...), each paired with the expected examinee answer (A1, A2...). Ensure Q/A pairs are clearly separated.
Output ONLY the presentation and Q&A pairs in this exact format:
Clinical Presentation: ...

Q1: ...
A1: ...

Q2: ...
A2: ...

(continue until Qn/An)
Focus on common scenarios and standard knowledge. Avoid overly complex or rare details.
[/INST]</s>"""

    generated_text = None
    try:
        encoded = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_len = encoded.input_ids.shape[1]

        with torch.no_grad():
            sequences = model.generate(
                encoded.input_ids,
                max_new_tokens=max_tokens,
                do_sample=True,  # Sample to get potentially varied outputs
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Keep only the continuation, dropping the echoed prompt tokens.
        continuation_ids = sequences[0][prompt_len:]
        generated_text = tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()
        print("Synthetic case generation complete.")

    except Exception as e:
        print(f"Error during synthetic case generation: {e}")
    finally:
        # Release model resources regardless of outcome.
        del model
        del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return generated_text
|
| 75 |
+
|
| 76 |
+
def process_synthetic_data(clinical_query, output_text, case_id='SYNTH_01'):
    """Process the raw LLM output text into a structured DataFrame for the DummyRetriever.

    Args:
        clinical_query (str): Topic used for generation; stored in the
            'clinical_presentation' column of every row.
        output_text (str | None): Raw model output containing an optional
            "Clinical Presentation:" header followed by Qn:/An: pairs.
        case_id (str): Identifier assigned to every row. Defaults to
            'SYNTH_01', preserving the previous hard-coded behavior.

    Returns:
        pd.DataFrame: One row per Q/A turn with columns case_id,
        clinical_presentation, turn_id, question, answer; an empty DataFrame
        when nothing could be parsed.
    """
    # FIX: generate_synthetic_case returns None on failure; guard here instead
    # of crashing inside re.search with a TypeError.
    if not output_text:
        print("Warning: Empty synthetic output; nothing to process.")
        return pd.DataFrame()

    # Extract clinical presentation (everything up to the first Q1 block).
    match = re.search(r"Clinical Presentation:(.*?)(?=\n\nQ1:|$)", output_text, re.DOTALL | re.IGNORECASE)
    clinical_presentation_text = match.group(1).strip() if match else "Synthetic Case: " + clinical_query

    # Extract Q&A pairs; the \1 backreference forces matched Qn/An numbering.
    qa_pattern = r"Q(\d+):\s*(.*?)\s*A\1:\s*(.*?)(?=\n*Q\d+:|\Z)"
    qa_matches = re.findall(qa_pattern, output_text, flags=re.DOTALL | re.IGNORECASE)

    qa_list = []
    for match_tuple in qa_matches:
        try:
            q_num = int(match_tuple[0])
            q_text = match_tuple[1].strip()
            a_text = match_tuple[2].strip()
            # Drop pairs where either side is empty after stripping.
            if q_text and a_text:
                qa_list.append({'turn_id': q_num, 'question': q_text, 'answer': a_text})
        except (IndexError, ValueError) as e:
            print(f"Warning: Skipping malformed Q/A match: {match_tuple} due to {e}")

    if not qa_list:
        print("Warning: No valid Q&A pairs extracted from synthetic text.")
        return pd.DataFrame()

    # Ensure turns appear in exam order even if the model emitted them shuffled.
    qa_list.sort(key=lambda item: item['turn_id'])

    rows = [
        {
            'case_id': case_id,
            'clinical_presentation': clinical_query,  # query doubles as the presentation title
            'turn_id': item['turn_id'],
            'question': item['question'],
            'answer': item['answer'],
        }
        for item in qa_list
    ]
    df_synthetic = pd.DataFrame(rows)

    # Prepend the narrative presentation to the first question so the opening
    # turn reads like a real case stem.
    if not df_synthetic.empty and clinical_presentation_text:
        first_turn_index = df_synthetic[df_synthetic['turn_id'] == 1].index
        if not first_turn_index.empty:
            idx = first_turn_index[0]
            df_synthetic.loc[idx, 'question'] = clinical_presentation_text + " " + df_synthetic.loc[idx, 'question']
        else:
            print("Warning: Could not find turn_id 1 to prepend presentation.")

    print(f"Processed synthetic data into DataFrame with {len(df_synthetic)} turns.")
    return df_synthetic
|