""" VoiceNote AI - VIPS Classifier Three SEPARATE prompt strategies for proper experimental comparison. Methodology note: Earlier versions combined all three strategies into a single API call to reduce latency. This caused output priming - the model's first classification influenced the subsequent ones, masking real differences between strategies. Three separate API calls now ensure independent evaluation. """ import logging import re from config import Config logger = logging.getLogger(__name__) # ========================================================== # SHARED COMPONENTS # ========================================================== _VIPS_DEFINITIONS = """VIPS categories: V (Valbefinnande): pain, fatigue, nausea, dizziness, sleep, mood, anxiety, appetite, physical symptoms I (Integritet): living situation, mobility needs, habits, social support, preferences P (Prevention): mobilization plans, lifestyle factors, follow-up, physiotherapy S (Sakerhet): fall risk, allergies, medications, postoperative risks, infection risk""" _RULES = """RULES: - Write output in Swedish only - Do NOT quote the conversation - reformulate as professional clinical documentation - Write "Ingen relevant information." if a category has zero relevant content - Never invent information not stated in the conversation - Output the four VIPS lines in plain text, no markdown, no numbering""" # ========================================================== # THREE INDEPENDENT PROMPT STRATEGIES # ========================================================== def build_prompt_zero_shot(text: str) -> str: """Strategy 1: Pure task instruction with explicit format anchor.""" return f"""You are a Swedish clinical documentation specialist. {_VIPS_DEFINITIONS} {_RULES} Output format (exactly these 4 lines, no preamble, no markdown): V (Valbefinnande): [Swedish content] I (Integritet): [Swedish content] P (Prevention): [Swedish content] S (Sakerhet): [Swedish content] Conversation: {text} V (Valbefinnande):""" def build_prompt_few_shot(text: str) -> str: """Strategy 2: Two complete input-to-output examples.""" return f"""You are a Swedish clinical documentation specialist. {_VIPS_DEFINITIONS} {_RULES} Here are two complete examples to learn from: Example 1: Conversation: Patient reports knee pain 7/10 since yesterday. Lives alone in apartment, no family nearby. Allergic to penicillin. No mobility aids. V (Valbefinnande): Smarta 7/10 i hoger knaled sedan igar. I (Integritet): Bor ensam i lagenhet utan nara familj. P (Prevention): Ingen relevant information. S (Sakerhet): Kand penicillinallergi. Inga hjalpmedel. Example 2: Conversation: Patient anxious, hasn't slept in a week. Daughter helps with daily tasks. Takes 6 medications but cannot name them. Fell once last week. V (Valbefinnande): Angest och somnsvarigheter sedan en vecka. I (Integritet): Dotter assisterar med dagliga aktiviteter. P (Prevention): Behov av lakemedelsgenomgang och fallpreventionsbedomning. S (Sakerhet): Tar 6 lakemedel utan kannedom om namn. Tidigare fall senaste veckan. Now classify this conversation in the same format: Conversation: {text} V (Valbefinnande): I (Integritet): P (Prevention): S (Sakerhet):""" def build_prompt_chain_of_thought(text: str) -> str: """Strategy 3: Brief reasoning, then final note with reliable marker.""" return f"""You are a Swedish clinical documentation specialist. {_VIPS_DEFINITIONS} {_RULES} Reason BRIEFLY (max 3-4 short bullet points per step), then write the final note. Conversation: {text} REASONING: - Clinical details mentioned: (list briefly) - Category assignments (V/I/P/S): (one short sentence per detail) - Verification - only stated info, nothing invented: (yes/no) ===FINAL NOTE=== V (Valbefinnande): I (Integritet): P (Prevention): S (Sakerhet):""" # ========================================================== # ROBUST PARSER # ========================================================== def parse_vips_response(response: str) -> dict: """Parse VIPS response - robust to markdown, list markers, prefixes.""" default = "Ingen relevant information." vips = {"V": default, "I": default, "P": default, "S": default} # For CoT: extract section after final-note marker for marker in ["===FINAL NOTE===", "FINAL NOTE", "Final Note", "STEP 4", "===FINAL===", "FINAL:"]: if marker in response: response = response.split(marker)[-1] break # Match Swedish category names (handles both A and aa-style typing) full_names = { "V": r"V[aä]lbefinnande", "I": r"Integritet", "P": r"Prevention", "S": r"S[aä]kerhet", } for key, full in full_names.items(): # Try patterns from most specific to most lenient patterns = [ rf'(?im)^[\s\*\-\d\.]*{key}\s*\({full}\)\s*[:\-]\s*\**\s*(.+?)\s*\**\s*$', rf'(?im)^[\s\*\-\d\.]*{full}\s*[:\-]\s*\**\s*(.+?)\s*\**\s*$', rf'(?im)^[\s\*\-\d\.]*{key}\s*[:\-]\s*\**\s*(.+?)\s*\**\s*$', ] for pattern in patterns: match = re.search(pattern, response) if match: content = match.group(1).strip().strip('*').strip() # Skip placeholder text if content and content.lower() not in ['[swedish content]', 'content', '...']: vips[key] = content break return vips def format_vips_for_display(vips: dict) -> str: labels = {"V": "V (Valbefinnande)", "I": "I (Integritet)", "P": "P (Prevention)", "S": "S (Sakerhet)"} return "\n".join(f"{labels[k]}: {vips.get(k, 'Ingen relevant information.')}" for k in ["V", "I", "P", "S"]) # ========================================================== # MAIN - three SEPARATE API calls + DEBUG LOGGING # ========================================================== def classify_all(english_text: str, mistral_client) -> dict: """ Run three independent VIPS classifications via separate API calls. Why separate calls instead of one combined prompt: - Avoids output priming (model copying its own previous output) - Each strategy gets a fresh context - Results become scientifically comparable Trade-off: ~3x latency, but methodologically sound. """ logger.info("Running 3 independent prompt strategies...") results = {} strategies = { "zero_shot": (build_prompt_zero_shot, 500), "few_shot": (build_prompt_few_shot, 500), "chain_of_thought": (build_prompt_chain_of_thought, 1200), } for name, (builder, max_tok) in strategies.items(): try: logger.info(f" -> {name} (max_tokens={max_tok})...") raw = mistral_client.generate( prompt=builder(english_text), max_tokens=max_tok, temperature=0.15, ) # DEBUG: log first 300 chars of raw response logger.info(f" [RAW] {name} (first 300 chars): {raw[:300]}") parsed = parse_vips_response(raw) # Sanity check: warn if all categories are empty if all(v == "Ingen relevant information." for v in parsed.values()): logger.warning(f" [WARN] {name} parsed empty - check raw output above") results[name] = parsed logger.info(f" [OK] {name} done ({len(raw)} chars)") except Exception as e: logger.error(f" [ERR] {name} failed: {e}") results[name] = {k: f"[FEL: {e}]" for k in ["V", "I", "P", "S"]} return results