Spaces:

SsebaA
/

x

Running on Zero

App Files Files Community

x / vips_classifier.py

SsebaA

Update vips_classifier.py

106cfd6 verified 5 days ago

raw

history blame contribute delete

7.67 kB

	"""
	VoiceNote AI - VIPS Classifier
	Three SEPARATE prompt strategies for proper experimental comparison.

	Methodology note: Earlier versions combined all three strategies into a single
	API call to reduce latency. This caused output priming - the model's first
	classification influenced the subsequent ones, masking real differences between
	strategies. Three separate API calls now ensure independent evaluation.
	"""
	import logging
	import re
	from config import Config

	logger = logging.getLogger(__name__)


	# ==========================================================
	# SHARED COMPONENTS
	# ==========================================================

	# Glossary of the four VIPS categories. It is interpolated verbatim into every
	# prompt builder in this module so that all strategies describe V/I/P/S to the
	# model with identical wording.
	_VIPS_DEFINITIONS = """VIPS categories:
	V (Valbefinnande): pain, fatigue, nausea, dizziness, sleep, mood, anxiety, appetite, physical symptoms
	I (Integritet): living situation, mobility needs, habits, social support, preferences
	P (Prevention): mobilization plans, lifestyle factors, follow-up, physiotherapy
	S (Sakerhet): fall risk, allergies, medications, postoperative risks, infection risk"""

	# Output rules shared by every prompt builder. The "Ingen relevant information."
	# sentence requested here is the same string the parser uses as its default
	# value for a category it could not extract.
	_RULES = """RULES:
	- Write output in Swedish only
	- Do NOT quote the conversation - reformulate as professional clinical documentation
	- Write "Ingen relevant information." if a category has zero relevant content
	- Never invent information not stated in the conversation
	- Output the four VIPS lines in plain text, no markdown, no numbering"""


	# ==========================================================
	# THREE INDEPENDENT PROMPT STRATEGIES
	# ==========================================================

	def build_prompt_zero_shot(text: str) -> str:
	    """Build the zero-shot prompt (strategy 1) for one conversation.

	    The prompt contains the shared VIPS glossary and rules, an explicit
	    four-line output template, and the conversation itself. It ends on the
	    first category label, which acts as the format anchor.

	    Args:
	        text: Conversation text to classify; inserted into the prompt as-is.

	    Returns:
	        The complete prompt string.
	    """
	    prompt = f"""You are a Swedish clinical documentation specialist.
	{_VIPS_DEFINITIONS}
	{_RULES}

	Output format (exactly these 4 lines, no preamble, no markdown):
	V (Valbefinnande): [Swedish content]
	I (Integritet): [Swedish content]
	P (Prevention): [Swedish content]
	S (Sakerhet): [Swedish content]

	Conversation:
	{text}

	V (Valbefinnande):"""
	    return prompt


	def build_prompt_few_shot(text: str) -> str:
	    """Build the few-shot prompt (strategy 2) for one conversation.

	    After the shared VIPS glossary and rules, the prompt shows two worked
	    conversation-to-note examples, then the conversation to classify
	    followed by the four empty category labels.

	    Args:
	        text: Conversation text to classify; inserted into the prompt as-is.

	    Returns:
	        The complete prompt string.
	    """
	    prompt = f"""You are a Swedish clinical documentation specialist.
	{_VIPS_DEFINITIONS}
	{_RULES}

	Here are two complete examples to learn from:

	Example 1:
	Conversation: Patient reports knee pain 7/10 since yesterday. Lives alone in apartment, no family nearby. Allergic to penicillin. No mobility aids.
	V (Valbefinnande): Smarta 7/10 i hoger knaled sedan igar.
	I (Integritet): Bor ensam i lagenhet utan nara familj.
	P (Prevention): Ingen relevant information.
	S (Sakerhet): Kand penicillinallergi. Inga hjalpmedel.

	Example 2:
	Conversation: Patient anxious, hasn't slept in a week. Daughter helps with daily tasks. Takes 6 medications but cannot name them. Fell once last week.
	V (Valbefinnande): Angest och somnsvarigheter sedan en vecka.
	I (Integritet): Dotter assisterar med dagliga aktiviteter.
	P (Prevention): Behov av lakemedelsgenomgang och fallpreventionsbedomning.
	S (Sakerhet): Tar 6 lakemedel utan kannedom om namn. Tidigare fall senaste veckan.

	Now classify this conversation in the same format:
	Conversation: {text}

	V (Valbefinnande):
	I (Integritet):
	P (Prevention):
	S (Sakerhet):"""
	    return prompt


	def build_prompt_chain_of_thought(text: str) -> str:
	    """Build the chain-of-thought prompt (strategy 3) for one conversation.

	    The prompt asks for a short REASONING section and then the final note
	    under a literal "===FINAL NOTE===" marker, followed by the four empty
	    category labels. The same marker string is what parse_vips_response
	    looks for to separate the note from the reasoning.

	    Args:
	        text: Conversation text to classify; inserted into the prompt as-is.

	    Returns:
	        The complete prompt string.
	    """
	    prompt = f"""You are a Swedish clinical documentation specialist.
	{_VIPS_DEFINITIONS}
	{_RULES}

	Reason BRIEFLY (max 3-4 short bullet points per step), then write the final note.

	Conversation:
	{text}

	REASONING:
	- Clinical details mentioned: (list briefly)
	- Category assignments (V/I/P/S): (one short sentence per detail)
	- Verification - only stated info, nothing invented: (yes/no)

	===FINAL NOTE===
	V (Valbefinnande):
	I (Integritet):
	P (Prevention):
	S (Sakerhet):"""
	    return prompt


	# ==========================================================
	# ROBUST PARSER
	# ==========================================================

	def parse_vips_response(response: str) -> dict:
	    """Extract the four VIPS category texts from a model response.

	    Tolerates markdown bold markers, bullet/numbering prefixes, and three
	    label spellings per category: "V (Valbefinnande)", "Valbefinnande" and
	    "V" (case-insensitive, with either "a" or "ä" in the Swedish names).
	    The label must be followed by ":" or "-" and the content on the same
	    line.

	    Args:
	        response: Raw model output. Empty or None yields all defaults.

	    Returns:
	        Dict with exactly the keys "V", "I", "P", "S". A category that is
	        missing, empty, or only echoes a template placeholder keeps the
	        default "Ingen relevant information.".
	    """
	    default = "Ingen relevant information."
	    vips = {"V": default, "I": default, "P": default, "S": default}

	    # Nothing to parse: return the defaults instead of failing on None/"".
	    if not response:
	        return vips

	    # For CoT: keep only the text after the final-note marker so that the
	    # reasoning section cannot be mistaken for the note itself.
	    for marker in ["===FINAL NOTE===", "FINAL NOTE", "Final Note",
	                   "STEP 4", "===FINAL===", "FINAL:"]:
	        if marker in response:
	            response = response.split(marker)[-1]
	            break

	    # Swedish category names; accepts both ASCII "a" and "ä" spellings.
	    full_names = {
	        "V": r"V[aä]lbefinnande",
	        "I": r"Integritet",
	        "P": r"Prevention",
	        "S": r"S[aä]kerhet",
	    }

	    # Regex building blocks. Whitespace is restricted to spaces/tabs so a
	    # match never crosses a line break: a label with no content on its own
	    # line must not capture the following category's line.
	    lead = r'^[ \t*\-\d.]*'                        # indent, bullets, "1.", "**"
	    sep = r'[ \t]*\**[ \t]*[:\-][ \t]*\**[ \t]*'   # ":" or "-", optional "**"
	    tail = r'(.+?)[ \t]*\**[ \t]*$'                # content up to end of line
	    flags = re.IGNORECASE | re.MULTILINE
	    placeholders = ['[swedish content]', 'content', '...']

	    for key, full in full_names.items():
	        # Label forms, from most specific to most lenient.
	        label_forms = [
	            rf'{key}[ \t]*\(?[ \t]*{full}[ \t]*\)?',  # "V (Valbefinnande)"
	            full,                                      # "Valbefinnande"
	            key,                                       # "V"
	        ]
	        for label in label_forms:
	            match = re.search(lead + label + sep + tail, response, flags)
	            if not match:
	                continue
	            content = match.group(1).strip().strip('*').strip()
	            # Skip echoed template placeholders and try the next form.
	            if content and content.lower() not in placeholders:
	                vips[key] = content
	                break
	    return vips


	def format_vips_for_display(vips: dict) -> str:
	    """Render a parsed VIPS dict as four "label: content" lines.

	    Categories are always emitted in V, I, P, S order. A category missing
	    from ``vips`` is rendered as "Ingen relevant information.".

	    Args:
	        vips: Mapping from category key ("V", "I", "P", "S") to its text.

	    Returns:
	        The four lines joined with newline characters.
	    """
	    headings = (
	        ("V", "V (Valbefinnande)"),
	        ("I", "I (Integritet)"),
	        ("P", "P (Prevention)"),
	        ("S", "S (Sakerhet)"),
	    )
	    rendered = []
	    for code, heading in headings:
	        body = vips.get(code, 'Ingen relevant information.')
	        rendered.append(f"{heading}: {body}")
	    return "\n".join(rendered)


	# ==========================================================
	# MAIN - three SEPARATE API calls + DEBUG LOGGING
	# ==========================================================

	def classify_all(english_text: str, mistral_client) -> dict:
	    """Run the three prompt strategies as three independent API calls.

	    Each strategy is sent in its own request rather than in one combined
	    prompt, so that no strategy's output can prime another's and the
	    results stay comparable. The cost is roughly three times the latency.

	    Args:
	        english_text: Conversation text handed to each prompt builder.
	        mistral_client: Object exposing
	            ``generate(prompt=..., max_tokens=..., temperature=...)``.
	            The return value is sliced, measured with ``len`` and parsed,
	            so it is presumably a ``str`` - confirm against the client.

	    Returns:
	        Dict keyed by strategy name ("zero_shot", "few_shot",
	        "chain_of_thought"). Each value is a dict with keys "V", "I", "P",
	        "S". If a strategy raises, all four of its values are set to
	        "[FEL: <error>]" and the remaining strategies still run.
	    """
	    empty_value = "Ingen relevant information."
	    categories = ["V", "I", "P", "S"]

	    # Strategy name -> (prompt builder, max_tokens). Chain-of-thought gets a
	    # larger budget because its reasoning precedes the final note.
	    strategies = {
	        "zero_shot": (build_prompt_zero_shot, 500),
	        "few_shot": (build_prompt_few_shot, 500),
	        "chain_of_thought": (build_prompt_chain_of_thought, 1200),
	    }

	    logger.info("Running 3 independent prompt strategies...")
	    results = {}

	    for name, (builder, max_tok) in strategies.items():
	        try:
	            logger.info(" -> %s (max_tokens=%s)...", name, max_tok)
	            raw = mistral_client.generate(
	                prompt=builder(english_text),
	                max_tokens=max_tok,
	                temperature=0.15,
	            )
	            # DEBUG: log first 300 chars of raw response
	            logger.info(" [RAW] %s (first 300 chars): %s", name, raw[:300])

	            parsed = parse_vips_response(raw)

	            # Sanity check: warn if all categories are empty
	            if all(v == empty_value for v in parsed.values()):
	                logger.warning(
	                    " [WARN] %s parsed empty - check raw output above", name
	                )

	            results[name] = parsed
	            logger.info(" [OK] %s done (%d chars)", name, len(raw))
	        except Exception as e:
	            # Deliberate best-effort boundary: one failing strategy must not
	            # abort the others. logger.exception keeps the traceback, which
	            # logger.error dropped.
	            logger.exception(" [ERR] %s failed: %s", name, e)
	            results[name] = {k: f"[FEL: {e}]" for k in categories}

	    return results