Spaces:

dedp
/

harbor

Sleeping

Lyonel Tanganco

cleanup

d6c18ca 2 months ago

7.7 kB

	import json
	import re


	def load_schema(schema_path):
	    """Read the UTF-8 JSON file at ``schema_path`` and return the parsed schema."""
	    with open(schema_path, mode='r', encoding='utf-8') as schema_file:
	        raw_text = schema_file.read()
	    return json.loads(raw_text)


	def create_empty_profile():
	    """
	    Build a blank user profile for a user we know nothing about yet.

	    Scalar fields start as None and multi-valued fields start as empty
	    lists. Every call returns brand-new dict and list objects.
	    """
	    demographics = {
	        "population": None,
	        "identity_factors": [],
	        "language": None,
	        "pronouns": None,
	    }
	    logistics = {
	        "zipcode": None,
	        "region": None,
	        "profession": None,
	        "accessibility_needs": [],
	        "insurance": None,
	        "treatment_history": None,
	    }
	    status = {
	        "current_state": None,
	        "crisis_level": None,
	        "temporary_factors": [],
	    }
	    clinical = {
	        "primary_focus": None,
	        "substances": [],
	    }
	    preferences = {
	        "setting": None,
	        "therapy_approach": None,
	        "scheduling": [],
	        "barriers": [],
	        "contact_channel": None,
	    }

	    blank_profile = {}
	    blank_profile["demographics"] = demographics
	    blank_profile["logistics"] = logistics
	    blank_profile["status"] = status
	    blank_profile["clinical"] = clinical
	    blank_profile["preferences"] = preferences
	    return blank_profile


	def extract_profile_updates(schema, user_input):
	    """
	    Scan user input against the schema and return a dict of detected profile updates.

	    For 'single' type fields, returns the first matched option value.
	    For 'multi' type fields, returns a list of all matched option values
	    (in schema order, without duplicates).
	    For 'extracted' type fields (zipcode, region, treatment_history), the
	    value comes from _extract_field(), which uses pattern matching or
	    returns raw text snippets.

	    Keyword matching is a case-insensitive substring test against the
	    whole message.

	    Args:
	        schema: The loaded profile schema dict. Top-level entries and field
	            definitions that are not dicts (e.g. a comment or version
	            string) are skipped.
	        user_input: The user's message text. None or an empty string
	            yields no updates.

	    Returns:
	        dict: Nested dict mirroring the profile structure, containing only
	        fields where matches were found.
	    """
	    if not user_input:
	        # Nothing to scan; also avoids calling .lower() on None.
	        return {}

	    input_lower = user_input.lower()
	    updates = {}

	    for category_name, category in schema.items():
	        if not isinstance(category, dict):
	            # Tolerate non-category metadata entries in the schema file.
	            continue

	        category_updates = {}

	        for field_name, field_def in category.items():
	            if not isinstance(field_def, dict):
	                continue

	            field_type = field_def.get("type")

	            if field_type == "extracted":
	                # Special handling for pattern-based or free-text fields
	                value = _extract_field(field_name, field_def, user_input, input_lower)
	                if value is not None:
	                    category_updates[field_name] = value

	            elif field_type in ("single", "multi"):
	                matches = []
	                for option in field_def.get("options") or []:
	                    # One keyword hit per option is enough; empty and
	                    # non-string keywords never match.
	                    hit = any(
	                        isinstance(keyword, str) and keyword and keyword.lower() in input_lower
	                        for keyword in option.get("keywords") or []
	                    )
	                    # Two options may map to the same value; report it once.
	                    if hit and option["value"] not in matches:
	                        matches.append(option["value"])

	                if matches:
	                    category_updates[field_name] = (
	                        matches[0] if field_type == "single" else matches
	                    )

	        if category_updates:
	            updates[category_name] = category_updates

	    return updates


	def _extract_field(field_name, field_def, user_input, input_lower):
	"""Handle extraction for non-option fields like zipcode and treatment_history."""
	if field_name == "zipcode":
	pattern = field_def.get("pattern", r"\b\d{5}\b")
	match = re.search(pattern, user_input)
	if match:
	return match.group()
	return None

	if field_name == "region":
	# Region is typically set explicitly or by the LLM, not keyword-matched.
	# We do a lightweight check for common geographic indicators.
	geo_patterns = [
	r"\bin\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)", # "in Boston", "in Pocahontas County"
	r"\bnear\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)", # "near Springfield"
	r"\bfrom\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)", # "from Cambridge"
	]
	for pattern in geo_patterns:
	match = re.search(pattern, user_input)
	if match:
	return match.group(1)
	return None

	if field_name == "treatment_history":
	history_keywords = ["rehab", "treatment before", "been to", "tried",
	"previous treatment", "went to", "was in",
	"12-step", "residential before", "relapsed"]
	for keyword in history_keywords:
	if keyword in input_lower:
	return user_input # store the raw message as context
	return None

	return None


	def merge_profile(profile, updates):
	    """
	    Merge new updates into the existing profile.

	    - For 'single' fields (non-list values): new values overwrite old ones.
	    - For 'multi' fields (list values): new values are appended (no duplicates).
	    - None values in updates are ignored (don't clear existing data).
	    - Categories and fields not already present in the profile are skipped.

	    Args:
	        profile: The current user profile dict (modified in place).
	        updates: The updates dict from extract_profile_updates(). It is
	            never modified, and no list from it is stored in the profile.

	    Returns:
	        dict: The updated profile (same object as input).
	    """
	    for category_name, category_updates in updates.items():
	        if category_name not in profile:
	            continue

	        target = profile[category_name]

	        for field_name, new_value in category_updates.items():
	            if field_name not in target or new_value is None:
	                continue

	            existing = target[field_name]

	            if isinstance(existing, list):
	                # List field: accept either a list or a single value and
	                # append whatever is not already present.
	                incoming = new_value if isinstance(new_value, list) else [new_value]
	                for item in incoming:
	                    if item not in existing:
	                        existing.append(item)
	            elif isinstance(new_value, list):
	                # Overwrite with a copy so later in-place appends to the
	                # profile do not mutate the caller's updates dict.
	                target[field_name] = list(new_value)
	            else:
	                # Single value field: overwrite
	                target[field_name] = new_value

	    return profile


	def profile_to_summary(profile):
	    """
	    Render the filled-in parts of a user profile as a short text block
	    meant for injection into the system prompt.

	    Fields whose value is None or an empty list are left out, and a
	    category with no remaining fields is omitted entirely.

	    Returns:
	        str: A human-readable summary, or empty string if profile is empty.
	    """
	    # (profile key, heading) pairs in the order they appear in the output.
	    sections = (
	        ("demographics", "Demographics"),
	        ("logistics", "Logistics & History"),
	        ("status", "Current Status"),
	        ("clinical", "Clinical Needs"),
	        ("preferences", "Preferences & Barriers"),
	    )

	    def _is_unset(item):
	        return item is None or (isinstance(item, list) and len(item) == 0)

	    summary_lines = []
	    for section_key, heading in sections:
	        bullets = []
	        for field_name, value in profile.get(section_key, {}).items():
	            if _is_unset(value):
	                continue

	            # "contact_channel" -> "Contact Channel"
	            label = field_name.replace("_", " ").title()
	            if isinstance(value, list):
	                rendered = ', '.join(str(v) for v in value)
	            else:
	                rendered = value
	            bullets.append(f" - {label}: {rendered}")

	        if bullets:
	            summary_lines.append(f"[{heading}]")
	            summary_lines += bullets

	    if summary_lines:
	        intro = "USER PROFILE (already collected — DO NOT ask the user again for any of these details):\n"
	        return intro + "\n".join(summary_lines)
	    return ""