harbor / src /utils /profile.py
Lyonel Tanganco
cleanup
d6c18ca
import json
import re
def load_schema(schema_path):
    """Read the user profile schema from a JSON file on disk.

    Args:
        schema_path: Path of the JSON file to read (opened as UTF-8 text).

    Returns:
        The parsed JSON document exactly as ``json.load`` produces it.
    """
    with open(schema_path, 'r', encoding='utf-8') as schema_file:
        parsed_schema = json.load(schema_file)
    return parsed_schema
def create_empty_profile():
    """
    Build a blank user profile: the starting point for a user we know
    nothing about yet.

    Scalar fields start as None and multi-valued fields start as empty
    lists. Every call builds new dict and list objects, so profiles returned
    by separate calls never share mutable state.

    Returns:
        dict: Category name -> {field name -> None or []}.
    """
    demographics = {
        "population": None,
        "identity_factors": [],
        "language": None,
        "pronouns": None,
    }
    logistics = {
        "zipcode": None,
        "region": None,
        "profession": None,
        "accessibility_needs": [],
        "insurance": None,
        "treatment_history": None,
    }
    status = {
        "current_state": None,
        "crisis_level": None,
        "temporary_factors": [],
    }
    clinical = {
        "primary_focus": None,
        "substances": [],
    }
    preferences = {
        "setting": None,
        "therapy_approach": None,
        "scheduling": [],
        "barriers": [],
        "contact_channel": None,
    }
    return {
        "demographics": demographics,
        "logistics": logistics,
        "status": status,
        "clinical": clinical,
        "preferences": preferences,
    }
def extract_profile_updates(schema, user_input):
    """
    Match a user message against the schema and collect the profile fields
    it appears to mention.

    Handling depends on each field definition's "type":
      - "single": the value of the first option (in schema order) that has a
        keyword contained in the message.
      - "multi": the values of every option that has a keyword contained in
        the message, in schema order.
      - "extracted": delegated to _extract_field(), which does pattern or
        free-text extraction (zipcode, region, treatment_history).
      - anything else: the field is skipped.

    Keyword comparison is case-insensitive.

    Args:
        schema: The loaded profile schema dict
            (category -> field -> field definition).
        user_input: The user's message text.

    Returns:
        dict: Nested dict mirroring the profile structure, holding only the
        categories and fields for which something was detected.
    """
    lowered_text = user_input.lower()
    detected = {}

    for section_name, section_fields in schema.items():
        section_hits = {}

        for name, spec in section_fields.items():
            kind = spec.get("type")

            if kind == "extracted":
                # Pattern-based / free-text fields have their own extractor.
                extracted = _extract_field(name, spec, user_input, lowered_text)
                if extracted is not None:
                    section_hits[name] = extracted
                continue

            if kind not in ("single", "multi"):
                continue

            # An option counts once, however many of its keywords appear.
            # NOTE(review): this is plain substring containment, so a short
            # keyword also matches inside longer words — confirm intended.
            matched_values = [
                option["value"]
                for option in spec.get("options", [])
                if any(
                    word and word.lower() in lowered_text
                    for word in option.get("keywords", [])
                )
            ]
            if not matched_values:
                continue

            section_hits[name] = (
                matched_values[0] if kind == "single" else matched_values
            )

        if section_hits:
            detected[section_name] = section_hits

    return detected
def _extract_field(field_name, field_def, user_input, input_lower):
"""Handle extraction for non-option fields like zipcode and treatment_history."""
if field_name == "zipcode":
pattern = field_def.get("pattern", r"\b\d{5}\b")
match = re.search(pattern, user_input)
if match:
return match.group()
return None
if field_name == "region":
# Region is typically set explicitly or by the LLM, not keyword-matched.
# We do a lightweight check for common geographic indicators.
geo_patterns = [
r"\bin\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)", # "in Boston", "in Pocahontas County"
r"\bnear\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)", # "near Springfield"
r"\bfrom\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)", # "from Cambridge"
]
for pattern in geo_patterns:
match = re.search(pattern, user_input)
if match:
return match.group(1)
return None
if field_name == "treatment_history":
history_keywords = ["rehab", "treatment before", "been to", "tried",
"previous treatment", "went to", "was in",
"12-step", "residential before", "relapsed"]
for keyword in history_keywords:
if keyword in input_lower:
return user_input # store the raw message as context
return None
return None
def merge_profile(profile, updates):
    """
    Fold a set of detected updates into an existing profile, in place.

    Rules:
    - Categories or fields that the profile does not already contain are
      skipped, so updates can never add new keys.
    - None values in updates are skipped (they never clear existing data).
    - When the stored value is a list, incoming values (a list, or a single
      value) are appended unless already present.
    - Otherwise the incoming value replaces the stored one.

    Args:
        profile: The current user profile dict (modified in place).
        updates: The updates dict from extract_profile_updates().

    Returns:
        dict: The updated profile (same object as input).
    """
    for section_name, section_changes in updates.items():
        if section_name not in profile:
            continue
        section = profile[section_name]

        for name, incoming in section_changes.items():
            if name not in section:
                continue
            if incoming is None:
                continue

            current = section[name]
            if not isinstance(current, list):
                # Scalar field: the latest value wins.
                section[name] = incoming
                continue

            # List field: treat a lone value as a one-item list, then append
            # whatever is not already stored.
            candidates = incoming if isinstance(incoming, list) else [incoming]
            for item in candidates:
                if item not in current:
                    current.append(item)

    return profile
def profile_to_summary(profile):
    """
    Render a user profile as a short text block for injection into the
    system prompt.

    Fields whose value is None or an empty list are left out, and a category
    with no remaining fields is left out entirely. Only the five known
    categories are rendered, in a fixed order.

    Args:
        profile: The user profile dict (category -> field -> value).

    Returns:
        str: A human-readable summary, or empty string if profile is empty.
    """
    section_titles = (
        ("demographics", "Demographics"),
        ("logistics", "Logistics & History"),
        ("status", "Current Status"),
        ("clinical", "Clinical Needs"),
        ("preferences", "Preferences & Barriers"),
    )

    body = []
    for section_key, title in section_titles:
        bullets = []
        for field_name, value in profile.get(section_key, {}).items():
            is_list = isinstance(value, list)
            if value is None or (is_list and not value):
                continue
            # "crisis_level" -> "Crisis Level"
            label = field_name.replace("_", " ").title()
            shown = ', '.join(str(item) for item in value) if is_list else value
            bullets.append(f" - {label}: {shown}")

        if bullets:
            body.append(f"[{title}]")
            body.extend(bullets)

    if not body:
        return ""

    header = (
        "USER PROFILE (already collected — DO NOT ask the user again for any of these details):\n"
    )
    return header + "\n".join(body)