| # utils/metadata_utils.py | |
| from datetime import datetime | |
| from urllib.parse import urlparse | |
def enhance_metadata(metadata):
    """Return a copy of *metadata* enriched with inferred, standardized fields.

    Fields are only added when absent (``setdefault``); values already present
    in *metadata* are never overwritten, and the input dict is not mutated
    (a shallow copy is made).

    Args:
        metadata (dict): Original metadata dictionary. The optional ``"url"``
            entry drives the inference; a missing, ``None``, non-string, or
            unparseable URL is treated as having no host.

    Returns:
        dict: Shallow copy of *metadata* with these keys filled in if missing:
            ``institution`` / ``institution_short`` (only for american.edu
            hosts), ``source_system``, and ``timestamp``.
    """
    enhanced = metadata.copy()

    # Extract the host defensively: urlparse() raises ValueError on malformed
    # input (e.g. an unbalanced "[") and misbehaves on None / non-str values.
    raw_url = enhanced.get("url")
    host = ""
    if isinstance(raw_url, str):
        try:
            # .hostname is lowercased and excludes userinfo and port,
            # unlike .netloc.
            host = urlparse(raw_url).hostname or ""
        except ValueError:
            host = ""
    # Tolerate a fully-qualified host written with a trailing dot.
    host = host.rstrip(".")

    # Set institution and short name. Match the domain itself or any of its
    # subdomains; a plain substring test would also accept look-alike hosts
    # such as "northamerican.edu" or "american.edu.evil.example".
    if host == "american.edu" or host.endswith(".american.edu"):
        enhanced.setdefault("institution", "American University")
        enhanced.setdefault("institution_short", "AU")

    # Determine source system from the host name (the URL path is not
    # inspected). Any URL without "catalog" in its host, including a missing
    # URL, falls back to the generic website label.
    if "catalog" in host:
        enhanced.setdefault("source_system", "Course Catalog")
    else:
        enhanced.setdefault("source_system", "University Website")

    # Standardized timestamp (ISO 8601).
    # NOTE: datetime.now() without a tz yields a naive local-time value, so
    # the string carries no UTC offset.
    enhanced.setdefault("timestamp", datetime.now().isoformat())

    return enhanced