eodi-mcp / src /utils /yaml_loader.py
lovelymango's picture
Upload 19 files
4c3c97b verified
"""
YAML frontmatter-based knowledge loader
=======================================
Directly loads Markdown files that already carry structured YAML frontmatter.
Reuses the existing extraction results without making any LLM calls.
"""
import yaml
import frontmatter
from pathlib import Path
from typing import Dict, Any, List, Optional
from datetime import datetime
# Top-level frontmatter keys that are collected as knowledge when the document
# has no ``extracted_knowledge`` wrapper.
_KNOWLEDGE_KEYS = (
    # Original keys
    'credit_cards', 'membership_tiers', 'loyalty_program',
    'subscription_programs', 'points_system', 'milestone_program',
    'best_price_guarantee', 'point_exclusions', 'general_policies',
    'common_card_features', 'hilton_honors_references', 'card_comparison_summary',
    'facts',
    # Hotel-property related keys (Pullman, Fairmont, Hotel Naru, ...)
    'hotel_properties', 'hotel_facilities', 'room_types', 'tier_implementations',
    'room_common_amenities', 'loyalty_program_features', 'pricing_analysis',
    'ratings', 'nearby_attractions', 'channel_implementations', 'member_rates',
    'dining_venues', 'room_service', 'policies', 'pros_cons', 'hotel_brands',
    'benefits', 'promotion', 'exclusions', 'terms_and_conditions', 'points_policy',
)

# Knowledge sections inspected, in priority order, when looking for a chain.
_CHAIN_SOURCE_KEYS = (
    'loyalty_program', 'points_system', 'credit_cards',
    'membership_tiers', 'hotel_properties', 'tier_implementations',
)


def _as_dict(value: Any) -> Dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _chain_of(section: Any) -> Optional[Any]:
    """Return the ``chain`` of a knowledge section (dict, or list of dicts)."""
    if isinstance(section, list):
        # List-shaped sections carry the chain on their first entry.
        section = section[0] if section else None
    if isinstance(section, dict):
        return section.get('chain')
    return None


def _resolve_chain(metadata: Dict[str, Any], ek: Any) -> Optional[Any]:
    """Find a chain in the document reference, then in the knowledge sections."""
    # ``document_reference`` is consulted before the ``document_ref`` spelling.
    for ref_key in ('document_reference', 'document_ref'):
        doc_ref = metadata.get(ref_key)
        if not isinstance(doc_ref, dict):
            continue
        chain = _as_dict(doc_ref.get('identity')).get('chain') or doc_ref.get('chain')
        if chain:
            return chain

    # Keep trying the remaining sections when an earlier one has no chain,
    # instead of giving up at the first non-empty section.
    sections = _as_dict(ek)
    for key in _CHAIN_SOURCE_KEYS:
        chain = _chain_of(sections.get(key))
        if chain:
            return chain
    return None


def load_extracted_knowledge(file_path: str) -> Dict[str, Any]:
    """
    Load the extracted knowledge from a Markdown file's YAML frontmatter.

    Two frontmatter layouts are supported:
    1. Knowledge nested under an ``extracted_knowledge`` wrapper (KOR, Terms).
    2. Knowledge keys such as ``credit_cards`` placed at the top level (USA).

    When ``identity.chain`` is missing, the chain is looked up in
    ``document_reference`` / ``document_ref`` and then in the knowledge
    sections, and the first value found is added to the returned identity.

    Args:
        file_path: Path of the Markdown file.

    Returns:
        When knowledge was found, a dict with the keys ``file_path``,
        ``identity``, ``source``, ``version`` and ``extracted_knowledge``.
        Otherwise a dict with ``file_path``, the full ``metadata`` and
        ``has_extracted_knowledge`` set to ``False``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

    with open(path, 'r', encoding='utf-8') as f:
        post = frontmatter.load(f)
    metadata = post.metadata

    # Prefer the explicit wrapper; otherwise gather the known top-level keys.
    if 'extracted_knowledge' in metadata:
        ek = metadata['extracted_knowledge']
    else:
        ek = {key: metadata[key] for key in _KNOWLEDGE_KEYS if key in metadata}

    if not ek:
        return {
            'file_path': str(file_path),
            'metadata': metadata,
            'has_extracted_knowledge': False
        }

    # Work on a copy so the parsed frontmatter is not modified; a missing or
    # null ``identity`` is treated as empty.
    identity = dict(_as_dict(metadata.get('identity')))
    if not identity.get('chain'):
        chain = _resolve_chain(metadata, ek)
        if chain:
            identity['chain'] = chain

    return {
        'file_path': str(file_path),
        'identity': identity,
        'source': metadata.get('source', {}),
        'version': metadata.get('version', {}),
        'extracted_knowledge': ek
    }
def load_all_from_directory(
    directory_path: str,
    pattern: str = "**/*.md"
) -> List[Dict[str, Any]]:
    """
    Load the extracted knowledge of every matching file in a directory.

    Files are processed in sorted path order. A file that fails to load is
    reported on stdout and skipped, so one bad file does not abort the batch.
    Files without extracted knowledge are reported and left out of the result.

    Args:
        directory_path: Directory to search.
        pattern: Glob pattern relative to the directory (default: "**/*.md").

    Returns:
        A list of the dicts returned by ``load_extracted_knowledge`` for the
        files that contain extracted knowledge.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist.
    """
    directory = Path(directory_path)
    if not directory.exists():
        raise FileNotFoundError(f"디렉토리를 찾을 수 없습니다: {directory_path}")

    # Glob order depends on the filesystem, so sort for reproducible output.
    # Directories whose name happens to match the pattern are not documents.
    files = sorted(p for p in directory.glob(pattern) if p.is_file())
    print(f"📂 {len(files)}개 파일 발견: {directory_path}")

    results = []
    for file_path in files:
        try:
            data = load_extracted_knowledge(str(file_path))
        except Exception as e:
            # Deliberate best-effort boundary: report the failure and move on.
            print(f" ❌ {file_path.name}: {e}")
            continue

        # Successful loads carry no flag, hence the default of True.
        if data.get('has_extracted_knowledge', True):
            results.append(data)
            print(f" ✅ {file_path.name}")
        else:
            print(f" ⚠️ {file_path.name} (extracted_knowledge 없음)")
    return results
def get_summary(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build a short summary of one loaded document.

    Args:
        data: A dict shaped like the result of ``load_extracted_knowledge``.
            Missing, null or non-dict ``identity`` / ``extracted_knowledge``
            entries are treated as empty.

    Returns:
        A dict with ``file_path``, ``chain``, ``program_name``, ``doc_type``,
        ``extraction_timestamp`` and ``extractor_model`` (``None`` when
        unavailable). For each of ``subscription_programs``,
        ``membership_tiers``, ``credit_cards`` and ``benefits`` present in the
        knowledge, a ``<key>_count`` entry is added (0 when the section is
        empty or null). ``has_loyalty_program`` is set to ``True`` when the
        ``loyalty_program`` key is present.
    """
    identity = data.get('identity')
    if not isinstance(identity, dict):
        identity = {}
    ek = data.get('extracted_knowledge')
    if not isinstance(ek, dict):
        ek = {}

    summary = {
        'file_path': data.get('file_path'),
        'chain': identity.get('chain'),
        'program_name': identity.get('program_name'),
        'doc_type': identity.get('doc_type'),
        'extraction_timestamp': ek.get('extraction_timestamp'),
        'extractor_model': ek.get('extractor_model'),
    }

    def add_count(key: str) -> None:
        # A YAML key with an empty value parses to None; count it as zero
        # instead of letting len() raise TypeError.
        if key in ek:
            summary[f'{key}_count'] = len(ek[key] or ())

    # Content summary
    add_count('subscription_programs')
    add_count('membership_tiers')
    if 'loyalty_program' in ek:
        summary['has_loyalty_program'] = True
    add_count('credit_cards')
    add_count('benefits')
    return summary
# Manual smoke test: load every document under data/raw/Hotel and print a
# short summary for each one.
if __name__ == "__main__":
    # NOTE(review): json is not used in the visible part of this block;
    # confirm nothing further down relies on it before removing the import.
    import json

    print("🧪 YAML Frontmatter 로더 테스트")
    print("=" * 60)

    # Directory load test
    results = load_all_from_directory("data/raw/Hotel")
    print(f"\n📊 로드 결과: {len(results)}개 파일")
    print("=" * 60)

    for data in results:
        summary = get_summary(data)
        print(f"\n📄 {Path(summary['file_path']).name}")
        print(f" 체인: {summary.get('chain')}")
        print(f" 프로그램: {summary.get('program_name')}")
        print(f" 문서타입: {summary.get('doc_type')}")
        print(f" 추출 모델: {summary.get('extractor_model')}")
        # Counts are printed only when present and non-zero.
        if summary.get('subscription_programs_count'):
            print(f" 구독 프로그램: {summary['subscription_programs_count']}개")
        if summary.get('membership_tiers_count'):
            print(f" 멤버십 등급: {summary['membership_tiers_count']}개")
        if summary.get('credit_cards_count'):
            print(f" 신용카드: {summary['credit_cards_count']}개")