Spaces:
Running
Running
| """ | |
| YAML Frontmatter ๊ธฐ๋ฐ ์ง์ ๋ก๋ | |
| ================================ | |
| ์ด๋ฏธ ๊ตฌ์กฐํ๋ YAML frontmatter๊ฐ ์๋ ๋งํฌ๋ค์ด ํ์ผ์ ์ง์ ๋ก๋ํฉ๋๋ค. | |
| LLM ํธ์ถ ์์ด ๊ธฐ์กด ์ถ์ถ ๊ฒฐ๊ณผ๋ฅผ ํ์ฉํฉ๋๋ค. | |
| """ | |
| import yaml | |
| import frontmatter | |
| from pathlib import Path | |
| from typing import Dict, Any, List, Optional | |
| from datetime import datetime | |
def load_extracted_knowledge(file_path: str) -> Dict[str, Any]:
    """
    Load structured knowledge from a markdown file's YAML frontmatter.

    Two frontmatter layouts are supported:
      1. Knowledge nested under an ``extracted_knowledge`` wrapper key
         (KOR, Terms).
      2. Knowledge keys such as ``credit_cards`` placed directly at the top
         level of the frontmatter (USA).

    Args:
        file_path: Path to the markdown file.

    Returns:
        When knowledge is found, a dict with the keys ``file_path``,
        ``identity``, ``source``, ``version`` and ``extracted_knowledge``.
        ``identity['chain']`` is filled in from other parts of the
        frontmatter when the identity block does not provide one.
        When no knowledge is found, a dict with ``file_path``, the raw
        ``metadata`` and ``has_extracted_knowledge`` set to ``False``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Top-level keys that count as knowledge when there is no wrapper key.
    knowledge_keys = (
        # Original keys
        'credit_cards', 'membership_tiers', 'loyalty_program',
        'subscription_programs', 'points_system', 'milestone_program',
        'best_price_guarantee', 'point_exclusions', 'general_policies',
        'common_card_features', 'hilton_honors_references', 'card_comparison_summary',
        'facts',
        # Hotel-property related keys (Pullman, Fairmont, Hotel Naru, ...)
        'hotel_properties', 'hotel_facilities', 'room_types', 'tier_implementations',
        'room_common_amenities', 'loyalty_program_features', 'pricing_analysis',
        'ratings', 'nearby_attractions', 'channel_implementations', 'member_rates',
        'dining_venues', 'room_service', 'policies', 'pros_cons', 'hotel_brands',
        'benefits', 'promotion', 'exclusions', 'terms_and_conditions', 'points_policy',
    )

    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"ํ์ผ์ ์ฐพ์ ์ ์์ต๋๋ค: {file_path}")

    with open(path, 'r', encoding='utf-8') as f:
        post = frontmatter.load(f)
    metadata = post.metadata

    if 'extracted_knowledge' in metadata:
        # Layout 1: use the wrapper content as-is.
        ek = metadata['extracted_knowledge']
    else:
        # Layout 2: collect the known knowledge keys from the top level.
        ek = {key: metadata[key] for key in knowledge_keys if key in metadata}

    if not ek:
        return {
            'file_path': str(file_path),
            'metadata': metadata,
            'has_extracted_knowledge': False
        }

    # `identity: null` (or any non-mapping) is treated as an empty identity.
    # Work on a copy so the parsed frontmatter itself is left untouched.
    raw_identity = metadata.get('identity')
    identity = dict(raw_identity) if isinstance(raw_identity, dict) else {}

    chain = _resolve_chain(metadata, identity, ek)
    if chain and not identity.get('chain'):
        identity['chain'] = chain

    return {
        'file_path': str(file_path),
        'identity': identity,
        'source': metadata.get('source', {}),
        'version': metadata.get('version', {}),
        'extracted_knowledge': ek
    }


def _chain_from_node(node: Any) -> Any:
    """Return the ``chain`` of a mapping, or of the first item of a list."""
    if isinstance(node, (list, tuple)):
        node = node[0] if node else None
    if isinstance(node, dict):
        return node.get('chain')
    return None


def _chain_from_doc_ref(doc_ref: Any) -> Any:
    """Return the chain stored in a document-reference block, if any."""
    if not isinstance(doc_ref, dict):
        return None
    nested_identity = doc_ref.get('identity')
    if isinstance(nested_identity, dict):
        return nested_identity.get('chain')
    return doc_ref.get('chain')


def _resolve_chain(metadata: Dict[str, Any], identity: Dict[str, Any], ek: Any) -> Any:
    """Find the hotel chain by trying each known source in priority order."""
    # 1. The identity block itself.
    chain = identity.get('chain')
    if chain:
        return chain

    # 2. document_reference, then document_ref.
    for ref_key in ('document_reference', 'document_ref'):
        chain = _chain_from_doc_ref(metadata.get(ref_key))
        if chain:
            return chain

    # 3. Knowledge sections, in priority order. Keep looking when a section
    #    is present but carries no chain, instead of stopping at the first
    #    section that merely exists.
    if isinstance(ek, dict):
        for section in (
            'loyalty_program', 'points_system', 'credit_cards',
            'membership_tiers', 'hotel_properties', 'tier_implementations',
        ):
            chain = _chain_from_node(ek.get(section))
            if chain:
                return chain
    return None
def load_all_from_directory(
    directory_path: str,
    pattern: str = "**/*.md"
) -> List[Dict[str, Any]]:
    """
    Load extracted knowledge from every matching markdown file in a directory.

    Files are processed in sorted path order so the output is reproducible.
    A file that fails to load is reported on stdout and skipped; it does not
    abort the run. Files without knowledge are reported and left out of the
    result.

    Args:
        directory_path: Directory to search.
        pattern: Glob pattern relative to the directory (default: "**/*.md").

    Returns:
        A list of the dicts returned by ``load_extracted_knowledge`` for the
        files that contain knowledge.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist.
    """
    directory = Path(directory_path)
    if not directory.exists():
        raise FileNotFoundError(f"๋๋ ํ ๋ฆฌ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค: {directory_path}")

    # Glob order depends on the filesystem, so sort it. Directories whose
    # name happens to match the pattern are not loadable files; skip them.
    files = sorted(p for p in directory.glob(pattern) if p.is_file())
    print(f"๐ {len(files)}๊ฐ ํ์ผ ๋ฐ๊ฒฌ: {directory_path}")

    results = []
    for file_path in files:
        try:
            data = load_extracted_knowledge(str(file_path))
        except Exception as e:
            # Best-effort batch load: report the failure and keep going.
            print(f" โ {file_path.name}: {e}")
            continue

        # The flag is only present (and False) on the "no knowledge" result.
        if data.get('has_extracted_knowledge', True):
            results.append(data)
            print(f" โ {file_path.name}")
        else:
            print(f" โ ๏ธ {file_path.name} (extracted_knowledge ์์)")
    return results
def get_summary(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build a compact summary of one loaded knowledge record.

    Args:
        data: A dict as returned by ``load_extracted_knowledge``. Missing,
            null or non-mapping ``identity`` / ``extracted_knowledge``
            entries are treated as empty.

    Returns:
        A dict with ``file_path``, ``chain``, ``program_name``, ``doc_type``,
        ``extraction_timestamp`` and ``extractor_model`` (``None`` when
        absent). For each of ``subscription_programs``, ``membership_tiers``,
        ``credit_cards`` and ``benefits`` present in the knowledge, a
        ``<key>_count`` entry is added; ``has_loyalty_program`` is set to
        ``True`` when a ``loyalty_program`` key is present.
    """
    identity = data.get('identity')
    if not isinstance(identity, dict):
        identity = {}
    ek = data.get('extracted_knowledge')
    if not isinstance(ek, dict):
        ek = {}

    summary = {
        'file_path': data.get('file_path'),
        'chain': identity.get('chain'),
        'program_name': identity.get('program_name'),
        'doc_type': identity.get('doc_type'),
        'extraction_timestamp': ek.get('extraction_timestamp'),
        'extractor_model': ek.get('extractor_model'),
    }

    def add_count(key: str) -> None:
        # A key that is present but null in the YAML counts as zero entries
        # instead of raising TypeError from len(None).
        if key in ek:
            value = ek[key]
            summary[f'{key}_count'] = 0 if value is None else len(value)

    # Content summary, added in the same order as before.
    add_count('subscription_programs')
    add_count('membership_tiers')
    if 'loyalty_program' in ek:
        summary['has_loyalty_program'] = True
    add_count('credit_cards')
    add_count('benefits')
    return summary
if __name__ == "__main__":
    import json

    # Manual smoke test: load the sample directory and print one summary
    # per file that contains knowledge.
    divider = "=" * 60

    print("๐งช YAML Frontmatter ๋ก๋ ํ ์คํธ")
    print(divider)

    # Directory load test
    loaded = load_all_from_directory("data/raw/Hotel")
    print(f"\n๐ ๋ก๋ ๊ฒฐ๊ณผ: {len(loaded)}๊ฐ ํ์ผ")
    print(divider)

    for record in loaded:
        info = get_summary(record)

        print(f"\n๐ {Path(info['file_path']).name}")
        print(f" ์ฒด์ธ: {info.get('chain')}")
        print(f" ํ๋ก๊ทธ๋จ: {info.get('program_name')}")
        print(f" ๋ฌธ์ํ์ : {info.get('doc_type')}")
        print(f" ์ถ์ถ ๋ชจ๋ธ: {info.get('extractor_model')}")

        # Counts are printed only when present and non-zero.
        subscription_count = info.get('subscription_programs_count')
        if subscription_count:
            print(f" ๊ตฌ๋ ํ๋ก๊ทธ๋จ: {subscription_count}๊ฐ")

        tier_count = info.get('membership_tiers_count')
        if tier_count:
            print(f" ๋ฉค๋ฒ์ญ ๋ฑ๊ธ: {tier_count}๊ฐ")

        card_count = info.get('credit_cards_count')
        if card_count:
            print(f" ์ ์ฉ์นด๋: {card_count}๊ฐ")