Spaces:
Running
Running
| #!/usr/bin/env python | |
| """ | |
| YAML ์ปค๋ฒ๋ฆฌ์ง ๋ถ์ ๋๊ตฌ | |
| ======================= | |
| ๊ฐ ๋งํฌ๋ค์ด ํ์ผ์ YAML์์: | |
| 1. ์ด๋ค ํค๊ฐ ์กด์ฌํ๋์ง ์์ง | |
| 2. ์ด๋ค ํค๊ฐ ์ฒญํฌ๋ก ๋ณํ๋์๋์ง ๋น๊ต | |
| 3. ๋๋ฝ๋ (๋ฏธ์ฒ๋ฆฌ) ํค ๋ฆฌํฌํธ | |
| ์ฌ์ฉ๋ฒ: | |
| python scripts/analyze_yaml_coverage.py | |
| """ | |
| import yaml | |
| from pathlib import Path | |
| from collections import defaultdict | |
| from typing import Dict, Set, Any, List | |
| import click | |
# Keys currently handled by sync_to_supabase.py.
PROCESSED_KEYS: Set[str] = {
    # Direct children of extracted_knowledge
    "loyalty_programs",
    "membership_tiers",
    "points_systems",
    "credit_cards",
    "benefits",
    "hotel_properties",
    "tier_implementations",
    "subscription_programs",
    "promotions",
    "pricing",
    "pricing_analysis",
    # Metadata (no chunk needed)
    "extraction_timestamp",
    "extractor_model",
    # facts subtree
    "facts",  # facts.pricing_analysis is handled
}

# Keys that require no chunk generation (metadata, free-text notes, etc.)
IGNORED_KEYS: Set[str] = {
    "evidence",
    "extra_attributes",
    "tips",
    "warnings",
    "reviewer_quotes",
    "pros_cons",
    "recommended_for",
    "not_recommended_for",
    "verification_needed",
    "user_tips",
    "source_info",
    "version_info",
}
def flatten_keys(obj: Any, prefix: str = "", depth: int = 0, max_depth: int = 3) -> Set[str]:
    """Collect dotted key paths from a nested mapping.

    Descends only into dict values — lists are treated as leaves — and
    stops once ``max_depth`` nesting levels have been visited.
    Non-dict inputs yield an empty set.
    """
    if depth > max_depth or not isinstance(obj, dict):
        return set()
    found: Set[str] = set()
    for name, child in obj.items():
        path = name if not prefix else f"{prefix}.{name}"
        found.add(path)
        # Lists are leaves: never recurse into them.
        if not isinstance(child, list):
            found |= flatten_keys(child, path, depth + 1, max_depth)
    return found
def get_top_level_keys(obj: Dict) -> Set[str]:
    """Return the set of top-level keys of *obj*, or an empty set for non-dicts."""
    return set(obj) if isinstance(obj, dict) else set()
def analyze_file(file_path: Path) -> Dict[str, Any]:
    """Analyze the YAML front matter of a single markdown file.

    Returns a report dict with processed / unprocessed / ignored key lists
    plus the sub-keys of ``facts``, or ``{"error": ...}`` when the file
    cannot be analyzed for any reason.
    """
    try:
        text = file_path.read_text(encoding='utf-8')
        if '---' not in text:
            return {"error": "YAML ์์"}

        # Front matter sits between the first pair of '---' markers.
        data = yaml.safe_load(text.split('---')[1])
        if not data:
            return {"error": "๋น YAML"}

        ek = data.get('extracted_knowledge', {})
        if not ek:
            return {"error": "extracted_knowledge ์์"}

        top_level = get_top_level_keys(ek)
        facts = ek.get('facts', {})
        facts_keys = get_top_level_keys(facts) if isinstance(facts, dict) else set()

        return {
            "total_keys": len(top_level),
            # Keys already converted to chunks.
            "processed": list(top_level & PROCESSED_KEYS),
            # Keys neither processed nor deliberately ignored.
            "unprocessed": list(top_level - PROCESSED_KEYS - IGNORED_KEYS),
            "facts_keys": list(facts_keys) if facts_keys else [],
            "ignored": list(top_level & IGNORED_KEYS),
        }
    except Exception as e:
        # Best-effort tool: surface any failure as a per-file error string.
        return {"error": str(e)}
def analyze_all_files(data_dir: Path) -> Dict[str, Any]:
    """Analyze every ``*.md`` file under *data_dir* and aggregate the results.

    Returns per-file reports under ``"files"`` and an overall ``"summary"``
    with counts of unprocessed keys and up to three sample files per key.
    """
    per_file: Dict[str, Any] = {}
    key_counts = defaultdict(int)
    key_sources = defaultdict(list)

    for md_path in data_dir.rglob("*.md"):
        rel = md_path.relative_to(data_dir)
        report = analyze_file(md_path)
        per_file[str(rel)] = report
        if "error" in report:
            continue
        for key in report.get("unprocessed", []):
            key_counts[key] += 1
            # Keep only a short stem as the sample file name.
            key_sources[key].append(str(rel.stem)[:30])

    return {
        "files": per_file,
        "summary": {
            "total_files": len(per_file),
            "success": sum(1 for r in per_file.values() if "error" not in r),
            "unprocessed_keys": dict(key_counts),
            "unprocessed_files": {k: v[:3] for k, v in key_sources.items()},
        },
    }
def main(verbose: bool = False, file: str = ""):
    """YAML coverage analysis entry point.

    Args:
        verbose: when True, print a per-file breakdown after the summary.
        file: optional path of a single markdown file to analyze — tried
            as given first, then relative to the data directory.
    """
    # NOTE(review): `click` is imported at module level and this signature
    # looks like it was meant to carry @click.command/@click.option
    # decorators; without them the bare `main()` call at the bottom of the
    # file raised TypeError because both parameters were required.
    # Defaults added so the script runs; confirm whether the click CLI
    # should be restored instead.
    print("๐ YAML ์ปค๋ฒ๋ฆฌ์ง ๋ถ์")
    print("=" * 60)

    data_dir = Path("data/raw/Hotel")

    if file:
        # Single-file mode: try the path as given, then relative to data_dir.
        file_path = Path(file)
        if not file_path.exists():
            file_path = data_dir / file
        if not file_path.exists():
            print(f"โ ํ์ผ์ ์ฐพ์ ์ ์์ต๋๋ค: {file}")
            return
        result = analyze_file(file_path)
        print(f"\n๐ {file_path.name}")
        if "error" in result:
            print(f" โ {result['error']}")
        else:
            print(f" ์ด ํค: {result['total_keys']}๊ฐ")
            print(f" โ ์ฒ๋ฆฌ๋จ: {result['processed']}")
            print(f" โ ๋ฏธ์ฒ๋ฆฌ: {result['unprocessed']}")
            if result['facts_keys']:
                print(f" ๐ facts ํ์: {result['facts_keys']}")
        return

    # Whole-tree mode.
    analysis = analyze_all_files(data_dir)
    summary = analysis["summary"]

    print(f"\n๐ ์์ฝ")
    print(f" ์ด ํ์ผ: {summary['total_files']}๊ฐ")
    print(f" ์ฑ๊ณต: {summary['success']}๊ฐ")

    unprocessed = summary.get("unprocessed_keys", {})
    if unprocessed:
        print(f"\nโ ๏ธ ๋ฏธ์ฒ๋ฆฌ ํค (์ฒญํฌ๋ก ๋ณํ๋์ง ์์ ํค)")
        print("-" * 60)
        # Most frequently missed keys first.
        for key, count in sorted(unprocessed.items(), key=lambda x: -x[1]):
            files = summary["unprocessed_files"].get(key, [])
            file_str = ", ".join(files)
            print(f" {key:30} : {count:3}๊ฐ ํ์ผ ({file_str}...)")
    else:
        print("\nโ ๋ชจ๋ ํค๊ฐ ์ฒ๋ฆฌ๋๊ณ ์์ต๋๋ค!")

    if verbose:
        print("\n๐ ํ์ผ๋ณ ์์ธ")
        print("-" * 60)
        for path, result in analysis["files"].items():
            if "error" in result:
                print(f" โ {path}: {result['error']}")
            elif result.get("unprocessed"):
                print(f" โ ๏ธ {path}")
                print(f" ๋ฏธ์ฒ๋ฆฌ: {result['unprocessed']}")
| if __name__ == "__main__": | |
| main() | |