#!/usr/bin/env python
"""
YAML ์ปค๋ฒ„๋ฆฌ์ง€ ๋ถ„์„ ๋„๊ตฌ
=======================
๊ฐ ๋งˆํฌ๋‹ค์šด ํŒŒ์ผ์˜ YAML์—์„œ:
1. ์–ด๋–ค ํ‚ค๊ฐ€ ์กด์žฌํ•˜๋Š”์ง€ ์ˆ˜์ง‘
2. ์–ด๋–ค ํ‚ค๊ฐ€ ์ฒญํฌ๋กœ ๋ณ€ํ™˜๋˜์—ˆ๋Š”์ง€ ๋น„๊ต
3. ๋ˆ„๋ฝ๋œ (๋ฏธ์ฒ˜๋ฆฌ) ํ‚ค ๋ฆฌํฌํŠธ
์‚ฌ์šฉ๋ฒ•:
python scripts/analyze_yaml_coverage.py
"""
import yaml
from pathlib import Path
from collections import defaultdict
from typing import Dict, Set, Any, List
import click
# Keys currently handled by sync_to_supabase.py (i.e. already converted to chunks).
PROCESSED_KEYS = {
    # Direct children of extracted_knowledge
    "loyalty_programs",
    "membership_tiers",
    "points_systems",
    "credit_cards",
    "benefits",
    "hotel_properties",
    "tier_implementations",
    "subscription_programs",
    "promotions",
    "pricing",
    "pricing_analysis",
    # Metadata (no chunk needed)
    "extraction_timestamp",
    "extractor_model",
    # Under facts
    "facts", # facts.pricing_analysis is handled
}
# Keys that need no chunk generation (metadata etc.) — excluded from the
# "unprocessed" report so they do not show up as false positives.
IGNORED_KEYS = {
    "evidence",
    "extra_attributes",
    "tips",
    "warnings",
    "reviewer_quotes",
    "pros_cons",
    "recommended_for",
    "not_recommended_for",
    "verification_needed",
    "user_tips",
    "source_info",
    "version_info",
}
def flatten_keys(obj: Any, prefix: str = "", depth: int = 0, max_depth: int = 3) -> Set[str]:
    """Collect dotted key paths from a nested mapping.

    Recursion stops once ``depth`` exceeds ``max_depth`` and never descends
    into lists, so list elements contribute no key paths of their own.
    Non-dict inputs yield an empty set.
    """
    if depth > max_depth or not isinstance(obj, dict):
        return set()
    collected: Set[str] = set()
    for name, child in obj.items():
        path = f"{prefix}.{name}" if prefix else name
        collected.add(path)
        # Only recurse into non-list values (lists are treated as leaves).
        if not isinstance(child, list):
            collected |= flatten_keys(child, path, depth + 1, max_depth)
    return collected
def get_top_level_keys(obj: Dict) -> Set[str]:
    """Return the set of first-level keys of *obj*, or empty for non-dicts."""
    return set(obj) if isinstance(obj, dict) else set()
def analyze_file(file_path: Path) -> Dict[str, Any]:
    """Analyze the YAML front matter of one markdown file.

    Returns a coverage report dict with keys ``total_keys``, ``processed``,
    ``unprocessed``, ``facts_keys`` and ``ignored``, or ``{"error": ...}``
    when the file has no usable front matter.
    """
    try:
        content = file_path.read_text(encoding='utf-8')
        # Front matter must open the file. The previous check ('---' anywhere
        # in the content) accepted a stray '---' deep in the body and then
        # parsed arbitrary text between the first two delimiters as YAML.
        stripped = content.lstrip()
        if not stripped.startswith('---'):
            return {"error": "YAML ์—†์Œ"}
        parts = stripped.split('---', 2)
        if len(parts) < 2:
            return {"error": "YAML ์—†์Œ"}
        data = yaml.safe_load(parts[1])
        # safe_load may yield None or a non-mapping scalar (e.g. a bare
        # string); either way there is nothing to analyze.
        if not isinstance(data, dict) or not data:
            return {"error": "๋นˆ YAML"}
        # Extract the extracted_knowledge section
        ek = data.get('extracted_knowledge', {})
        if not ek:
            return {"error": "extracted_knowledge ์—†์Œ"}
        # Top-level keys of extracted_knowledge
        top_level = get_top_level_keys(ek)
        # Keys already converted to chunks
        processed = top_level & PROCESSED_KEYS
        # Unhandled keys, minus those deliberately ignored
        unprocessed = top_level - PROCESSED_KEYS - IGNORED_KEYS
        # Second-level keys under 'facts' (guard: ek may be a non-dict list)
        facts = ek.get('facts', {}) if isinstance(ek, dict) else {}
        facts_keys = get_top_level_keys(facts) if isinstance(facts, dict) else set()
        return {
            "total_keys": len(top_level),
            "processed": list(processed),
            "unprocessed": list(unprocessed),
            "facts_keys": list(facts_keys) if facts_keys else [],
            "ignored": list(top_level & IGNORED_KEYS),
        }
    except Exception as e:  # I/O or YAML parse failure — report, don't abort the sweep
        return {"error": str(e)}
def analyze_all_files(data_dir: Path) -> Dict[str, Any]:
    """Run analyze_file over every *.md under *data_dir* and aggregate.

    Returns {"files": per-file reports, "summary": totals plus, for each
    unprocessed key, its file count and up to three example file stems}.
    """
    per_file: Dict[str, Any] = {}
    key_counts = defaultdict(int)
    key_examples = defaultdict(list)
    for md_path in data_dir.rglob("*.md"):
        rel = md_path.relative_to(data_dir)
        report = analyze_file(md_path)
        per_file[str(rel)] = report
        if "error" in report:
            continue
        for key in report.get("unprocessed", []):
            key_counts[key] += 1
            # Truncate stems so the summary line stays readable.
            key_examples[key].append(str(rel.stem)[:30])
    return {
        "files": per_file,
        "summary": {
            "total_files": len(per_file),
            "success": sum(1 for r in per_file.values() if "error" not in r),
            "unprocessed_keys": dict(key_counts),
            "unprocessed_files": {k: names[:3] for k, names in key_examples.items()},
        },
    }
def _report_single_file(data_dir: Path, file: str) -> None:
    """Print the coverage report for one file (resolved as-is or under data_dir)."""
    file_path = Path(file)
    if not file_path.exists():
        file_path = data_dir / file
    if not file_path.exists():
        print(f"โŒ ํŒŒ์ผ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {file}")
        return
    result = analyze_file(file_path)
    print(f"\n๐Ÿ“„ {file_path.name}")
    if "error" in result:
        print(f" โŒ {result['error']}")
        return
    print(f" ์ด ํ‚ค: {result['total_keys']}๊ฐœ")
    print(f" โœ… ์ฒ˜๋ฆฌ๋จ: {result['processed']}")
    print(f" โŒ ๋ฏธ์ฒ˜๋ฆฌ: {result['unprocessed']}")
    if result['facts_keys']:
        print(f" ๐Ÿ“ facts ํ•˜์œ„: {result['facts_keys']}")


@click.command()
@click.option('--verbose', '-v', is_flag=True, help='์ž์„ธํ•œ ์ถœ๋ ฅ')
@click.option('--file', '-f', type=str, default=None, help='ํŠน์ • ํŒŒ์ผ๋งŒ ๋ถ„์„')
def main(verbose: bool, file: str):
    """YAML ์ปค๋ฒ„๋ฆฌ์ง€ ๋ถ„์„"""
    # NOTE: the docstring above doubles as click's --help text; keep it as-is.
    print("๐Ÿ” YAML ์ปค๋ฒ„๋ฆฌ์ง€ ๋ถ„์„")
    print("=" * 60)
    data_dir = Path("data/raw/Hotel")
    # Single-file mode short-circuits the full sweep.
    if file:
        _report_single_file(data_dir, file)
        return
    analysis = analyze_all_files(data_dir)
    stats = analysis["summary"]
    print(f"\n๐Ÿ“Š ์š”์•ฝ")
    print(f" ์ด ํŒŒ์ผ: {stats['total_files']}๊ฐœ")
    print(f" ์„ฑ๊ณต: {stats['success']}๊ฐœ")
    pending = stats.get("unprocessed_keys", {})
    if pending:
        print(f"\nโš ๏ธ ๋ฏธ์ฒ˜๋ฆฌ ํ‚ค (์ฒญํฌ๋กœ ๋ณ€ํ™˜๋˜์ง€ ์•Š์€ ํ‚ค)")
        print("-" * 60)
        # Most-affected keys first.
        for key, count in sorted(pending.items(), key=lambda kv: -kv[1]):
            examples = ", ".join(stats["unprocessed_files"].get(key, []))
            print(f" {key:30} : {count:3}๊ฐœ ํŒŒ์ผ ({examples}...)")
    else:
        print("\nโœ… ๋ชจ๋“  ํ‚ค๊ฐ€ ์ฒ˜๋ฆฌ๋˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค!")
    if verbose:
        print("\n๐Ÿ“„ ํŒŒ์ผ๋ณ„ ์ƒ์„ธ")
        print("-" * 60)
        for path, report in analysis["files"].items():
            if "error" in report:
                print(f" โŒ {path}: {report['error']}")
            elif report.get("unprocessed"):
                print(f" โš ๏ธ {path}")
                print(f" ๋ฏธ์ฒ˜๋ฆฌ: {report['unprocessed']}")


if __name__ == "__main__":
    main()