Spaces:
Running
Running
| """ | |
| Eodi ์บ์ ๊ด๋ฆฌ์ - ์ฆ๋ถ ์ฒ๋ฆฌ๋ฅผ ์ํ ์ฝํ ์ธ ๊ธฐ๋ฐ ์บ์ | |
| ================================================================ | |
| LLM ํธ์ถ ๊ฒฐ๊ณผ๋ฅผ ๋ก์ปฌ์ ์บ์ฑํ์ฌ ๋์ผ ์ ๋ ฅ์ ๋ํ ๋ถํ์ํ ์ฌ์ฒ๋ฆฌ๋ฅผ ๋ฐฉ์งํฉ๋๋ค. | |
| - Content-addressed cache (SHA256 ํด์ ๊ธฐ๋ฐ) | |
| - ์์์ ํ์ผ ์ฐ๊ธฐ (tempfile + os.replace) | |
| - ๊ฒ์ฆ๋ ๊ฒฐ๊ณผ๋ง ์บ์ | |
| - ๋ฉํ๋ฐ์ดํฐ ํฌํจ ์ ์ฅ | |
| """ | |
import hashlib
import json
import os
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional

from src.core.schema import ExtractedKnowledge
# Cache format version -- bump whenever the on-disk cache structure
# changes, so that existing cache entries are invalidated.
CACHE_VERSION = "1.0.0"

# Extractor version -- bump whenever the prompt or extraction logic
# changes, so that stale results are recomputed.
EXTRACTOR_VERSION = "1.0.0"
| class CacheManager: | |
| """ | |
| ์ฆ๋ถ ์ฒ๋ฆฌ ์บ์ ๊ด๋ฆฌ์. | |
| ํ์ผ ์ฝํ ์ธ ํด์์ ๋ชจ๋ธ ์ค์ ์ ๊ธฐ๋ฐ์ผ๋ก ์ถ์ถ ๊ฒฐ๊ณผ๋ฅผ ์ ์ฅ/์กฐํํฉ๋๋ค. | |
| ์บ์ ํค ๊ตฌ์ฑ์์: | |
| - ํ์ผ ๋ด์ฉ (normalized) | |
| - ํ๋กฌํํธ ํ ํ๋ฆฟ ํด์ | |
| - ์คํค๋ง ๋ฒ์ | |
| - ์ถ์ถ๊ธฐ ๋ฒ์ | |
| - ๋ชจ๋ธ ํ๋ผ๋ฏธํฐ (provider, model_name, temperature) | |
| """ | |
| def __init__(self, cache_dir: str = ".eodi_cache"): | |
| self.cache_dir = Path(cache_dir) | |
| self.cache_dir.mkdir(parents=True, exist_ok=True) | |
| def _normalize_content(self, content: str) -> str: | |
| """์ฝํ ์ธ ์ ๊ทํ - ์ค ๋ ๋ฌธ์, ๊ณต๋ฐฑ ๋ฑ ํ์คํ""" | |
| # ์ค ๋ ๋ฌธ์ ํต์ผ (CRLF -> LF) | |
| normalized = content.replace('\r\n', '\n').replace('\r', '\n') | |
| # ํธ๋ ์ผ๋ง ๊ณต๋ฐฑ ์ ๊ฑฐ | |
| lines = [line.rstrip() for line in normalized.split('\n')] | |
| return '\n'.join(lines) | |
| def _generate_key( | |
| self, | |
| file_content: str, | |
| prompt_template: str, | |
| model_config: Dict[str, Any] | |
| ) -> str: | |
| """ | |
| ๊ณ ์ ์บ์ ํค ์์ฑ. | |
| Args: | |
| file_content: ์ ๊ทํ๋ ๋งํฌ๋ค์ด ์ฝํ ์ธ | |
| prompt_template: ์ถ์ถ ํ๋กฌํํธ ํ ํ๋ฆฟ | |
| model_config: ๋ชจ๋ธ ์ค์ (provider, model_name, temperature ๋ฑ) | |
| Returns: | |
| SHA256 ํด์ ๋ฌธ์์ด | |
| """ | |
| # ์ ๊ทํ๋ ์ฝํ ์ธ | |
| normalized_content = self._normalize_content(file_content) | |
| # ๋ชจ๋ธ ์ค์ ์ ์ ๋ ฌ๋ JSON ๋ฌธ์์ด๋ก ๋ณํ (์ผ๊ด๋ ํด์๋ฅผ ์ํด) | |
| sorted_config = json.dumps(model_config, sort_keys=True, ensure_ascii=False) | |
| # ํ๋กฌํํธ ํ ํ๋ฆฟ ํด์ (๋์ ๋ถ๋ถ ์ ์ธํ ํ ํ๋ฆฟ ๊ตฌ์กฐ) | |
| prompt_hash = hashlib.sha256(prompt_template.encode('utf-8')).hexdigest()[:16] | |
| # ๋ชจ๋ ๊ตฌ์ฑ์์ ๊ฒฐํฉ | |
| combined = ( | |
| f"v{CACHE_VERSION}|" | |
| f"e{EXTRACTOR_VERSION}|" | |
| f"p{prompt_hash}|" | |
| f"m{sorted_config}|" | |
| f"c{normalized_content}" | |
| ) | |
| return hashlib.sha256(combined.encode('utf-8')).hexdigest() | |
| def _create_cache_entry( | |
| self, | |
| file_content: str, | |
| prompt_template: str, | |
| model_config: Dict[str, Any], | |
| result: ExtractedKnowledge, | |
| validation_warnings: Optional[list] = None | |
| ) -> Dict[str, Any]: | |
| """์บ์ ์ํธ๋ฆฌ ์์ฑ - ๊ฒฐ๊ณผ์ ๋ฉํ๋ฐ์ดํฐ ํฌํจ""" | |
| return { | |
| "cache_version": CACHE_VERSION, | |
| "extractor_version": EXTRACTOR_VERSION, | |
| "created_at": datetime.now(timezone.utc).isoformat(), | |
| "input_checksum": hashlib.sha256(file_content.encode('utf-8')).hexdigest(), | |
| "prompt_hash": hashlib.sha256(prompt_template.encode('utf-8')).hexdigest()[:16], | |
| "model_config": model_config, | |
| "validation_status": "passed", | |
| "validation_warnings": validation_warnings or [], | |
| "output": result.model_dump(mode='json') # Pydantic v2 JSON ๋ชจ๋ - Decimal/datetime ์์ | |
| } | |
| def get_cached_result( | |
| self, | |
| file_content: str, | |
| prompt_template: str, | |
| model_config: Dict[str, Any] | |
| ) -> Optional[ExtractedKnowledge]: | |
| """ | |
| ์บ์๋ ๊ฒฐ๊ณผ ์กฐํ. | |
| Args: | |
| file_content: ์๋ณธ ๋งํฌ๋ค์ด ์ฝํ ์ธ | |
| prompt_template: ํ๋กฌํํธ ํ ํ๋ฆฟ | |
| model_config: ๋ชจ๋ธ ์ค์ | |
| Returns: | |
| ExtractedKnowledge ๊ฐ์ฒด ๋๋ None (์บ์ ๋ฏธ์ค) | |
| """ | |
| key = self._generate_key(file_content, prompt_template, model_config) | |
| cache_file = self.cache_dir / f"{key}.json" | |
| if not cache_file.exists(): | |
| return None | |
| try: | |
| with open(cache_file, 'r', encoding='utf-8') as f: | |
| entry = json.load(f) | |
| # ์บ์ ๋ฒ์ ํ์ธ - ๋ฒ์ ๋ถ์ผ์น ์ ๋ฌดํจํ | |
| if entry.get("cache_version") != CACHE_VERSION: | |
| print(f"โ ๏ธ ์บ์ ๋ฒ์ ๋ถ์ผ์น, ๋ฌดํจํ: {entry.get('cache_version')} != {CACHE_VERSION}") | |
| cache_file.unlink() # ์ค๋๋ ์บ์ ์ญ์ | |
| return None | |
| # ์ถ์ถ๊ธฐ ๋ฒ์ ํ์ธ | |
| if entry.get("extractor_version") != EXTRACTOR_VERSION: | |
| print(f"โ ๏ธ ์ถ์ถ๊ธฐ ๋ฒ์ ๋ถ์ผ์น, ๋ฌดํจํ: {entry.get('extractor_version')} != {EXTRACTOR_VERSION}") | |
| cache_file.unlink() | |
| return None | |
| # ๊ฒ์ฆ ์ํ ํ์ธ - ์คํจํ ๊ฒฐ๊ณผ๋ ์ฌ์ฉํ์ง ์์ | |
| if entry.get("validation_status") != "passed": | |
| print(f"โ ๏ธ ์บ์๋ ๊ฒฐ๊ณผ๊ฐ ๊ฒ์ฆ ์คํจ ์ํ, ๋ฌดํจํ") | |
| cache_file.unlink() | |
| return None | |
| # Pydantic ๋ชจ๋ธ๋ก ๋ณํ | |
| return ExtractedKnowledge.model_validate(entry["output"]) | |
| except (json.JSONDecodeError, KeyError, ValueError) as e: | |
| print(f"โ ๏ธ ์บ์ ๋ก๋ ์ค๋ฅ ({key[:12]}...): {e}") | |
| # ์์๋ ์บ์ ํ์ผ ์ญ์ | |
| try: | |
| cache_file.unlink() | |
| except OSError: | |
| pass | |
| return None | |
| def save_result( | |
| self, | |
| file_content: str, | |
| prompt_template: str, | |
| model_config: Dict[str, Any], | |
| result: ExtractedKnowledge, | |
| validation_warnings: Optional[list] = None | |
| ) -> bool: | |
| """ | |
| ์ถ์ถ ๊ฒฐ๊ณผ๋ฅผ ์บ์์ ์ ์ฅ (์์์ ์ฐ๊ธฐ). | |
| Args: | |
| file_content: ์๋ณธ ๋งํฌ๋ค์ด ์ฝํ ์ธ | |
| prompt_template: ํ๋กฌํํธ ํ ํ๋ฆฟ | |
| model_config: ๋ชจ๋ธ ์ค์ | |
| result: ๊ฒ์ฆ ์๋ฃ๋ ExtractedKnowledge | |
| validation_warnings: ๊ฒ์ฆ ๊ฒฝ๊ณ ๋ชฉ๋ก (์ ํ) | |
| Returns: | |
| ์ ์ฅ ์ฑ๊ณต ์ฌ๋ถ | |
| """ | |
| key = self._generate_key(file_content, prompt_template, model_config) | |
| cache_file = self.cache_dir / f"{key}.json" | |
| try: | |
| # ์บ์ ์ํธ๋ฆฌ ์์ฑ | |
| entry = self._create_cache_entry( | |
| file_content, | |
| prompt_template, | |
| model_config, | |
| result, | |
| validation_warnings | |
| ) | |
| # ์์์ ์ฐ๊ธฐ: tempfile -> os.replace | |
| # ์ด๋ ๊ฒ ํ๋ฉด ์ค๊ฐ์ ํ๋ก์ธ์ค๊ฐ ์ฃฝ์ด๋ ์บ์ ํ์ผ์ด ๊นจ์ง์ง ์์ | |
| fd, temp_path = tempfile.mkstemp( | |
| suffix='.json.tmp', | |
| dir=self.cache_dir, | |
| prefix=f'{key[:8]}_' | |
| ) | |
| try: | |
| with os.fdopen(fd, 'w', encoding='utf-8') as f: | |
| json.dump(entry, f, ensure_ascii=False, indent=2) | |
| # ์์์ ๊ต์ฒด (POSIX ์์คํ ์์ atomic) | |
| os.replace(temp_path, cache_file) | |
| return True | |
| except Exception as e: | |
| # ์์ ํ์ผ ์ ๋ฆฌ | |
| try: | |
| os.unlink(temp_path) | |
| except OSError: | |
| pass | |
| raise e | |
| except Exception as e: | |
| print(f"โ ๏ธ ์บ์ ์ ์ฅ ์ค๋ฅ ({key[:12]}...): {e}") | |
| return False | |
| def get_cache_stats(self) -> Dict[str, Any]: | |
| """์บ์ ํต๊ณ ์กฐํ""" | |
| cache_files = list(self.cache_dir.glob("*.json")) | |
| total_size = sum(f.stat().st_size for f in cache_files) | |
| return { | |
| "cache_dir": str(self.cache_dir), | |
| "cache_version": CACHE_VERSION, | |
| "extractor_version": EXTRACTOR_VERSION, | |
| "file_count": len(cache_files), | |
| "total_size_bytes": total_size, | |
| "total_size_mb": round(total_size / (1024 * 1024), 2) | |
| } | |
| def clear_cache(self) -> int: | |
| """์ ์ฒด ์บ์ ์ญ์ , ์ญ์ ๋ ํ์ผ ์ ๋ฐํ""" | |
| cache_files = list(self.cache_dir.glob("*.json")) | |
| count = 0 | |
| for f in cache_files: | |
| try: | |
| f.unlink() | |
| count += 1 | |
| except OSError: | |
| pass | |
| return count | |