Spaces:
Sleeping
Sleeping
| """ | |
| File modification tracking for AVP ingest system. | |
| This module provides hash-based file change detection to enable incremental | |
| ingestion of .avp source files. | |
| """ | |
| import hashlib | |
| import json | |
| import os | |
| from datetime import datetime, timezone | |
| from typing import Dict, List, Tuple | |
| def _empty_metadata() -> Dict: | |
| return { | |
| "last_ingestion": None, | |
| "tracking_method": "hash", | |
| "source_files": {}, | |
| } | |
def compute_file_hash(file_path: str) -> str:
    """
    Return the SHA256 digest of a file's bytes.

    Args:
        file_path: Path to the file to hash

    Returns:
        Hex string of SHA256 hash
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as stream:
        # Feed the hash 64KB at a time so the whole file is never held in memory.
        while True:
            block = stream.read(65536)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()
| def _read_kb(kb_path: str): | |
| """Return parsed JSON contents of the knowledge base, or None if missing.""" | |
| if not os.path.exists(kb_path): | |
| return None | |
| with open(kb_path, 'r') as f: | |
| return json.load(f) | |
def load_metadata(kb_path: str) -> Dict:
    """
    Load metadata from knowledge base file.

    Args:
        kb_path: Path to knowledge base JSON file

    Returns:
        Metadata dict with 'source_files' and 'last_ingestion' keys.
        Returns empty metadata structure if the file doesn't exist, is in the
        old list format, or does not hold a metadata dict. Keys missing from
        stored metadata are filled in with their empty defaults.
    """
    data = _read_kb(kb_path)
    stored = data.get("metadata") if isinstance(data, dict) else None
    if not isinstance(stored, dict):
        # Missing file, legacy list format, or unknown format
        return _empty_metadata()
    # Guarantee the documented keys exist so callers can index
    # metadata["source_files"] without a KeyError on partial metadata.
    for key, default in _empty_metadata().items():
        stored.setdefault(key, default)
    return stored
def load_existing_chunks(kb_path: str) -> List[Dict]:
    """
    Read the previously stored chunks out of the knowledge base file.

    Args:
        kb_path: Path to knowledge base JSON file

    Returns:
        List of chunk dicts; empty when the file is missing or its layout
        is not recognised
    """
    contents = _read_kb(kb_path)
    if isinstance(contents, dict):
        # New format: chunks live under the "chunks" key next to the metadata.
        return contents["chunks"] if "chunks" in contents else []
    # Old format: the document itself is the array of chunks.
    return contents if isinstance(contents, list) else []
def save_with_metadata(chunks: List[Dict], metadata: Dict, kb_path: str) -> None:
    """
    Save chunks and metadata to knowledge base file in new format.

    Args:
        chunks: List of code chunk dicts
        metadata: Metadata dict with tracking information
        kb_path: Path to knowledge base JSON file

    Raises:
        TypeError: If chunks or metadata contain a value that is not JSON
            serializable. The existing file is left untouched in that case.
    """
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization error raised mid-write would destroy the previous contents.
    payload = json.dumps({"metadata": metadata, "chunks": chunks}, indent=4)
    with open(kb_path, 'w', encoding="utf-8") as f:
        f.write(payload)
def detect_changed_files(
    data_folder: str,
    metadata: Dict,
    file_extension: str = ".avp",
) -> Tuple[List[str], List[str], List[str]]:
    """
    Detect which files have changed since last ingestion using hash comparison.

    Only regular files directly inside data_folder whose names end with
    file_extension are considered; directories with a matching name are
    skipped.

    Args:
        data_folder: Directory containing source files
        metadata: Metadata dict with previous file hashes
        file_extension: File extension to search for (default: ".avp")

    Returns:
        Tuple of (changed_files, unchanged_files, deleted_files) as lists of
        full paths. changed_files and unchanged_files are ordered by filename.
    """
    # Tolerate metadata whose "source_files" entry is absent or None.
    source_files = metadata.get("source_files") or {}
    changed_files: List[str] = []
    unchanged_files: List[str] = []
    current_files = set()
    # os.listdir() order is arbitrary; sort so results are reproducible.
    for filename in sorted(os.listdir(data_folder)):
        if not filename.endswith(file_extension):
            continue
        full_path = os.path.join(data_folder, filename)
        if not os.path.isfile(full_path):
            # A directory named like a source file cannot be hashed.
            continue
        current_files.add(full_path)
        current_hash = compute_file_hash(full_path)
        previous_hash = source_files.get(full_path, {}).get("hash")
        if current_hash == previous_hash:
            unchanged_files.append(full_path)
        else:
            # New file (no previous hash) or modified file
            changed_files.append(full_path)
    # Detect deleted files: in metadata but no longer on disk
    deleted_files = [f for f in source_files if f not in current_files]
    return changed_files, unchanged_files, deleted_files
def remove_deleted_from_metadata(metadata: Dict, deleted_files: List[str]) -> Dict:
    """
    Drop the tracking entries of files that no longer exist.

    Args:
        metadata: Metadata dict to update (modified in place)
        deleted_files: List of file paths that have been deleted

    Returns:
        The same metadata dict that was passed in
    """
    tracked = metadata.get("source_files", {})
    for gone in deleted_files:
        # Paths that were never tracked are ignored.
        if gone in tracked:
            del tracked[gone]
    return metadata
def update_metadata(
    metadata: Dict,
    file_path: str,
    function_names: List[str],
) -> Dict:
    """
    Update metadata for a specific file after ingestion.

    Records the file's current hash, modification time, size and extracted
    function names, and stamps 'last_ingestion' with the current UTC time.

    Args:
        metadata: Metadata dict to update (modified in place)
        file_path: Path to the file that was ingested
        function_names: List of function names extracted from this file

    Returns:
        Updated metadata dict (the same object that was passed in)
    """
    source_files = metadata.get("source_files")
    if not isinstance(source_files, dict):
        # Metadata without a usable "source_files" mapping: create it instead
        # of failing with a KeyError.
        source_files = metadata["source_files"] = {}
    file_hash = compute_file_hash(file_path)
    # One stat call so mtime and size describe the same state of the file.
    file_stat = os.stat(file_path)
    source_files[file_path] = {
        "hash": file_hash,
        "mtime": file_stat.st_mtime,
        "size": file_stat.st_size,
        "functions": function_names,
    }
    # ISO 8601 UTC timestamp, written with a "Z" suffix instead of "+00:00".
    metadata["last_ingestion"] = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    return metadata