Spaces:

JackSparrow89
/

Semantic_File

Sleeping

File size: 3,802 Bytes

775b78b

# indexer/crawler.py

import os
import hashlib
import yaml


class Crawler:
    """
    Discovers files in configured directories and tracks which ones
    are new or modified using SHA-256 hashing.

    Attributes (all populated from the YAML config by ``__init__``):
        watch_paths (list[str]) — root directories that are walked
        include_extensions (list[str]) — extensions to keep, in the form
            returned by ``os.path.splitext`` (leading dot, e.g. ".txt")
        skip_directories (list[str]) — directory names never descended into
        data_dir (str) — data directory path; stored here but not used by
            any method of this class
    """

    def __init__(self, config_path="config.yaml"):
        """
        Load the config file and store the settings as instance variables.

        Relative ``watch_paths`` and ``data_dir`` entries are resolved
        against the directory containing the config file, not against the
        current working directory.

        Args:
            config_path (str) — path to the YAML config file

        Raises:
            OSError — if the config file cannot be opened
            KeyError — if a required key is missing from the config
        """
        config_path = os.path.abspath(config_path)
        # Explicit encoding: without it the locale's default is used, which
        # breaks non-ASCII paths on platforms that do not default to UTF-8.
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        config_dir = os.path.dirname(config_path)
        # A key that is present but left empty in YAML loads as None;
        # treat that as "no entries" instead of failing later on iteration.
        self.watch_paths = [
            self._resolve(config_dir, path)
            for path in (config["watch_paths"] or [])
        ]
        self.include_extensions = config["include_extensions"] or []
        self.skip_directories = config["skip_directories"] or []
        self.data_dir = self._resolve(config_dir, config["data_dir"])

    @staticmethod
    def _resolve(base_dir, path):
        """Return *path* unchanged if absolute, else normalized under *base_dir*."""
        if os.path.isabs(path):
            return path
        return os.path.normpath(os.path.join(base_dir, path))

    def discover_files(self):
        """
        Walk through all watch_paths recursively and collect every file
        that matches include_extensions, skipping skip_directories.

        Extension matching is case-sensitive and uses the last suffix only
        (``os.path.splitext``). Watch paths that do not exist contribute
        nothing, because ``os.walk`` ignores listing errors by default.

        Returns:
            list[str] — file paths, each joined onto its watch path
        """
        # Build the lookup sets once instead of scanning lists per entry.
        allowed_extensions = set(self.include_extensions)
        skipped = set(self.skip_directories)

        results = []
        for root in self.watch_paths:
            for dirpath, dirnames, filenames in os.walk(root):
                # Prune in place: os.walk (top-down) only descends into the
                # names still present in ``dirnames`` after this assignment.
                dirnames[:] = [d for d in dirnames if d not in skipped]
                results.extend(
                    os.path.join(dirpath, filename)
                    for filename in filenames
                    if os.path.splitext(filename)[1] in allowed_extensions
                )
        return results

    def compute_hash(self, filepath: str) -> str:
        """
        Compute the SHA-256 hash of a file's contents.

        The file is read in 8192-byte chunks so large files are never
        loaded into memory at once.

        Args:
            filepath (str) — path to the file

        Returns:
            str — hex string of the SHA-256 hash

        Raises:
            OSError — if the file cannot be opened or read
        """
        hasher = hashlib.sha256()
        with open(filepath, "rb") as f:
            while chunk := f.read(8192):
                hasher.update(chunk)
        return hasher.hexdigest()

    def get_new_and_modified(self, known_hashes=None):
        """
        Compare discovered files against previously known hashes to find
        which files are new or have been modified since last run.

        Args:
            known_hashes (dict) — {filepath: hash} from previous run
                                   Pass None or {} on first run.

        Returns:
            tuple: (files_to_process, current_hashes, deleted_files)
            - files_to_process: list[str] — paths that are new or changed
            - current_hashes: dict — {filepath: hash} for ALL current files
            - deleted_files: list[str] — paths present in known_hashes but
              no longer discovered, in known_hashes order
        """
        if known_hashes is None:
            known_hashes = {}

        files_to_process = []
        current_hashes = {}
        for path in self.discover_files():
            digest = self.compute_hash(path)
            current_hashes[path] = digest
            # .get() yields None for unseen paths, which never equals a
            # hex digest, so new and modified files share one comparison.
            if known_hashes.get(path) != digest:
                files_to_process.append(path)

        # Return a list, as documented, with a deterministic order.
        deleted_files = [path for path in known_hashes if path not in current_hashes]

        return files_to_process, current_hashes, deleted_files


# --- Manual smoke test: run this module directly to exercise the crawler ---
if __name__ == "__main__":
    # Uses the default "config.yaml" resolved against the current directory.
    crawler = Crawler()
    files = crawler.discover_files()
    print(f"Found {len(files)} files:")
    for f in files:
        print(f"  {f}")

    print("\n--- Checking for new/modified ---")
    # get_new_and_modified() returns a 3-tuple; unpacking it into only two
    # names raises ValueError, so the deleted-files element is bound as well.
    to_process, hashes, _deleted = crawler.get_new_and_modified()
    print(f"{len(to_process)} files to process")