File size: 3,802 Bytes
775b78b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# indexer/crawler.py

import os
import hashlib
import yaml


class Crawler:
    """
    Discovers files in configured directories and tracks which ones
    are new or modified using SHA-256 hashing.

    Instance attributes (all populated from the YAML config by __init__):
        watch_paths — directories to walk, resolved to absolute paths
        include_extensions — file extensions to collect (with leading dot)
        skip_directories — directory names that are never descended into
        data_dir — resolved path of the configured data directory
    """

    def __init__(self, config_path="config.yaml"):
        """
        Load the config file and store the settings as instance variables.

        Relative ``watch_paths`` entries and a relative ``data_dir`` are
        resolved against the directory that contains the config file, not
        against the current working directory.

        Args:
            config_path (str) — path to the YAML config file

        Raises:
            OSError — if the config file cannot be opened
            KeyError — if a required key is missing from the config
        """
        config_path = os.path.abspath(config_path)
        # YAML is UTF-8 by default; do not depend on the platform locale
        # encoding, which would break non-ASCII paths on some systems.
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        config_dir = os.path.dirname(config_path)
        self.watch_paths = [
            self._resolve_path(config_dir, path) for path in config["watch_paths"]
        ]
        self.include_extensions = config["include_extensions"]
        # An empty `skip_directories:` key parses as None; treat it as
        # "skip nothing" instead of failing later with a TypeError.
        self.skip_directories = config["skip_directories"] or []
        self.data_dir = self._resolve_path(config_dir, config["data_dir"])

    @staticmethod
    def _resolve_path(base_dir, path):
        """Return path unchanged if absolute, else normalized under base_dir."""
        if os.path.isabs(path):
            return path
        return os.path.normpath(os.path.join(base_dir, path))

    def discover_files(self):
        """
        Walk through all watch_paths recursively and collect every file
        that matches include_extensions, skipping skip_directories.

        Extensions are compared case-sensitively and include the leading
        dot (the value produced by os.path.splitext). Watch paths that do
        not exist or cannot be listed contribute no files, because os.walk
        ignores listing errors by default.

        Returns:
            list[str] — file paths, each built by joining onto a watch path
        """
        results = []
        for path in self.watch_paths:
            for dirpath, dirnames, filenames in os.walk(path):
                # Prune in place: os.walk (top-down) only descends into the
                # names still present in `dirnames`.
                dirnames[:] = [d for d in dirnames if d not in self.skip_directories]
                results.extend(
                    os.path.join(dirpath, filename)
                    for filename in filenames
                    if os.path.splitext(filename)[1] in self.include_extensions
                )
        return results

    def compute_hash(self, filepath):
        """
        Compute the SHA-256 hash of a file's contents.

        The file is read in 8 KiB chunks so large files are never loaded
        into memory at once.

        Args:
            filepath (str) — path to the file

        Returns:
            str — hex string of the SHA-256 hash

        Raises:
            OSError — if the file cannot be opened or read
        """
        hasher = hashlib.sha256()
        with open(filepath, "rb") as f:
            while chunk := f.read(8192):
                hasher.update(chunk)
        return hasher.hexdigest()

    def get_new_and_modified(self, known_hashes=None):
        """
        Compare discovered files against previously known hashes to find
        which files are new or have been modified since last run.

        Args:
            known_hashes (dict) — {filepath: hash} from previous run
                                   Pass None or {} on first run.

        Returns:
            tuple: (files_to_process, current_hashes, deleted_files)
            - files_to_process: list[str] — paths that are new or changed
            - current_hashes: dict — {filepath: hash} for ALL current files
            - deleted_files: list[str] — paths present in known_hashes but
              no longer discovered, in sorted order
        """
        if known_hashes is None:
            known_hashes = {}

        files_to_process = []
        current_hashes = {}
        for file in self.discover_files():
            file_hash = self.compute_hash(file)
            if file not in known_hashes or file_hash != known_hashes[file]:
                files_to_process.append(file)
            current_hashes[file] = file_hash

        # Return a sorted list (as documented) rather than a set so the
        # result is deterministic and directly serializable.
        deleted_files = sorted(
            path for path in known_hashes if path not in current_hashes
        )

        return files_to_process, current_hashes, deleted_files


# --- Test it ---
# Manual smoke test: list the files found under the configured watch paths,
# then report how many would be processed on a first run (no known hashes).
if __name__ == "__main__":
    crawler = Crawler()
    files = crawler.discover_files()
    print(f"Found {len(files)} files:")
    for f in files:
        print(f"  {f}")

    print("\n--- Checking for new/modified ---")
    # get_new_and_modified() returns three values; unpacking into only two
    # names raised "ValueError: too many values to unpack".
    to_process, _hashes, _deleted = crawler.get_new_and_modified()
    print(f"{len(to_process)} files to process")