JackSparrow89 committed on
Commit
775b78b
·
verified ·
1 Parent(s): 1bbc850

Update indexer/crawler.py

Browse files
Files changed (1) hide show
  1. indexer/crawler.py +108 -102
indexer/crawler.py CHANGED
@@ -1,102 +1,108 @@
1
- # indexer/crawler.py
2
-
3
- import os
4
- import hashlib
5
- import yaml
6
-
7
-
8
- class Crawler:
9
- """
10
- Discovers files in configured directories and tracks which ones
11
- are new or modified using SHA-256 hashing.
12
- """
13
-
14
- def __init__(self, config_path="config.yaml"):
15
- """
16
- Load the config file and store the settings as instance variables.
17
- """
18
- with open(config_path, "r") as f:
19
- config = yaml.safe_load(f)
20
-
21
- self.watch_paths = config["watch_paths"]
22
- self.include_extensions = config["include_extensions"]
23
- self.skip_directories = config["skip_directories"]
24
- self.data_dir = config["data_dir"]
25
-
26
- def discover_files(self):
27
- """
28
- Walk through all watch_paths recursively and collect every file
29
- that matches include_extensions, skipping skip_directories.
30
-
31
- Returns:
32
- list[str] — list of absolute file paths
33
- """
34
- results=[]
35
- for path in self.watch_paths:
36
- for dirpath, dirnames, filenames in os.walk(path):
37
- for filename in filenames:
38
- if os.path.splitext(filename)[1] in self.include_extensions:
39
- full_path = os.path.join(dirpath, filename)
40
- results.append(full_path)
41
- dirnames[:] = [d for d in dirnames if d not in self.skip_directories]
42
- return results
43
-
44
-
45
- def compute_hash(self, filepath):
46
- """
47
- Compute the SHA-256 hash of a file's contents.
48
-
49
- Args:
50
- filepath (str) — absolute path to the file
51
-
52
- Returns:
53
- str — hex string of the SHA-256 hash
54
- """
55
- hasher = hashlib.sha256()
56
- with open(filepath, "rb") as f:
57
- while chunk := f.read(8192):
58
- hasher.update(chunk)
59
- return hasher.hexdigest()
60
-
61
- def get_new_and_modified(self, known_hashes=None):
62
- """
63
- Compare discovered files against previously known hashes to find
64
- which files are new or have been modified since last run.
65
-
66
- Args:
67
- known_hashes (dict) — {filepath: hash} from previous run
68
- Pass None or {} on first run.
69
-
70
- Returns:
71
- tuple: (files_to_process, current_hashes, deleted_files)
72
- - files_to_process: list[str] — paths that are new or changed
73
- - current_hashes: dict — {filepath: hash} for ALL current files
74
- - deleted files: list[str] — files that were deleted
75
- """
76
- if known_hashes is None:
77
- known_hashes = {}
78
- current_files = self.discover_files()
79
- files_to_process = []
80
- current_hashes = {}
81
- for file in current_files:
82
- file_hash = self.compute_hash(file)
83
- if file not in known_hashes or file_hash != known_hashes[file]:
84
- files_to_process.append(file)
85
- current_hashes[file] = file_hash
86
-
87
- deleted_files = set(known_hashes.keys()) - set(current_hashes.keys())
88
-
89
- return files_to_process, current_hashes, deleted_files
90
-
91
-
92
- # --- Test it ---
93
- if __name__ == "__main__":
94
- crawler = Crawler()
95
- files = crawler.discover_files()
96
- print(f"Found {len(files)} files:")
97
- for f in files:
98
- print(f" {f}")
99
-
100
- print("\n--- Checking for new/modified ---")
101
- to_process, hashes = crawler.get_new_and_modified()
102
- print(f"{len(to_process)} files to process")
 
 
 
 
 
 
 
1
+ # indexer/crawler.py
2
+
3
+ import os
4
+ import hashlib
5
+ import yaml
6
+
7
+
8
+ class Crawler:
9
+ """
10
+ Discovers files in configured directories and tracks which ones
11
+ are new or modified using SHA-256 hashing.
12
+ """
13
+
14
+ def __init__(self, config_path="config.yaml"):
15
+ """
16
+ Load the config file and store the settings as instance variables.
17
+ """
18
+ config_path = os.path.abspath(config_path)
19
+ with open(config_path, "r") as f:
20
+ config = yaml.safe_load(f)
21
+
22
+ config_dir = os.path.dirname(config_path)
23
+ self.watch_paths = [
24
+ path if os.path.isabs(path) else os.path.normpath(os.path.join(config_dir, path))
25
+ for path in config["watch_paths"]
26
+ ]
27
+ self.include_extensions = config["include_extensions"]
28
+ self.skip_directories = config["skip_directories"]
29
+ data_dir = config["data_dir"]
30
+ self.data_dir = data_dir if os.path.isabs(data_dir) else os.path.normpath(os.path.join(config_dir, data_dir))
31
+
32
+ def discover_files(self):
33
+ """
34
+ Walk through all watch_paths recursively and collect every file
35
+ that matches include_extensions, skipping skip_directories.
36
+
37
+ Returns:
38
+ list[str] — list of absolute file paths
39
+ """
40
+ results=[]
41
+ for path in self.watch_paths:
42
+ for dirpath, dirnames, filenames in os.walk(path):
43
+ for filename in filenames:
44
+ if os.path.splitext(filename)[1] in self.include_extensions:
45
+ full_path = os.path.join(dirpath, filename)
46
+ results.append(full_path)
47
+ dirnames[:] = [d for d in dirnames if d not in self.skip_directories]
48
+ return results
49
+
50
+
51
+ def compute_hash(self, filepath):
52
+ """
53
+ Compute the SHA-256 hash of a file's contents.
54
+
55
+ Args:
56
+ filepath (str) — absolute path to the file
57
+
58
+ Returns:
59
+ str — hex string of the SHA-256 hash
60
+ """
61
+ hasher = hashlib.sha256()
62
+ with open(filepath, "rb") as f:
63
+ while chunk := f.read(8192):
64
+ hasher.update(chunk)
65
+ return hasher.hexdigest()
66
+
67
+ def get_new_and_modified(self, known_hashes=None):
68
+ """
69
+ Compare discovered files against previously known hashes to find
70
+ which files are new or have been modified since last run.
71
+
72
+ Args:
73
+ known_hashes (dict) — {filepath: hash} from previous run
74
+ Pass None or {} on first run.
75
+
76
+ Returns:
77
+ tuple: (files_to_process, current_hashes, deleted_files)
78
+ - files_to_process: list[str] — paths that are new or changed
79
+ - current_hashes: dict — {filepath: hash} for ALL current files
80
+ - deleted files: list[str] — files that were deleted
81
+ """
82
+ if known_hashes is None:
83
+ known_hashes = {}
84
+ current_files = self.discover_files()
85
+ files_to_process = []
86
+ current_hashes = {}
87
+ for file in current_files:
88
+ file_hash = self.compute_hash(file)
89
+ if file not in known_hashes or file_hash != known_hashes[file]:
90
+ files_to_process.append(file)
91
+ current_hashes[file] = file_hash
92
+
93
+ deleted_files = set(known_hashes.keys()) - set(current_hashes.keys())
94
+
95
+ return files_to_process, current_hashes, deleted_files
96
+
97
+
98
+ # --- Test it ---
99
+ if __name__ == "__main__":
100
+ crawler = Crawler()
101
+ files = crawler.discover_files()
102
+ print(f"Found {len(files)} files:")
103
+ for f in files:
104
+ print(f" {f}")
105
+
106
+ print("\n--- Checking for new/modified ---")
107
+ to_process, hashes = crawler.get_new_and_modified()
108
+ print(f"{len(to_process)} files to process")