Spaces:

JackSparrow89
/

Semantic_File

Sleeping

App Files Files Community

Semantic_File / indexer /crawler.py

JackSparrow89

Update indexer/crawler.py

775b78b verified about 1 month ago

raw

history blame contribute delete

3.8 kB

	# indexer/crawler.py

	import os
	import hashlib
	import yaml


	class Crawler:
	    """
	    Discovers files in configured directories and tracks which ones
	    are new or modified using SHA-256 hashing.
	    """

	    def __init__(self, config_path="config.yaml"):
	        """
	        Load the config file and store the settings as instance variables.

	        Relative entries in watch_paths and a relative data_dir are resolved
	        against the directory that contains the config file, not against
	        the current working directory.

	        Args:
	            config_path (str) — path to a YAML file providing the keys
	                watch_paths, include_extensions, skip_directories, data_dir

	        Raises:
	            OSError — if the config file cannot be opened
	            KeyError — if the loaded mapping lacks one of the keys above
	        """
	        config_path = os.path.abspath(config_path)
	        # Explicit encoding: do not let the platform locale decide how the
	        # YAML text is decoded.
	        with open(config_path, "r", encoding="utf-8") as f:
	            config = yaml.safe_load(f)

	        config_dir = os.path.dirname(config_path)
	        self.watch_paths = [
	            self._resolve(config_dir, path) for path in config["watch_paths"]
	        ]
	        self.include_extensions = config["include_extensions"]
	        self.skip_directories = config["skip_directories"]
	        self.data_dir = self._resolve(config_dir, config["data_dir"])

	    @staticmethod
	    def _resolve(base_dir, path):
	        """Return path unchanged if absolute, else normalized under base_dir."""
	        if os.path.isabs(path):
	            return path
	        return os.path.normpath(os.path.join(base_dir, path))

	    def discover_files(self):
	        """
	        Walk through all watch_paths recursively and collect every file
	        that matches include_extensions, skipping skip_directories.

	        Extension matching is an exact, case-sensitive comparison of the
	        os.path.splitext() suffix (leading dot included). Directories are
	        skipped by bare name at any depth below a watch path.

	        Returns:
	            list[str] — file paths, in os.walk order, one entry per visit
	        """
	        results = []
	        for root in self.watch_paths:
	            for dirpath, dirnames, filenames in os.walk(root):
	                # Prune in place: top-down os.walk only descends into the
	                # names still present in dirnames.
	                dirnames[:] = [
	                    d for d in dirnames if d not in self.skip_directories
	                ]
	                for filename in filenames:
	                    if os.path.splitext(filename)[1] in self.include_extensions:
	                        results.append(os.path.join(dirpath, filename))
	        return results

	    def compute_hash(self, filepath):
	        """
	        Compute the SHA-256 hash of a file's contents.

	        The file is read in 8 KiB chunks so large files are never loaded
	        into memory at once.

	        Args:
	            filepath (str) — path to the file

	        Returns:
	            str — hex string of the SHA-256 hash

	        Raises:
	            OSError — if the file cannot be opened or read
	        """
	        hasher = hashlib.sha256()
	        with open(filepath, "rb") as f:
	            while chunk := f.read(8192):
	                hasher.update(chunk)
	        return hasher.hexdigest()

	    def get_new_and_modified(self, known_hashes=None):
	        """
	        Compare discovered files against previously known hashes to find
	        which files are new or have been modified since last run.

	        Args:
	            known_hashes (dict) — {filepath: hash} from previous run
	                Pass None or {} on first run.

	        Returns:
	            tuple: (files_to_process, current_hashes, deleted_files)
	            - files_to_process: list[str] — paths that are new or changed
	            - current_hashes: dict — {filepath: hash} for ALL current files
	            - deleted_files: list[str] — sorted paths present in
	              known_hashes but no longer discovered
	        """
	        if known_hashes is None:
	            known_hashes = {}

	        files_to_process = []
	        current_hashes = {}
	        for filepath in self.discover_files():
	            # Overlapping watch_paths yield the same path more than once;
	            # hash and report each file a single time.
	            if filepath in current_hashes:
	                continue
	            file_hash = self.compute_hash(filepath)
	            current_hashes[filepath] = file_hash
	            # .get() returns None for unseen paths, which never equals a digest.
	            if known_hashes.get(filepath) != file_hash:
	                files_to_process.append(filepath)

	        # Return a list, as documented, in a deterministic order.
	        deleted_files = sorted(set(known_hashes) - set(current_hashes))

	        return files_to_process, current_hashes, deleted_files


	# --- Test it ---
	# Manual smoke test: list the discovered files, then run a first-pass
	# change detection with no previously known hashes.
	if __name__ == "__main__":
	    crawler = Crawler()
	    files = crawler.discover_files()
	    print(f"Found {len(files)} files:")
	    for f in files:
	        print(f" {f}")

	    print("\n--- Checking for new/modified ---")
	    # get_new_and_modified() returns three values; unpacking them into two
	    # names raised "ValueError: too many values to unpack".
	    to_process, _hashes, _deleted = crawler.get_new_and_modified()
	    print(f"{len(to_process)} files to process")