# NOTE(review): the three lines below are pasted commit/UI metadata, not code;
# commented out so the file parses as Python. Original text preserved:
# GitHub Copilot
# Upgrade to Protocol 26: Gödel-Zeta Datastore & Recursive Manifold
# 66b508d
"""
logos/indexer.py - The Codebase Aggregator
Protocol 4: Automated Context indexing
This script creates a monolithic "Dump File" of the entire project to facilitate
LLM ingestion and token indexing.
Workflow:
1. Traverses the project root.
2. Filters for "Signal" (Source code, Docs) vs "Noise" (Venv, Data, Cache).
3. Aggregates content into a structured markdown file.
4. Provides Token Estimation.
Output: 'project_context_dump.md'
"""
import os
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import pathspec
from logos.connectors import get_connector
# Configuration
PROJECT_ROOT: str = "."  # walk starts here; run the script from the repo root
OUTPUT_FILE: str = "project_context_dump.md"  # aggregated dump is written here

# Whitelist: Only these extensions contain "Logic/Knowledge"
INCLUDE_EXTENSIONS: set = {
    '.py',   # Source Logic
    '.md',   # Documentation & Knowledge Base
    '.txt',  # Requirements / Notes
    '.bat',  # Orchestration
    '.sh',   # Scripts
    '.yaml', # Config
    '.json'  # Config (Be careful with data files)
}

# Blacklist: Always ignore these directories (pruned in-place during os.walk)
IGNORE_DIRS: set = {
    '.git',
    '.venv',
    '__pycache__',
    'node_modules',
    '_archive',           # Legacy code
    'LOGOS Screenshots',  # Binary Data
    'LOGOS Notes',        # Binary Data (Ingested via ingest_knowledge.py)
    '.gemini',            # Agent memory
    'artifacts'           # Agent outputs
}

# Max file size to include (avoid dumping huge data files by accident)
MAX_FILE_SIZE: int = 100 * 1024  # 100KB
def load_gitignore(root):
    """Return a pathspec.PathSpec built from *root*/.gitignore, or None if absent.

    The returned spec is used by main() to honour the project's own
    exclusion rules on top of the hard-coded blacklists.
    """
    gitignore = os.path.join(root, ".gitignore")
    if os.path.exists(gitignore):
        # Explicit encoding: .gitignore is plain text and the platform
        # default (locale) encoding can mis-decode it, e.g. on Windows.
        with open(gitignore, 'r', encoding="utf-8") as f:
            return pathspec.PathSpec.from_lines('gitwildmatch', f)
    return None
def estimate_tokens(text):
    """Roughly estimate the LLM token count of *text*.

    Uses the common heuristic that one token is about four characters
    of English text; the result is a floor, not an exact count.
    """
    chars_per_token = 4
    return len(text) // chars_per_token
def main():
    """Aggregate all whitelisted project files into OUTPUT_FILE.

    Applies four filters, in order: directory blacklist (IGNORE_DIRS),
    extension whitelist (INCLUDE_EXTENSIONS), the project's .gitignore
    rules (if any), and the MAX_FILE_SIZE cap. Prints per-file progress
    and a final token-estimate summary.
    """
    print("--- LOGOS Indexing Protocol ---")
    print(f"Root: {os.path.abspath(PROJECT_ROOT)}")
    spec = load_gitignore(PROJECT_ROOT)
    total_files = 0
    total_chars = 0
    out_abs = os.path.abspath(OUTPUT_FILE)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        # Header
        out.write("# LOGOS Project Context Dump\n")
        out.write(f"Generated: {out_abs}\n\n")
        for root, dirs, files in os.walk(PROJECT_ROOT):
            # 1. Prune blacklisted directories in-place so os.walk never descends.
            dirs[:] = [d for d in dirs if d not in IGNORE_DIRS]
            for file in files:
                ext = os.path.splitext(file)[1].lower()
                # 2. Extension whitelist.
                if ext not in INCLUDE_EXTENSIONS:
                    continue
                filepath = os.path.join(root, file)
                relpath = os.path.relpath(filepath, PROJECT_ROOT)
                # BUGFIX: never ingest the dump itself. It is an '.md' file in
                # the project root, so on a second run it would otherwise be
                # re-indexed into itself, bloating every subsequent dump.
                if os.path.abspath(filepath) == out_abs:
                    continue
                # 3. Respect .gitignore exclusions, when a spec was loaded.
                if spec and spec.match_file(relpath):
                    continue
                # 4. Size cap: avoid dumping huge data files by accident.
                try:
                    size = os.path.getsize(filepath)
                    if size > MAX_FILE_SIZE:
                        print(f"[SKIP] Too large: {relpath} ({size/1024:.1f}KB)")
                        continue
                    # 5. Ingestion (errors='ignore' tolerates stray non-UTF-8 bytes).
                    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
                        content = f.read()
                    # Format: banner + fenced code block tagged with the extension.
                    out.write("================================================================================\n")
                    out.write(f"FILE: {relpath}\n")
                    out.write("================================================================================\n")
                    out.write(f"```{ext[1:]}\n")
                    out.write(content)
                    out.write("\n```\n\n")
                    total_files += 1
                    total_chars += len(content)
                    print(f"[INDEX] Added: {relpath}")
                except Exception as e:
                    # Best-effort boundary: a single unreadable file must not
                    # abort the whole dump; warn and move on.
                    print(f"[WARN] Could not read {relpath}: {e}")
    # Stats (same ~4-chars-per-token heuristic as estimate_tokens).
    tokens = total_chars // 4
    print("\n--- Indexing Complete ---")
    print(f"Files Processed: {total_files}")
    print(f"Total Characters: {total_chars:,}")
    print(f"Estimated Tokens: {tokens:,}")
    print(f"Output: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()