Spaces:

ANXLOG
/

LOGOS-SPCW-Matroska

Runtime error

LOGOS-SPCW-Matroska / build_kb.py

GitHub Copilot

Feature: Add ARCHITECTURE.md and display it in new UI tab

aeaae89 about 1 month ago

2.56 kB


	import os
	import json
	import logging
	from logos.connectors import get_connector

	# Root logger setup: INFO and above, each record prefixed with a timestamp
	# and its level name.
	logging.basicConfig(
	    level=logging.INFO,
	    format="%(asctime)s - %(levelname)s - %(message)s",
	)

	def build_knowledge_base(
	    source_dirs: list,
	    output_file: str = "logos_knowledge_base.json",
	    *,
	    ocr=None,
	    extensions: tuple = ('.png', '.jpg', '.jpeg', '.bmp'),
	) -> list:
	    """
	    Scan directories for images, OCR each one, and save a JSON knowledge base.

	    Args:
	        source_dirs: Directories to walk recursively. Entries that are not
	            existing directories are skipped with a warning.
	        output_file: Path of the JSON file to write. Missing parent
	            directories are created before writing.
	        ocr: Object exposing ``extract_text(path)`` whose result can be
	            indexed with ``'word_count'``, ``'full_text'`` and
	            ``'text_blocks'``. Defaults to ``get_connector('ocr')``.
	        extensions: File-name suffixes (matched case-insensitively) that
	            mark a file as an image. A single string is also accepted.

	    Returns:
	        The list of entry dicts that was written to ``output_file``; one
	        entry per image that was processed successfully.
	    """
	    if ocr is None:
	        ocr = get_connector('ocr')

	    # A bare string would otherwise be iterated character by character.
	    if isinstance(extensions, str):
	        extensions = (extensions,)
	    suffixes = tuple(ext.lower() for ext in extensions)

	    knowledge_base = []

	    for directory in source_dirs:
	        # isdir (not exists): a plain file would pass `exists` and then be
	        # "scanned" without yielding anything.
	        if not os.path.isdir(directory):
	            logging.warning(f"Directory not found: {directory}")
	            continue

	        logging.info(f"Scanning {directory}...")
	        for root, dirs, files in os.walk(directory):
	            # Sort in place so os.walk descends in a stable order; together
	            # with sorted(files) this makes the output reproducible.
	            dirs.sort()
	            for file in sorted(files):
	                if not file.lower().endswith(suffixes):
	                    continue

	                file_path = os.path.join(root, file)
	                try:
	                    logging.info(f"Processing {file}...")
	                    result = ocr.extract_text(file_path)
	                    entry = {
	                        "filename": file,
	                        "path": file_path,
	                        "word_count": result['word_count'],
	                        "full_text": result['full_text'],
	                        # Keep blocks for spatial context if needed
	                        "text_blocks": result['text_blocks'],
	                    }
	                except Exception as e:
	                    # Best-effort: one unreadable image must not abort the
	                    # whole run. Log with traceback and move on.
	                    logging.exception(f"Failed to process {file}: {e}")
	                    continue

	                knowledge_base.append(entry)

	    # Make sure the destination exists so a long OCR run is not lost to a
	    # missing output directory.
	    output_dir = os.path.dirname(output_file)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    with open(output_file, 'w', encoding='utf-8') as f:
	        json.dump(knowledge_base, f, indent=2, ensure_ascii=False)

	    logging.info(
	        f"Knowledge Base built! Processed {len(knowledge_base)} images. "
	        f"Saved to {output_file}"
	    )
	    return knowledge_base

	if __name__ == "__main__":
	    # Root folder that holds the note/screenshot collections. The default is
	    # the original machine-specific location; set the LOGOS_PROJECT_ROOT
	    # environment variable to point the script elsewhere without editing
	    # the source. An empty value falls back to the default.
	    PROJECT_ROOT = (
	        os.environ.get("LOGOS_PROJECT_ROOT")
	        or r"c:\Users\Nauti\Desktop\LOGOS CURSOR"
	    )
	    DIRS_TO_SCAN = [
	        os.path.join(PROJECT_ROOT, "LOGOS Screenshots"),
	        os.path.join(PROJECT_ROOT, "LOGOS Notes"),
	        os.path.join(PROJECT_ROOT, "LOGOS PRIME FUSE"),
	    ]

	    # Scan everything rather than a test subset: given "external resources
	    # are sparse, my notes are streamlined", we want as much as possible.
	    build_knowledge_base(DIRS_TO_SCAN)