# LOGOS-SPCW-Matroska / build_kb.py
# GitHub Copilot
# Feature: Add ARCHITECTURE.md and display it in new UI tab
# aeaae89
import os
import json
import logging
from logos.connectors import get_connector
# Set up root-logger output: INFO level and above, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
def build_knowledge_base(source_dirs: list, output_file: str = "logos_knowledge_base.json"):
    """
    Scan directories for images, OCR each one, and save a JSON knowledge base.

    Every directory in ``source_dirs`` is walked recursively. Each file whose
    name ends in .png, .jpg, .jpeg or .bmp (case-insensitive) is passed to the
    ``extract_text`` method of the 'ocr' connector. The result is indexed by
    the keys ``word_count``, ``full_text`` and ``text_blocks`` and stored,
    together with the file name and path, as one entry. All entries are then
    written to ``output_file`` as a UTF-8 JSON list.

    Args:
        source_dirs: Directories to scan. A single path given as a string or
            path-like object is treated as a one-element list. Entries that
            are not existing directories are skipped with a warning.
        output_file: Path of the JSON file to write.

    A file that fails during OCR or entry construction is logged as an error
    and skipped; it does not abort the run.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp')

    # A bare string would otherwise be iterated character by character.
    if isinstance(source_dirs, (str, os.PathLike)):
        source_dirs = [source_dirs]

    ocr = get_connector('ocr')
    knowledge_base = []

    for directory in source_dirs:
        # isdir rather than exists: a regular file passes exists(), but
        # os.walk() would then silently yield nothing for it.
        if not os.path.isdir(directory):
            logging.warning(f"Directory not found: {directory}")
            continue

        logging.info(f"Scanning {directory}...")
        for root, subdirs, files in os.walk(directory):
            # Sort in place so the traversal (and therefore the order of
            # entries in the output file) is reproducible across runs.
            subdirs.sort()
            for file in sorted(files):
                if not file.lower().endswith(image_suffixes):
                    continue

                file_path = os.path.join(root, file)
                try:
                    logging.info(f"Processing {file}...")
                    result = ocr.extract_text(file_path)
                    entry = {
                        "filename": file,
                        "path": file_path,
                        "word_count": result['word_count'],
                        "full_text": result['full_text'],
                        # Keep blocks for spatial context if needed
                        "text_blocks": result['text_blocks'],
                    }
                except Exception as e:
                    # Best effort per file: one bad image must not stop the
                    # whole scan.
                    logging.error(f"Failed to process {file}: {e}")
                else:
                    knowledge_base.append(entry)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(knowledge_base, f, indent=2, ensure_ascii=False)

    logging.info(
        f"Knowledge Base built! Processed {len(knowledge_base)} images. Saved to {output_file}"
    )
if __name__ == "__main__":
    # Source directories: adjust these to wherever the notes actually live.
    # The project root is a hard-coded absolute path.
    PROJECT_ROOT = r"c:\Users\Nauti\Desktop\LOGOS CURSOR"
    DIRS_TO_SCAN = [
        os.path.join(PROJECT_ROOT, folder)
        for folder in ("LOGOS Screenshots", "LOGOS Notes", "LOGOS PRIME FUSE")
    ]

    # No test subset here: every listed directory is scanned so that as much
    # of the notes as possible ends up in the knowledge base.
    build_knowledge_base(DIRS_TO_SCAN)