import os
import json
import logging

from logos.connectors import get_connector

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Lower-case file extensions that are treated as images and sent to OCR.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp')


def build_knowledge_base(source_dirs: list, output_file: str = "logos_knowledge_base.json") -> None:
    """Scan directories for images, apply OCR, and save a JSON knowledge base.

    Every directory in ``source_dirs`` is walked recursively. Each file whose
    name ends with one of ``IMAGE_EXTENSIONS`` (case-insensitive) is passed to
    the ``'ocr'`` connector's ``extract_text``; the ``word_count``,
    ``full_text`` and ``text_blocks`` entries of the result are stored together
    with the file name and path. Directories that do not exist are skipped with
    a warning, and a file whose processing fails is logged and skipped, so one
    bad image does not abort the run.

    Args:
        source_dirs: Directories to scan.
        output_file: Path of the JSON file to write (UTF-8, indented,
            non-ASCII characters kept as-is). An existing file is overwritten.

    Raises:
        OSError: If the output directory cannot be created or the output file
            cannot be written.
    """
    ocr = get_connector('ocr')
    knowledge_base = []

    # Create the destination folder up front so a bad output path fails before
    # the OCR pass rather than after all the work has been done.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for directory in source_dirs:
        if not os.path.exists(directory):
            logging.warning("Directory not found: %s", directory)
            continue

        logging.info("Scanning %s...", directory)
        for root, dirs, files in os.walk(directory):
            # os.walk yields entries in filesystem order; sorting (in place for
            # the sub-directories it will descend into) keeps the generated
            # JSON stable from run to run.
            dirs.sort()
            for file in sorted(files):
                if not file.lower().endswith(IMAGE_EXTENSIONS):
                    continue

                file_path = os.path.join(root, file)
                try:
                    logging.info("Processing %s...", file)
                    result = ocr.extract_text(file_path)
                    entry = {
                        "filename": file,
                        "path": file_path,
                        "word_count": result['word_count'],
                        "full_text": result['full_text'],
                        # Keep blocks for spatial context if needed
                        "text_blocks": result['text_blocks'],
                    }
                except Exception as e:
                    # Deliberately broad: this is a best-effort batch job and
                    # the connector's failure modes are not known here. The
                    # traceback is logged so failures remain diagnosable.
                    logging.exception("Failed to process %s: %s", file, e)
                else:
                    knowledge_base.append(entry)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(knowledge_base, f, indent=2, ensure_ascii=False)

    logging.info(
        "Knowledge Base built! Processed %d images. Saved to %s",
        len(knowledge_base),
        output_file,
    )


if __name__ == "__main__":
    # Source directories to scan. Adjust these absolute paths to wherever the
    # user's notes actually live.
    PROJECT_ROOT = r"c:\Users\Nauti\Desktop\LOGOS CURSOR"

    DIRS_TO_SCAN = [
        os.path.join(PROJECT_ROOT, "LOGOS Screenshots"),
        os.path.join(PROJECT_ROOT, "LOGOS Notes"),
        os.path.join(PROJECT_ROOT, "LOGOS PRIME FUSE"),
    ]

    # All directories are scanned in one go rather than a test subset first:
    # external resources are sparse and the notes are streamlined, so we want
    # as much material as possible in the knowledge base.
    build_knowledge_base(DIRS_TO_SCAN)