Spaces:

ANXLOG
/

LOGOS-SPCW-Matroska

Runtime error

File size: 2,561 Bytes

aeaae89


import os
import json
import logging
from logos.connectors import get_connector

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def _iter_image_paths(directory, suffixes: tuple):
    """Yield ``(filename, full_path)`` for matching files under *directory*.

    A file matches when its lower-cased name ends with one of *suffixes*.
    Sub-directories and file names are visited in sorted order so that
    repeated runs over the same tree produce the same sequence.
    """
    for root, dirs, files in os.walk(directory):
        # Sorting in place steers the order of os.walk's top-down descent.
        dirs.sort()
        for name in sorted(files):
            if name.lower().endswith(suffixes):
                yield name, os.path.join(root, name)


def build_knowledge_base(
    source_dirs: list,
    output_file: str = "logos_knowledge_base.json",
    *,
    ocr=None,
    extensions: tuple = ('.png', '.jpg', '.jpeg', '.bmp'),
) -> list:
    """
    Scans directories for images, applies OCR, and saves a JSON knowledge base.

    Args:
        source_dirs: Directories to scan recursively. A single path (str or
            os.PathLike) is also accepted and treated as a one-item list.
            Entries that are not existing directories are logged and skipped.
        output_file: Path of the JSON file to write. Missing parent
            directories are created.
        ocr: Optional object exposing ``extract_text(path)`` that returns a
            mapping with the keys ``word_count``, ``full_text`` and
            ``text_blocks``. When omitted, ``get_connector('ocr')`` is used.
        extensions: File-name suffixes (matched case-insensitively) that
            identify the images to process.

    Returns:
        The list of entries written to ``output_file``; one dict per
        successfully processed image.

    Raises:
        OSError: If the output file cannot be written.
        TypeError, ValueError: If the collected OCR results cannot be
            serialised to JSON.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(source_dirs, (str, os.PathLike)):
        source_dirs = [source_dirs]
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    if ocr is None:
        ocr = get_connector('ocr')

    knowledge_base = []
    for directory in source_dirs:
        # isdir (not exists): os.walk silently yields nothing for a plain file.
        if not os.path.isdir(directory):
            logging.warning(f"Directory not found: {directory}")
            continue

        logging.info(f"Scanning {directory}...")
        for file, file_path in _iter_image_paths(directory, suffixes):
            # Best effort: one unreadable image (or a result missing an
            # expected key) must not abort the whole batch, so any failure
            # is logged and the file is skipped.
            try:
                logging.info(f"Processing {file}...")
                result = ocr.extract_text(file_path)
                entry = {
                    "filename": file,
                    "path": file_path,
                    "word_count": result['word_count'],
                    "full_text": result['full_text'],
                    # Keep blocks for spatial context if needed
                    "text_blocks": result['text_blocks'],
                }
            except Exception as e:
                logging.error(f"Failed to process {file}: {e}")
                continue
            knowledge_base.append(entry)

    # Serialise before opening the file so that a serialisation error cannot
    # leave a truncated (or wipe an existing) knowledge base on disk.
    payload = json.dumps(knowledge_base, indent=2, ensure_ascii=False)
    os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(payload)

    logging.info(
        f"Knowledge Base built! Processed {len(knowledge_base)} images. "
        f"Saved to {output_file}"
    )
    return knowledge_base

if __name__ == "__main__":
    import sys  # local import: only the script entry point reads argv

    # Default source directories - these absolute paths point at the
    # author's notes and will not exist on other machines.
    PROJECT_ROOT = r"c:\Users\Nauti\Desktop\LOGOS CURSOR"
    DEFAULT_DIRS = [
        os.path.join(PROJECT_ROOT, name)
        for name in ("LOGOS Screenshots", "LOGOS Notes", "LOGOS PRIME FUSE")
    ]

    # Usage: python <script> [DIR ...]
    # Directories given on the command line replace the defaults, so the
    # script can run elsewhere (or on a small subset for testing) without
    # editing the source. With no arguments every default directory is
    # scanned, since we want as much of the notes as possible.
    DIRS_TO_SCAN = sys.argv[1:] or DEFAULT_DIRS

    build_knowledge_base(DIRS_TO_SCAN)