Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| import logging | |
| from logos.connectors import get_connector | |
# Root logger setup: emit INFO and above, each record prefixed with its
# timestamp and severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def build_knowledge_base(source_dirs: list, output_file: str = "logos_knowledge_base.json"):
    """
    Scan directories for images, apply OCR, and save a JSON knowledge base.

    Each directory in ``source_dirs`` is walked recursively. Every file whose
    name ends in .png, .jpg, .jpeg or .bmp (case-insensitive) is passed to the
    ``extract_text`` method of the 'ocr' connector. The ``word_count``,
    ``full_text`` and ``text_blocks`` items of the result are stored together
    with the file name and path. A file that fails to process is logged and
    skipped; it does not abort the run.

    Args:
        source_dirs: Directories to scan. Entries that are not existing
            directories are skipped with a warning.
        output_file: Path of the JSON file to write (UTF-8, 2-space indent,
            non-ASCII characters kept as-is).
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp')
    ocr = get_connector('ocr')
    knowledge_base = []

    for directory in source_dirs:
        # isdir rather than exists: a plain file passes an existence check,
        # but os.walk yields nothing for it, so it would be "scanned" silently.
        if not os.path.isdir(directory):
            logging.warning(f"Directory not found: {directory}")
            continue

        logging.info(f"Scanning {directory}...")
        for root, dirs, files in os.walk(directory):
            # Sorting in place fixes the traversal order, so the output file
            # is reproducible regardless of filesystem enumeration order.
            dirs.sort()
            for file in sorted(files):
                if not file.lower().endswith(image_extensions):
                    continue

                file_path = os.path.join(root, file)
                try:
                    logging.info(f"Processing {file}...")
                    result = ocr.extract_text(file_path)
                    entry = {
                        "filename": file,
                        "path": file_path,
                        "word_count": result['word_count'],
                        "full_text": result['full_text'],
                        # Keep blocks for spatial context if needed
                        "text_blocks": result['text_blocks'],
                    }
                except Exception as e:
                    # Deliberately broad: one unreadable image must not stop
                    # the rest of the batch.
                    logging.error(f"Failed to process {file}: {e}")
                else:
                    knowledge_base.append(entry)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(knowledge_base, f, indent=2, ensure_ascii=False)

    logging.info(
        f"Knowledge Base built! Processed {len(knowledge_base)} images. Saved to {output_file}"
    )
if __name__ == "__main__":
    # Absolute locations of the user's note folders. These are machine-specific;
    # edit PROJECT_ROOT (and the folder names below) to match where the notes
    # actually live.
    PROJECT_ROOT = r"c:\Users\Nauti\Desktop\LOGOS CURSOR"
    DIRS_TO_SCAN = [
        os.path.join(PROJECT_ROOT, folder)
        for folder in ("LOGOS Screenshots", "LOGOS Notes", "LOGOS PRIME FUSE")
    ]

    # No test subset here: external resources are sparse and the notes are
    # streamlined, so every folder is scanned in full.
    build_knowledge_base(DIRS_TO_SCAN)