""" Document Loading and Preparation for Knowledge Base Handles: - Loading documents from various sources - Parsing and chunking - Metadata extraction """ import os from typing import List, Dict, Any, Optional from pathlib import Path import json import logging from llama_index.core.schema import Document logger = logging.getLogger(__name__) class DocumentLoader: """Load and prepare documents for indexing""" SUPPORTED_FORMATS = {'.md', '.txt', '.json', '.pdf'} @staticmethod def load_markdown_documents(directory: str) -> List[Document]: """ Load markdown documents from directory Args: directory: Path to markdown files Returns: List of Document objects """ documents = [] path = Path(directory) if not path.exists(): logger.error(f"Directory not found: {directory}") return documents for md_file in path.glob("**/*.md"): try: with open(md_file, 'r', encoding='utf-8') as f: content = f.read() doc = Document( text=content, metadata={ "source": str(md_file), "type": "markdown", "filename": md_file.name, } ) documents.append(doc) logger.debug(f"Loaded: {md_file.name}") except Exception as e: logger.error(f"Error loading {md_file}: {e}") logger.info(f"Loaded {len(documents)} markdown documents") return documents @staticmethod def load_text_documents(directory: str) -> List[Document]: """ Load text documents from directory Args: directory: Path to text files Returns: List of Document objects """ documents = [] path = Path(directory) if not path.exists(): logger.error(f"Directory not found: {directory}") return documents for txt_file in path.glob("**/*.txt"): try: with open(txt_file, 'r', encoding='utf-8') as f: content = f.read() doc = Document( text=content, metadata={ "source": str(txt_file), "type": "text", "filename": txt_file.name, } ) documents.append(doc) logger.debug(f"Loaded: {txt_file.name}") except Exception as e: logger.error(f"Error loading {txt_file}: {e}") logger.info(f"Loaded {len(documents)} text documents") return documents @staticmethod def load_json_documents(directory: str) -> List[Document]: """ Load JSON documents (product data, etc) Args: directory: Path to JSON files Returns: List of Document objects """ documents = [] path = Path(directory) if not path.exists(): logger.error(f"Directory not found: {directory}") return documents for json_file in path.glob("**/*.json"): try: with open(json_file, 'r', encoding='utf-8') as f: data = json.load(f) # Convert JSON to readable text if isinstance(data, dict): content = json.dumps(data, indent=2) elif isinstance(data, list): content = json.dumps(data, indent=2) else: content = str(data) doc = Document( text=content, metadata={ "source": str(json_file), "type": "json", "filename": json_file.name, } ) documents.append(doc) logger.debug(f"Loaded: {json_file.name}") except Exception as e: logger.error(f"Error loading {json_file}: {e}") logger.info(f"Loaded {len(documents)} JSON documents") return documents @staticmethod def load_documents_from_urls(urls: List[str]) -> List[Document]: """ Load documents from URLs Args: urls: List of URLs to load Returns: List of Document objects """ documents = [] try: from llama_index.readers.web import SimpleWebPageReader for url in urls: try: reader = SimpleWebPageReader() docs = reader.load_data([url]) for doc in docs: doc.metadata["source"] = url documents.append(doc) logger.debug(f"Loaded: {url}") except Exception as e: logger.error(f"Error loading URL {url}: {e}") logger.info(f"Loaded {len(documents)} documents from URLs") except ImportError: logger.warning("SimpleWebPageReader not available. 
    @staticmethod
    def create_product_documents(products: List[Dict[str, Any]]) -> List[Document]:
        """
        Create documents from product data

        Args:
            products: List of product dictionaries

        Returns:
            List of Document objects
        """
        documents = []

        for product in products:
            # Format product info as readable text
            text_parts = []

            if 'name' in product:
                text_parts.append(f"Product: {product['name']}")
            if 'description' in product:
                text_parts.append(f"Description: {product['description']}")
            if 'price' in product:
                text_parts.append(f"Price: {product['price']}")
            if 'category' in product:
                text_parts.append(f"Category: {product['category']}")
            if 'features' in product:
                features = product['features']
                if isinstance(features, list):
                    text_parts.append("Features: " + ", ".join(features))
                else:
                    text_parts.append(f"Features: {features}")
            if 'tags' in product:
                tags = product['tags']
                if isinstance(tags, list):
                    text_parts.append("Tags: " + ", ".join(tags))
                else:
                    text_parts.append(f"Tags: {tags}")

            if text_parts:
                doc = Document(
                    text="\n".join(text_parts),
                    metadata={
                        "type": "product",
                        "product_id": product.get('id', 'unknown'),
                        "product_name": product.get('name', 'unknown'),
                        # Carry any remaining product fields through as metadata
                        **{k: v for k, v in product.items()
                           if k not in ['name', 'description', 'price',
                                        'category', 'features', 'tags']}
                    }
                )
                documents.append(doc)

        logger.info(f"Created {len(documents)} product documents")
        return documents

    @staticmethod
    def load_all_documents(
        docs_dir: Optional[str] = None,
        products: Optional[List[Dict[str, Any]]] = None,
        urls: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Load documents from all sources

        Args:
            docs_dir: Directory containing documentation
            products: List of products to index
            urls: List of URLs to load

        Returns:
            Combined list of Document objects
        """
        all_documents = []

        # Load directory documents
        if docs_dir and os.path.exists(docs_dir):
            all_documents.extend(DocumentLoader.load_markdown_documents(docs_dir))
            all_documents.extend(DocumentLoader.load_text_documents(docs_dir))
            all_documents.extend(DocumentLoader.load_json_documents(docs_dir))

        # Load product documents
        if products:
            all_documents.extend(DocumentLoader.create_product_documents(products))

        # Load URL documents
        if urls:
            all_documents.extend(DocumentLoader.load_documents_from_urls(urls))

        logger.info(f"Loaded {len(all_documents)} documents in total")
        return all_documents
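

if __name__ == "__main__":
    # Usage sketch: the path and product data below are placeholders for
    # illustration; substitute your own project's sources.
    logging.basicConfig(level=logging.INFO)

    sample_products = [
        {
            "id": "p-001",
            "name": "Example Widget",
            "description": "Placeholder product used only for this demo.",
            "price": 19.99,
            "category": "demo",
            "features": ["lightweight", "durable"],
            "tags": ["sample"],
        }
    ]

    docs = DocumentLoader.load_all_documents(
        docs_dir="./docs",        # hypothetical docs directory
        products=sample_products,
        urls=None,                # e.g. ["https://example.com/help"]
    )
    print(f"Prepared {len(docs)} documents for indexing")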