Spaces:

MCP-1st-Birthday
/

ecomcp

Sleeping

File size: 9,167 Bytes

108d8af

"""
Document Loading and Preparation for Knowledge Base

Handles:
- Loading documents from various sources
- Parsing and chunking
- Metadata extraction
"""

import os
from typing import List, Dict, Any, Optional
from pathlib import Path
import json
import logging

from llama_index.core.schema import Document

logger = logging.getLogger(__name__)


class DocumentLoader:
    """
    Load and prepare documents for indexing

    Every method is a ``@staticmethod``; the class only groups the loaders
    under one name. Loading is best-effort: a file or URL that fails to load
    is logged and skipped so one bad input does not abort the whole batch.
    """

    # NOTE(review): '.pdf' is listed here, but no PDF loader is defined in this
    # class and nothing in this class reads the set -- confirm how callers use it.
    SUPPORTED_FORMATS = {'.md', '.txt', '.json', '.pdf'}

    @staticmethod
    def _load_directory(directory, pattern, doc_type, label, to_text=None):
        """
        Shared implementation of the directory-based loaders

        Args:
            directory: Root directory to search
            pattern: Glob pattern relative to ``directory`` (e.g. ``"**/*.md"``)
            doc_type: Value stored under the ``"type"`` metadata key
            label: Human-readable kind used in the summary log line
            to_text: Optional callable taking the raw file text and returning
                the text to index; when omitted the raw text is used as-is

        Returns:
            List of Document objects, ordered by file path
        """
        documents: List[Document] = []
        root = Path(directory)

        # is_dir() rather than exists(): a path that points at a regular file
        # is not a usable documents directory either.
        if not root.is_dir():
            logger.error("Directory not found: %s", directory)
            return documents

        # glob() yields entries in filesystem-dependent order; sorting keeps
        # the resulting document order reproducible between runs.
        for file_path in sorted(root.glob(pattern)):
            # A directory whose name matches the pattern is not a document.
            if not file_path.is_file():
                continue

            try:
                # utf-8-sig reads plain UTF-8 unchanged and drops a leading
                # BOM, which would otherwise break JSON parsing and leak
                # into the indexed text.
                with open(file_path, 'r', encoding='utf-8-sig') as f:
                    raw = f.read()

                content = raw if to_text is None else to_text(raw)

                documents.append(
                    Document(
                        text=content,
                        metadata={
                            "source": str(file_path),
                            "type": doc_type,
                            "filename": file_path.name,
                        },
                    )
                )
                logger.debug("Loaded: %s", file_path.name)

            except Exception as e:
                # Deliberately broad: skip the bad file, keep loading the rest.
                logger.error("Error loading %s: %s", file_path, e)

        logger.info("Loaded %d %s documents", len(documents), label)
        return documents

    @staticmethod
    def _json_to_text(raw):
        """
        Parse a JSON string and render it as text for indexing

        Args:
            raw: JSON source text

        Returns:
            Objects and arrays pretty-printed with a 2-space indent; any other
            top-level JSON value converted with ``str()``

        Raises:
            json.JSONDecodeError: If ``raw`` is not valid JSON
        """
        data = json.loads(raw)
        if isinstance(data, (dict, list)):
            # ensure_ascii=False keeps non-ASCII characters readable instead
            # of emitting \uXXXX escapes into the indexed text.
            return json.dumps(data, indent=2, ensure_ascii=False)
        return str(data)

    @staticmethod
    def load_markdown_documents(directory: str) -> List[Document]:
        """
        Load markdown documents from directory

        Searches ``directory`` recursively for ``*.md`` files. Files that
        cannot be read are logged and skipped.

        Args:
            directory: Path to markdown files

        Returns:
            List of Document objects (empty if the directory is missing)
        """
        return DocumentLoader._load_directory(
            directory, "**/*.md", "markdown", "markdown"
        )

    @staticmethod
    def load_text_documents(directory: str) -> List[Document]:
        """
        Load text documents from directory

        Searches ``directory`` recursively for ``*.txt`` files. Files that
        cannot be read are logged and skipped.

        Args:
            directory: Path to text files

        Returns:
            List of Document objects (empty if the directory is missing)
        """
        return DocumentLoader._load_directory(
            directory, "**/*.txt", "text", "text"
        )

    @staticmethod
    def load_json_documents(directory: str) -> List[Document]:
        """
        Load JSON documents (product data, etc)

        Searches ``directory`` recursively for ``*.json`` files. Each file
        becomes one Document whose text is the re-serialized JSON content.
        Files that cannot be read or parsed are logged and skipped.

        Args:
            directory: Path to JSON files

        Returns:
            List of Document objects (empty if the directory is missing)
        """
        return DocumentLoader._load_directory(
            directory,
            "**/*.json",
            "json",
            "JSON",
            to_text=DocumentLoader._json_to_text,
        )

    @staticmethod
    def load_documents_from_urls(urls: List[str]) -> List[Document]:
        """
        Load documents from URLs

        Uses ``SimpleWebPageReader`` from ``llama_index.readers.web``. If that
        package is not installed a warning is logged and an empty list is
        returned. A URL that fails to load is logged and skipped.

        Args:
            urls: List of URLs to load

        Returns:
            List of Document objects, each with ``metadata["source"]`` set to
            the URL it was loaded from
        """
        documents: List[Document] = []

        # Imported lazily so the rest of the loader works without the
        # optional web-reader package.
        try:
            from llama_index.readers.web import SimpleWebPageReader
        except ImportError:
            logger.warning("SimpleWebPageReader not available. Install llama-index-readers-web")
            return documents

        for url in urls:
            try:
                # NOTE(review): the reader is used with its default settings;
                # confirm whether raw HTML or extracted text is what should
                # be indexed.
                reader = SimpleWebPageReader()
                for doc in reader.load_data([url]):
                    doc.metadata["source"] = url
                    documents.append(doc)
                logger.debug("Loaded: %s", url)

            except Exception as e:
                # Deliberately broad: one unreachable URL must not stop the rest.
                logger.error("Error loading URL %s: %s", url, e)

        logger.info("Loaded %d documents from URLs", len(documents))
        return documents

    @staticmethod
    def create_product_documents(products: List[Dict[str, Any]]) -> List[Document]:
        """
        Create documents from product data

        The document text is built from the ``name``, ``description``,
        ``price``, ``category``, ``features`` and ``tags`` keys, one line per
        key that is present and not ``None``. A product with none of these
        produces no document. All other product keys are copied into the
        document metadata.

        Args:
            products: List of product dictionaries

        Returns:
            List of Document objects
        """
        # (product key, label in the rendered text), in output order.
        text_fields = (
            ('name', 'Product'),
            ('description', 'Description'),
            ('price', 'Price'),
            ('category', 'Category'),
            ('features', 'Features'),
            ('tags', 'Tags'),
        )
        # Only these two keys are flattened when they hold a sequence.
        sequence_keys = {'features', 'tags'}
        text_keys = {key for key, _ in text_fields}

        documents: List[Document] = []

        for product in products:
            # Format product info as readable text
            text_parts = []

            for key, label in text_fields:
                value = product.get(key)
                # A missing key and an explicit None both mean "no data";
                # rendering "Price: None" would only add noise to the index.
                if value is None:
                    continue

                if key in sequence_keys and isinstance(value, (list, tuple)):
                    # str() each item: str.join() raises TypeError on
                    # non-string items such as numeric tags.
                    value = ", ".join(str(item) for item in value)

                text_parts.append(f"{label}: {value}")

            if not text_parts:
                continue

            metadata = {
                "type": "product",
                "product_id": product.get('id', 'unknown'),
                "product_name": product.get('name', 'unknown'),
            }
            # Remaining product keys are merged last, so they take precedence.
            # NOTE(review): a product key named 'type', 'product_id' or
            # 'product_name' therefore replaces the value set above, and 'id'
            # is stored alongside 'product_id' -- confirm this is intended.
            metadata.update(
                {k: v for k, v in product.items() if k not in text_keys}
            )

            documents.append(
                Document(text="\n".join(text_parts), metadata=metadata)
            )

        logger.info("Created %d product documents", len(documents))
        return documents

    @staticmethod
    def load_all_documents(
        docs_dir: Optional[str] = None,
        products: Optional[List[Dict[str, Any]]] = None,
        urls: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Load documents from all sources

        Sources are processed in a fixed order: directory files (markdown,
        text, JSON), then products, then URLs. Any source left as ``None`` or
        empty is skipped.

        Args:
            docs_dir: Directory containing documentation
            products: List of products to index
            urls: List of URLs to load

        Returns:
            Combined list of Document objects
        """
        all_documents: List[Document] = []

        # Load directory documents
        if docs_dir:
            if os.path.isdir(docs_dir):
                all_documents.extend(DocumentLoader.load_markdown_documents(docs_dir))
                all_documents.extend(DocumentLoader.load_text_documents(docs_dir))
                all_documents.extend(DocumentLoader.load_json_documents(docs_dir))
            else:
                # A configured but missing directory is most likely a
                # misconfiguration; report it instead of skipping silently.
                logger.warning(
                    "Documentation directory not found, skipping: %s", docs_dir
                )

        # Load product documents
        if products:
            all_documents.extend(DocumentLoader.create_product_documents(products))

        # Load URL documents
        if urls:
            all_documents.extend(DocumentLoader.load_documents_from_urls(urls))

        logger.info("Loaded total %d documents", len(all_documents))
        return all_documents