# ecomcp/src/core/document_loader.py
# Author: vinhnx90
# feat: Implement LlamaIndex integration with new core modules for knowledge
# base, document loading, vector search, and comprehensive documentation and tests.
# Commit: 108d8af
"""
Document Loading and Preparation for Knowledge Base
Handles:
- Loading documents from various sources
- Parsing and chunking
- Metadata extraction
"""
import json
import logging
import os
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional

from llama_index.core.schema import Document
logger = logging.getLogger(__name__)
class DocumentLoader:
    """Load and prepare documents for knowledge-base indexing.

    Every loader returns llama_index ``Document`` objects carrying a
    consistent metadata schema (``source``, ``type``, ``filename`` for
    file-based loads).  Per-file and per-URL failures are logged and
    skipped so one bad input never aborts a bulk load; a missing
    directory yields an empty list rather than an exception.
    """

    # NOTE(review): '.pdf' is advertised here but no PDF loader is
    # implemented below — confirm whether PDF support is planned or
    # drop the entry.  Kept as-is for backward compatibility.
    SUPPORTED_FORMATS = {'.md', '.txt', '.json', '.pdf'}

    # Product fields rendered into the searchable body text by
    # create_product_documents, with their display labels, in output order.
    _PRODUCT_TEXT_FIELDS = (
        ('name', 'Product'),
        ('description', 'Description'),
        ('price', 'Price'),
        ('category', 'Category'),
        ('features', 'Features'),
        ('tags', 'Tags'),
    )

    @staticmethod
    def _load_directory(
        directory: str,
        pattern: str,
        doc_type: str,
        read_file: Callable[[Path], str],
        log_label: Optional[str] = None,
    ) -> List[Document]:
        """Recursively load files matching *pattern* under *directory*.

        Shared backbone for the markdown/text/JSON loaders, whose bodies
        were previously near-identical copies of each other.

        Args:
            directory: Root directory to scan.
            pattern: ``Path.glob`` pattern, e.g. ``"**/*.md"``.
            doc_type: Value stored in each document's ``type`` metadata field.
            read_file: Callable converting a ``Path`` into document text.
            log_label: Human-readable format name for the summary log line
                (defaults to *doc_type*).

        Returns:
            List of Document objects; empty if *directory* does not exist.
        """
        documents: List[Document] = []
        root = Path(directory)
        if not root.exists():
            logger.error("Directory not found: %s", directory)
            return documents
        for file_path in root.glob(pattern):
            try:
                content = read_file(file_path)
            except Exception as e:
                # Skip unreadable or malformed files; keep loading the rest.
                logger.error("Error loading %s: %s", file_path, e)
                continue
            documents.append(
                Document(
                    text=content,
                    metadata={
                        "source": str(file_path),
                        "type": doc_type,
                        "filename": file_path.name,
                    },
                )
            )
            logger.debug("Loaded: %s", file_path.name)
        logger.info("Loaded %d %s documents", len(documents), log_label or doc_type)
        return documents

    @staticmethod
    def _read_text(file_path: Path) -> str:
        """Read a UTF-8 text file in full."""
        return file_path.read_text(encoding='utf-8')

    @staticmethod
    def _read_json_as_text(file_path: Path) -> str:
        """Read a JSON file and render it as indented text for indexing.

        dicts and lists are pretty-printed (the original had two identical
        branches for them, collapsed here); any other top-level value is
        stringified.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, (dict, list)):
            return json.dumps(data, indent=2)
        return str(data)

    @staticmethod
    def load_markdown_documents(directory: str) -> List[Document]:
        """Load markdown (``**/*.md``) documents recursively from *directory*.

        Args:
            directory: Path to markdown files.

        Returns:
            List of Document objects (empty if the directory is missing).
        """
        return DocumentLoader._load_directory(
            directory, "**/*.md", "markdown", DocumentLoader._read_text
        )

    @staticmethod
    def load_text_documents(directory: str) -> List[Document]:
        """Load plain-text (``**/*.txt``) documents recursively from *directory*.

        Args:
            directory: Path to text files.

        Returns:
            List of Document objects (empty if the directory is missing).
        """
        return DocumentLoader._load_directory(
            directory, "**/*.txt", "text", DocumentLoader._read_text
        )

    @staticmethod
    def load_json_documents(directory: str) -> List[Document]:
        """Load JSON documents (product data, etc.) recursively from *directory*.

        Each JSON file is pretty-printed into readable text for indexing.

        Args:
            directory: Path to JSON files.

        Returns:
            List of Document objects (empty if the directory is missing).
        """
        return DocumentLoader._load_directory(
            directory,
            "**/*.json",
            "json",
            DocumentLoader._read_json_as_text,
            log_label="JSON",
        )

    @staticmethod
    def load_documents_from_urls(urls: List[str]) -> List[Document]:
        """Load documents from web URLs via ``SimpleWebPageReader``.

        Args:
            urls: List of URLs to load.

        Returns:
            List of Document objects; empty (with a warning) when the
            optional ``llama-index-readers-web`` package is not installed.
        """
        documents: List[Document] = []
        try:
            # Optional dependency — imported lazily so the rest of the
            # loader works without it.
            from llama_index.readers.web import SimpleWebPageReader
        except ImportError:
            logger.warning("SimpleWebPageReader not available. Install llama-index-readers-web")
            return documents
        # One reader instance serves every URL (previously re-created
        # inside the loop on each iteration).
        reader = SimpleWebPageReader()
        for url in urls:
            try:
                docs = reader.load_data([url])
            except Exception as e:
                logger.error("Error loading URL %s: %s", url, e)
                continue
            for doc in docs:
                # Tag each page with the URL it came from.
                doc.metadata["source"] = url
                documents.append(doc)
            logger.debug("Loaded: %s", url)
        logger.info("Loaded %d documents from URLs", len(documents))
        return documents

    @staticmethod
    def create_product_documents(products: List[Dict[str, Any]]) -> List[Document]:
        """Create documents from product data.

        Args:
            products: List of product dictionaries.  Recognized keys
                (name/description/price/category/features/tags) become the
                document body text; all other keys pass through into metadata.

        Returns:
            List of Document objects.  A product with none of the recognized
            keys produces no document.
        """
        documents: List[Document] = []
        body_keys = {key for key, _ in DocumentLoader._PRODUCT_TEXT_FIELDS}
        for product in products:
            text_parts = []
            for key, label in DocumentLoader._PRODUCT_TEXT_FIELDS:
                if key not in product:
                    continue
                value = product[key]
                # List-valued fields (e.g. features, tags) render
                # comma-separated; scalars render as-is.
                if isinstance(value, list):
                    value = ", ".join(value)
                text_parts.append(f"{label}: {value}")
            if not text_parts:
                # Nothing recognizable to index for this product.
                continue
            documents.append(
                Document(
                    text="\n".join(text_parts),
                    metadata={
                        "type": "product",
                        "product_id": product.get('id', 'unknown'),
                        "product_name": product.get('name', 'unknown'),
                        # Pass any extra product attributes through as metadata.
                        **{k: v for k, v in product.items() if k not in body_keys},
                    },
                )
            )
        logger.info("Created %d product documents", len(documents))
        return documents

    @staticmethod
    def load_all_documents(
        docs_dir: Optional[str] = None,
        products: Optional[List[Dict[str, Any]]] = None,
        urls: Optional[List[str]] = None,
    ) -> List[Document]:
        """Load documents from all configured sources.

        Args:
            docs_dir: Directory containing documentation (markdown, text, JSON).
            products: List of products to index.
            urls: List of URLs to load.

        Returns:
            Combined list of Document objects from every source provided.
        """
        all_documents: List[Document] = []
        # Directory documents — skipped silently when unset or absent.
        if docs_dir and os.path.exists(docs_dir):
            all_documents.extend(DocumentLoader.load_markdown_documents(docs_dir))
            all_documents.extend(DocumentLoader.load_text_documents(docs_dir))
            all_documents.extend(DocumentLoader.load_json_documents(docs_dir))
        # Product documents.
        if products:
            all_documents.extend(DocumentLoader.create_product_documents(products))
        # URL documents.
        if urls:
            all_documents.extend(DocumentLoader.load_documents_from_urls(urls))
        logger.info("Loaded total %d documents", len(all_documents))
        return all_documents