Spaces:
Sleeping
Sleeping
| """ | |
| Document Loading and Preparation for Knowledge Base | |
| Handles: | |
| - Loading documents from various sources | |
| - Parsing and chunking | |
| - Metadata extraction | |
| """ | |
| import os | |
| from typing import List, Dict, Any, Optional | |
| from pathlib import Path | |
| import json | |
| import logging | |
| from llama_index.core.schema import Document | |
| logger = logging.getLogger(__name__) | |
class DocumentLoader:
    """Load and prepare documents for indexing.

    All loaders are stateless, so every method is exposed as a
    ``@staticmethod``: callable both as ``DocumentLoader.load_x(...)``
    (as ``load_all_documents`` does) and on an instance.
    """

    # Extensions this loader recognizes.
    # NOTE(review): '.pdf' is listed but no PDF loader is implemented in
    # this class — confirm whether PDF support lives elsewhere or is TODO.
    SUPPORTED_FORMATS = {'.md', '.txt', '.json', '.pdf'}

    @staticmethod
    def _load_directory(
        directory: str,
        pattern: str,
        doc_type: str,
        label: str,
        to_text,
    ) -> List[Document]:
        """Shared implementation behind the directory-based loaders.

        Args:
            directory: Root directory to search (recursive glob).
            pattern: Glob pattern relative to ``directory`` (e.g. "**/*.md").
            doc_type: Value stored under the ``type`` metadata key.
            label: Human-readable name used in the summary log line.
            to_text: Callable mapping a ``Path`` to the document text.
                Any exception it raises is logged and the file is skipped.

        Returns:
            List of Document objects; empty if the directory is missing.
        """
        documents: List[Document] = []
        path = Path(directory)
        if not path.exists():
            logger.error(f"Directory not found: {directory}")
            return documents
        for file_path in path.glob(pattern):
            try:
                documents.append(Document(
                    text=to_text(file_path),
                    metadata={
                        "source": str(file_path),
                        "type": doc_type,
                        "filename": file_path.name,
                    },
                ))
                logger.debug(f"Loaded: {file_path.name}")
            except Exception as e:
                # Best-effort: one unreadable file must not abort the load.
                logger.error(f"Error loading {file_path}: {e}")
        logger.info(f"Loaded {len(documents)} {label} documents")
        return documents

    @staticmethod
    def load_markdown_documents(directory: str) -> List[Document]:
        """
        Load markdown documents from directory

        Args:
            directory: Path to markdown files

        Returns:
            List of Document objects
        """
        return DocumentLoader._load_directory(
            directory, "**/*.md", "markdown", "markdown",
            lambda p: p.read_text(encoding="utf-8"),
        )

    @staticmethod
    def load_text_documents(directory: str) -> List[Document]:
        """
        Load text documents from directory

        Args:
            directory: Path to text files

        Returns:
            List of Document objects
        """
        return DocumentLoader._load_directory(
            directory, "**/*.txt", "text", "text",
            lambda p: p.read_text(encoding="utf-8"),
        )

    @staticmethod
    def _json_to_text(file_path: Path) -> str:
        """Parse a JSON file and render it as readable, indented text."""
        data = json.loads(file_path.read_text(encoding="utf-8"))
        # dict and list both pretty-print the same way; anything else
        # (bare string/number) is stringified directly.
        if isinstance(data, (dict, list)):
            return json.dumps(data, indent=2)
        return str(data)

    @staticmethod
    def load_json_documents(directory: str) -> List[Document]:
        """
        Load JSON documents (product data, etc)

        Args:
            directory: Path to JSON files

        Returns:
            List of Document objects
        """
        return DocumentLoader._load_directory(
            directory, "**/*.json", "json", "JSON",
            DocumentLoader._json_to_text,
        )

    @staticmethod
    def load_documents_from_urls(urls: List[str]) -> List[Document]:
        """
        Load documents from URLs

        Args:
            urls: List of URLs to load

        Returns:
            List of Document objects (empty if the web reader is missing)
        """
        documents: List[Document] = []
        try:
            from llama_index.readers.web import SimpleWebPageReader
        except ImportError:
            logger.warning("SimpleWebPageReader not available. Install llama-index-readers-web")
            return documents
        # One reader instance serves all URLs (hoisted out of the loop).
        reader = SimpleWebPageReader()
        for url in urls:
            try:
                for doc in reader.load_data([url]):
                    # Record the originating URL so retrieval can cite it.
                    doc.metadata["source"] = url
                    documents.append(doc)
                logger.debug(f"Loaded: {url}")
            except Exception as e:
                # Best-effort: one failing URL must not abort the batch.
                logger.error(f"Error loading URL {url}: {e}")
        logger.info(f"Loaded {len(documents)} documents from URLs")
        return documents

    @staticmethod
    def create_product_documents(products: List[Dict[str, Any]]) -> List[Document]:
        """
        Create documents from product data

        Args:
            products: List of product dictionaries

        Returns:
            List of Document objects
        """
        # Fields rendered one-per-line; label differs from the key for 'name'.
        scalar_fields = [
            ("name", "Product"),
            ("description", "Description"),
            ("price", "Price"),
            ("category", "Category"),
        ]
        # Fields that may be a list (joined with ", ") or a scalar.
        list_fields = [("features", "Features"), ("tags", "Tags")]
        formatted = {key for key, _ in scalar_fields} | {key for key, _ in list_fields}

        documents: List[Document] = []
        for product in products:
            text_parts = [
                f"{label}: {product[key]}"
                for key, label in scalar_fields if key in product
            ]
            for key, label in list_fields:
                if key in product:
                    value = product[key]
                    if isinstance(value, list):
                        # str() each item so non-string entries can't crash join.
                        text_parts.append(f"{label}: " + ", ".join(str(v) for v in value))
                    else:
                        text_parts.append(f"{label}: {value}")
            if not text_parts:
                # Nothing human-readable to index for this product.
                continue
            documents.append(Document(
                text="\n".join(text_parts),
                metadata={
                    "type": "product",
                    "product_id": product.get('id', 'unknown'),
                    "product_name": product.get('name', 'unknown'),
                    # Carry through any extra fields not already formatted
                    # into the text body (note: 'id' is intentionally kept).
                    **{k: v for k, v in product.items() if k not in formatted},
                },
            ))
        logger.info(f"Created {len(documents)} product documents")
        return documents
| def load_all_documents( | |
| docs_dir: Optional[str] = None, | |
| products: Optional[List[Dict[str, Any]]] = None, | |
| urls: Optional[List[str]] = None, | |
| ) -> List[Document]: | |
| """ | |
| Load documents from all sources | |
| Args: | |
| docs_dir: Directory containing documentation | |
| products: List of products to index | |
| urls: List of URLs to load | |
| Returns: | |
| Combined list of Document objects | |
| """ | |
| all_documents = [] | |
| # Load directory documents | |
| if docs_dir and os.path.exists(docs_dir): | |
| all_documents.extend(DocumentLoader.load_markdown_documents(docs_dir)) | |
| all_documents.extend(DocumentLoader.load_text_documents(docs_dir)) | |
| all_documents.extend(DocumentLoader.load_json_documents(docs_dir)) | |
| # Load product documents | |
| if products: | |
| all_documents.extend(DocumentLoader.create_product_documents(products)) | |
| # Load URL documents | |
| if urls: | |
| all_documents.extend(DocumentLoader.load_documents_from_urls(urls)) | |
| logger.info(f"Loaded total {len(all_documents)} documents") | |
| return all_documents | |