Spaces:
Sleeping
Sleeping
| """ | |
| Document Loading and Preparation for Knowledge Base | |
| Handles: | |
| - Loading documents from various sources | |
| - Parsing and chunking | |
| - Metadata extraction | |
| """ | |
| import os | |
| from typing import List, Dict, Any, Optional | |
| from pathlib import Path | |
| import json | |
| import logging | |
| from llama_index.core.schema import Document | |
| logger = logging.getLogger(__name__) | |
class DocumentLoader:
    """Load and prepare documents for indexing.

    All loaders are stateless, so every method is exposed as a
    ``@staticmethod``: callable both as ``DocumentLoader.load_x(...)``
    (as ``load_all_documents`` does) and on an instance.
    """

    # Extensions this loader recognizes.
    # NOTE(review): '.pdf' is listed but no PDF loader is implemented in
    # this class — confirm whether PDF support lives elsewhere or is TODO.
    SUPPORTED_FORMATS = {'.md', '.txt', '.json', '.pdf'}

    @staticmethod
    def _load_directory(
        directory: str,
        pattern: str,
        doc_type: str,
        label: str,
        to_text,
    ) -> List[Document]:
        """Shared implementation behind the directory-based loaders.

        Args:
            directory: Root directory to search (recursive glob).
            pattern: Glob pattern relative to ``directory`` (e.g. "**/*.md").
            doc_type: Value stored under the ``type`` metadata key.
            label: Human-readable name used in the summary log line.
            to_text: Callable mapping a ``Path`` to the document text.
                Any exception it raises is logged and the file is skipped.

        Returns:
            List of Document objects; empty if the directory is missing.
        """
        documents: List[Document] = []
        path = Path(directory)
        if not path.exists():
            logger.error(f"Directory not found: {directory}")
            return documents
        for file_path in path.glob(pattern):
            try:
                documents.append(Document(
                    text=to_text(file_path),
                    metadata={
                        "source": str(file_path),
                        "type": doc_type,
                        "filename": file_path.name,
                    },
                ))
                logger.debug(f"Loaded: {file_path.name}")
            except Exception as e:
                # Best-effort: one unreadable file must not abort the load.
                logger.error(f"Error loading {file_path}: {e}")
        logger.info(f"Loaded {len(documents)} {label} documents")
        return documents

    @staticmethod
    def load_markdown_documents(directory: str) -> List[Document]:
        """
        Load markdown documents from directory

        Args:
            directory: Path to markdown files

        Returns:
            List of Document objects
        """
        return DocumentLoader._load_directory(
            directory, "**/*.md", "markdown", "markdown",
            lambda p: p.read_text(encoding="utf-8"),
        )

    @staticmethod
    def load_text_documents(directory: str) -> List[Document]:
        """
        Load text documents from directory

        Args:
            directory: Path to text files

        Returns:
            List of Document objects
        """
        return DocumentLoader._load_directory(
            directory, "**/*.txt", "text", "text",
            lambda p: p.read_text(encoding="utf-8"),
        )

    @staticmethod
    def _json_to_text(file_path: Path) -> str:
        """Parse a JSON file and render it as readable, indented text."""
        data = json.loads(file_path.read_text(encoding="utf-8"))
        # dict and list both pretty-print the same way; anything else
        # (bare string/number) is stringified directly.
        if isinstance(data, (dict, list)):
            return json.dumps(data, indent=2)
        return str(data)

    @staticmethod
    def load_json_documents(directory: str) -> List[Document]:
        """
        Load JSON documents (product data, etc)

        Args:
            directory: Path to JSON files

        Returns:
            List of Document objects
        """
        return DocumentLoader._load_directory(
            directory, "**/*.json", "json", "JSON",
            DocumentLoader._json_to_text,
        )

    @staticmethod
    def load_documents_from_urls(urls: List[str]) -> List[Document]:
        """
        Load documents from URLs

        Args:
            urls: List of URLs to load

        Returns:
            List of Document objects (empty if the web reader is missing)
        """
        documents: List[Document] = []
        try:
            from llama_index.readers.web import SimpleWebPageReader
        except ImportError:
            logger.warning("SimpleWebPageReader not available. Install llama-index-readers-web")
            return documents
        # One reader instance serves all URLs (hoisted out of the loop).
        reader = SimpleWebPageReader()
        for url in urls:
            try:
                for doc in reader.load_data([url]):
                    # Record the originating URL so retrieval can cite it.
                    doc.metadata["source"] = url
                    documents.append(doc)
                logger.debug(f"Loaded: {url}")
            except Exception as e:
                # Best-effort: one failing URL must not abort the batch.
                logger.error(f"Error loading URL {url}: {e}")
        logger.info(f"Loaded {len(documents)} documents from URLs")
        return documents

    @staticmethod
    def create_product_documents(products: List[Dict[str, Any]]) -> List[Document]:
        """
        Create documents from product data

        Args:
            products: List of product dictionaries

        Returns:
            List of Document objects
        """
        # Fields rendered one-per-line; label differs from the key for 'name'.
        scalar_fields = [
            ("name", "Product"),
            ("description", "Description"),
            ("price", "Price"),
            ("category", "Category"),
        ]
        # Fields that may be a list (joined with ", ") or a scalar.
        list_fields = [("features", "Features"), ("tags", "Tags")]
        formatted = {key for key, _ in scalar_fields} | {key for key, _ in list_fields}

        documents: List[Document] = []
        for product in products:
            text_parts = [
                f"{label}: {product[key]}"
                for key, label in scalar_fields if key in product
            ]
            for key, label in list_fields:
                if key in product:
                    value = product[key]
                    if isinstance(value, list):
                        # str() each item so non-string entries can't crash join.
                        text_parts.append(f"{label}: " + ", ".join(str(v) for v in value))
                    else:
                        text_parts.append(f"{label}: {value}")
            if not text_parts:
                # Nothing human-readable to index for this product.
                continue
            documents.append(Document(
                text="\n".join(text_parts),
                metadata={
                    "type": "product",
                    "product_id": product.get('id', 'unknown'),
                    "product_name": product.get('name', 'unknown'),
                    # Carry through any extra fields not already formatted
                    # into the text body (note: 'id' is intentionally kept).
                    **{k: v for k, v in product.items() if k not in formatted},
                },
            ))
        logger.info(f"Created {len(documents)} product documents")
        return documents
| def load_all_documents( | |
| docs_dir: Optional[str] = None, | |
| products: Optional[List[Dict[str, Any]]] = None, | |
| urls: Optional[List[str]] = None, | |
| ) -> List[Document]: | |
| """ | |
| Load documents from all sources | |
| Args: | |
| docs_dir: Directory containing documentation | |
| products: List of products to index | |
| urls: List of URLs to load | |
| Returns: | |
| Combined list of Document objects | |
| """ | |
| all_documents = [] | |
| # Load directory documents | |
| if docs_dir and os.path.exists(docs_dir): | |
| all_documents.extend(DocumentLoader.load_markdown_documents(docs_dir)) | |
| all_documents.extend(DocumentLoader.load_text_documents(docs_dir)) | |
| all_documents.extend(DocumentLoader.load_json_documents(docs_dir)) | |
| # Load product documents | |
| if products: | |
| all_documents.extend(DocumentLoader.create_product_documents(products)) | |
| # Load URL documents | |
| if urls: | |
| all_documents.extend(DocumentLoader.load_documents_from_urls(urls)) | |
| logger.info(f"Loaded total {len(all_documents)} documents") | |
| return all_documents | |