File size: 3,750 Bytes
"""
Custom Document class - a replacement for the LlamaIndex Document.

It provides the same value as LlamaIndex's Document object:
- Standardized data structure
- Metadata management
- Unique identification
- Easy serialization

No external dependencies are required.
"""
import hashlib
from typing import Dict, Optional
from datetime import datetime
class Document:
    """
    Plain container pairing a piece of text with metadata and an ID.

    Attributes:
        text: The main content.
        metadata: Free-form dictionary (url, link, category, source, ...).
        doc_id: Identifier used for deduplication; either supplied by the
            caller or derived by ``_generate_id``.
    """

    def __init__(
        self,
        text: str,
        metadata: Optional[Dict] = None,
        doc_id: Optional[str] = None
    ):
        """
        Initialize a Document.

        Args:
            text: The document content.
            metadata: Dictionary of metadata (url, category, source, etc.).
                A missing or empty value is replaced by a new empty dict.
            doc_id: Optional unique ID. When missing or empty, an ID is
                generated from the metadata URL or the text.
        """
        self.text = text
        self.metadata = metadata or {}
        # Must run after text/metadata are set: _generate_id reads both.
        self.doc_id = doc_id or self._generate_id()

    def _generate_id(self) -> str:
        """
        Derive a document ID from the URL, or from the text if there is none.

        Returns:
            The MD5 hex digest of the ``url`` (or, failing that, ``link``)
            metadata value when one is non-empty; otherwise ``"doc_"``
            followed by the MD5 hex digest of the first 500 characters of
            the text.
        """
        # Test the VALUE, not just key presence: an empty or None URL must
        # not be hashed, otherwise every link-less document would share one
        # ID (or crash on None.encode()).
        url = self.metadata.get('url') or self.metadata.get('link')
        if url:
            return hashlib.md5(str(url).encode()).hexdigest()

        # Fall back to a hash of the leading content.
        content_hash = hashlib.md5(self.text[:500].encode()).hexdigest()
        return f"doc_{content_hash}"

    def to_dict(self) -> Dict:
        """
        Convert the Document to a dictionary for serialization.

        Returns:
            Dictionary with the keys ``text``, ``metadata`` and ``doc_id``.
            The metadata dict is returned as-is, not copied.
        """
        return {
            'text': self.text,
            'metadata': self.metadata,
            'doc_id': self.doc_id
        }

    @classmethod
    def from_dict(cls, data: Dict) -> 'Document':
        """
        Create a Document from a dictionary.

        Args:
            data: Dictionary with ``text``, ``metadata`` and ``doc_id`` keys;
                missing keys default to ``''``, ``{}`` and ``None``.

        Returns:
            Document instance.
        """
        return cls(
            text=data.get('text', ''),
            metadata=data.get('metadata', {}),
            doc_id=data.get('doc_id')
        )

    def __repr__(self) -> str:
        """Return a debugging string with the ID and a 50-character preview."""
        if len(self.text) > 50:
            preview = self.text[:50] + "..."
        else:
            preview = self.text
        return f"Document(id={self.doc_id}, text='{preview}')"

    def __len__(self) -> int:
        """Return the length of the text."""
        return len(self.text)
def create_document_from_rss_entry(
    entry: Dict,
    category: str,
    source_feed: str
) -> Document:
    """
    Build a Document from a single RSS feed entry.

    Args:
        entry: Mapping for one feed entry (e.g. a feedparser entry).
        category: News category stored in the metadata.
        source_feed: RSS feed URL stored in the metadata.

    Returns:
        Document whose text is the entry's summary (or its description when
        the summary is missing or empty) and whose metadata carries title,
        url/link, published, source, category, source_feed and author.
    """
    # Prefer the summary; use the description only when the summary is empty.
    body = entry.get('summary', '')
    if not body:
        body = entry.get('description', '')

    # The entry link is stored under both 'url' and 'link'.
    entry_link = entry.get('link', '')
    headline = entry.get('title', '')[:200]
    published_at = entry.get('published', datetime.now().isoformat())
    source_info = entry.get('source', {})

    meta = {
        'title': headline,
        'url': entry_link,
        'link': entry_link,
        'published': published_at,
        'source': source_info.get('title', 'Unknown'),
        'category': category,
        'source_feed': source_feed,
        'author': entry.get('author', ''),
    }

    return Document(text=body, metadata=meta)
|