File size: 3,750 Bytes
9e7383e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
Custom Document Class - Replacing LlamaIndex Document

This provides the same value as LlamaIndex's Document object:
- Standardized data structure
- Metadata management
- Unique identification
- Easy serialization

No external dependencies required.
"""

import hashlib
from typing import Dict, Optional
from datetime import datetime


class Document:
    """
    Custom Document class that standardizes data structure

    Replaces LlamaIndex Document with same functionality:
    - text: The main content
    - metadata: URL, timestamp, category, source info
    - doc_id: Unique identifier for deduplication

    Note: the class defines ``__len__`` but not ``__bool__``, so a Document
    whose text is empty evaluates as falsy. Use ``doc is not None`` rather
    than ``if doc:`` when checking for presence.
    """

    def __init__(
        self,
        text: str,
        metadata: Optional[Dict] = None,
        doc_id: Optional[str] = None
    ):
        """
        Initialize a Document

        Args:
            text: The document content
            metadata: Dictionary of metadata (url, category, source, etc.).
                ``None`` or an empty dict results in a fresh empty dict.
            doc_id: Optional unique ID (auto-generated if not provided
                or if an empty value is passed)
        """
        self.text = text
        self.metadata = metadata or {}
        # Must run after text/metadata are set: _generate_id reads both.
        self.doc_id = doc_id or self._generate_id()

    def _generate_id(self) -> str:
        """
        Generate unique document ID from URL or content hash

        The first non-blank string found under the ``url`` then ``link``
        metadata keys is hashed. Missing, ``None``, empty, or non-string
        values are skipped so that documents without a usable URL do not
        all collapse onto the hash of the empty string.

        Returns:
            32-char MD5 hex digest of the URL, or ``doc_`` followed by the
            MD5 hex digest of the first 500 characters of the text.
        """
        # Use URL if available for stable ID
        for key in ('url', 'link'):
            candidate = self.metadata.get(key)
            if isinstance(candidate, str) and candidate.strip():
                # MD5 is used only as a fingerprint here, not for security.
                return hashlib.md5(candidate.encode()).hexdigest()

        # Fall back to content hash (only the first 500 characters are
        # hashed, so texts sharing that prefix share an ID)
        content_hash = hashlib.md5(self.text[:500].encode()).hexdigest()
        return f"doc_{content_hash}"

    def to_dict(self) -> Dict:
        """
        Convert Document to dictionary for serialization

        Returns:
            Dictionary with ``text``, ``metadata`` and ``doc_id`` keys. The
            metadata dict is returned by reference, not copied.
        """
        return {
            'text': self.text,
            'metadata': self.metadata,
            'doc_id': self.doc_id
        }

    @classmethod
    def from_dict(cls, data: Dict) -> 'Document':
        """
        Create Document from dictionary

        Args:
            data: Dictionary with text, metadata, doc_id. Missing ``text``
                defaults to an empty string, missing ``metadata`` to an
                empty dict, and a missing ``doc_id`` is regenerated.

        Returns:
            Document instance
        """
        return cls(
            text=data.get('text', ''),
            metadata=data.get('metadata', {}),
            doc_id=data.get('doc_id')
        )

    def __repr__(self) -> str:
        """String representation for debugging (text truncated to 50 chars)"""
        preview = self.text[:50] + "..." if len(self.text) > 50 else self.text
        return f"Document(id={self.doc_id}, text='{preview}')"

    def __len__(self) -> int:
        """Return text length"""
        return len(self.text)


def create_document_from_rss_entry(
    entry: Dict,
    category: str,
    source_feed: str
) -> Document:
    """
    Helper function to create Document from RSS feed entry

    Fields that are missing from ``entry`` or present with a ``None`` /
    empty value are replaced by safe defaults instead of raising, so a
    partially populated entry still yields a Document.

    Args:
        entry: Dictionary from feedparser entry. Keys read: ``summary``,
            ``description``, ``title``, ``link``, ``published``,
            ``source`` and ``author``.
        category: News category
        source_feed: RSS feed URL

    Returns:
        Document instance whose text is the entry summary (or description)
        and whose metadata holds title, url, link, published, source,
        category, source_feed and author. No doc_id is passed, so the
        Document generates its own.
    """
    # Extract text content; prefer summary, then description
    text = entry.get('summary') or entry.get('description') or ''

    # `or ''` also covers keys that exist but hold None, which a plain
    # .get(key, '') default would let through.
    title = entry.get('title') or ''
    link = entry.get('link') or ''

    # Only call .get on the source when it actually offers one; a None or
    # plain-string source falls back to 'Unknown'.
    source = entry.get('source')
    if hasattr(source, 'get'):
        source_title = source.get('title', 'Unknown')
    else:
        source_title = 'Unknown'

    # Take the current (naive, local) timestamp only when the entry has no
    # usable published value.
    published = entry.get('published') or datetime.now().isoformat()

    # Build metadata
    metadata = {
        'title': title[:200],  # cap stored title length at 200 characters
        'url': link,
        'link': link,
        'published': published,
        'source': source_title,
        'category': category,
        'source_feed': source_feed,
        'author': entry.get('author') or '',
    }

    # Create document
    return Document(text=text, metadata=metadata)