Spaces:

WORKWITHSHAFISK
/

segmentopulse-backend

Paused

segmentopulse-backend / app /services /document.py

SHAFI

feat: Hardcode LlamaIndex value with custom implementation

9e7383e about 1 month ago

3.75 kB

	"""
	Custom Document Class - Replacing LlamaIndex Document

	This provides the same value as LlamaIndex's Document object:
	- Standardized data structure
	- Metadata management
	- Unique identification
	- Easy serialization

	No external dependencies required.
	"""

	import hashlib
	from typing import Dict, Optional
	from datetime import datetime


	class Document:
	    """
	    Custom Document class that standardizes data structure.

	    Replaces LlamaIndex Document with the same functionality:
	    - text: The main content
	    - metadata: URL, timestamp, category, source info
	    - doc_id: Unique identifier for deduplication
	    """

	    def __init__(
	        self,
	        text: str,
	        metadata: Optional[Dict] = None,
	        doc_id: Optional[str] = None
	    ):
	        """
	        Initialize a Document.

	        Args:
	            text: The document content
	            metadata: Dictionary of metadata (url, category, source, etc.);
	                a missing or empty value becomes a fresh empty dict
	            doc_id: Optional unique ID (auto-generated if not provided)
	        """
	        self.text = text
	        self.metadata = metadata or {}
	        # _generate_id reads self.text and self.metadata, so both must
	        # already be assigned at this point.
	        self.doc_id = doc_id or self._generate_id()

	    def _generate_id(self) -> str:
	        """
	        Generate a document ID from the URL, or from a content hash.

	        When metadata carries a non-empty 'url' (or, failing that, 'link'),
	        the ID is the MD5 hex digest of that URL. Otherwise the ID is
	        'doc_' followed by the MD5 hex digest of the first 500 characters
	        of the text.

	        Returns:
	            Identifier string
	        """
	        # The key may be present but empty or None: the RSS helper in this
	        # module always sets 'url' and 'link', defaulting to ''. Hashing an
	        # empty URL would give every link-less document the same ID, so the
	        # URL is only used when it actually has a value.
	        url = self.metadata.get('url') or self.metadata.get('link')
	        if url:
	            return hashlib.md5(str(url).encode()).hexdigest()

	        # Fall back to content hash (a missing text is treated as empty)
	        content = self.text or ''
	        content_hash = hashlib.md5(content[:500].encode()).hexdigest()
	        return f"doc_{content_hash}"

	    def to_dict(self) -> Dict:
	        """
	        Convert Document to dictionary for serialization.

	        Returns:
	            Dictionary with 'text', 'metadata' and 'doc_id' keys. The
	            metadata dict is returned by reference, not copied.
	        """
	        return {
	            'text': self.text,
	            'metadata': self.metadata,
	            'doc_id': self.doc_id
	        }

	    @classmethod
	    def from_dict(cls, data: Dict) -> 'Document':
	        """
	        Create Document from dictionary.

	        Args:
	            data: Dictionary with text, metadata, doc_id. Missing or None
	                'text' becomes '', missing or None 'metadata' becomes {},
	                and a missing 'doc_id' is regenerated.

	        Returns:
	            Document instance
	        """
	        return cls(
	            text=data.get('text') or '',
	            metadata=data.get('metadata') or {},
	            doc_id=data.get('doc_id')
	        )

	    def __repr__(self) -> str:
	        """String representation for debugging (text cut to 50 chars)."""
	        if len(self.text) > 50:
	            preview = self.text[:50] + "..."
	        else:
	            preview = self.text
	        return f"Document(id={self.doc_id}, text='{preview}')"

	    def __len__(self) -> int:
	        """Return text length."""
	        return len(self.text)


	def create_document_from_rss_entry(
	    entry: Dict,
	    category: str,
	    source_feed: str
	) -> Document:
	    """
	    Helper function to create Document from RSS feed entry.

	    Args:
	        entry: Dictionary from feedparser entry
	        category: News category
	        source_feed: RSS feed URL

	    Returns:
	        Document instance whose text is the entry's 'summary' (or, when
	        that is empty, its 'description') and whose metadata holds title,
	        url, link, published, source, category, source_feed and author.
	    """
	    # Extract text content; a missing or None field ends up as ''
	    text = entry.get('summary') or entry.get('description') or ''

	    # Title and link may be present but None; normalise before slicing
	    title = entry.get('title') or ''
	    link = entry.get('link') or ''

	    # Only compute the current time when the entry has no publish date
	    published = entry.get('published')
	    if published is None:
	        published = datetime.now().isoformat()

	    # 'source' may be absent, None, or not a mapping at all
	    source_info = entry.get('source') or {}
	    if hasattr(source_info, 'get'):
	        source_title = source_info.get('title', 'Unknown')
	    else:
	        source_title = 'Unknown'

	    # Build metadata ('url' and 'link' deliberately carry the same value)
	    metadata = {
	        'title': title[:200],
	        'url': link,
	        'link': link,
	        'published': published,
	        'source': source_title,
	        'category': category,
	        'source_feed': source_feed,
	        'author': entry.get('author', ''),
	    }

	    # Create document
	    return Document(text=text, metadata=metadata)