"""
Auto-Sync Module for RAG Personal Diary Chatbot
Handles automatic synchronization between database and vector store
"""
import os
import sys
import sqlite3
import logging
from datetime import datetime, timedelta
from typing import Optional, Dict, Any, List
import streamlit as st
# Add paths for imports
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'Indexingstep'))
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
try:
from pipeline import DiaryIndexingPipeline
from embedding_and_storing import DiaryEmbeddingAndStorage
from run_user_indexing import UserIsolatedIndexingPipeline
except ImportError as e:
logging.error(f"Could not import indexing modules: {e}")
DiaryIndexingPipeline = None
DiaryEmbeddingAndStorage = None
UserIsolatedIndexingPipeline = None
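# When the Indexingstep package cannot be imported, the names above stay None;
# the sync methods below check these before use so the app can degrade gracefully.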
class AutoSyncManager:
"""Manages automatic synchronization between SQL database and vector database"""
def __init__(self, user_id: int = 1):
self.user_id = user_id
# Use user-specific database path
self.db_path = os.path.join(os.path.dirname(__file__), "backend", f"user_{user_id}_diary.db")
self.vector_db_path = os.path.join(os.path.dirname(__file__), "..", "Indexingstep", f"user_{user_id}_vector_db")
self.collection_name = f"user_{user_id}_diary_entries"
# Load API key
from dotenv import load_dotenv
load_dotenv(os.path.join(os.path.dirname(__file__), '..', 'Indexingstep', '.env'))
self.api_key = os.getenv("GOOGLE_API_KEY")
# Setup logging
logging.basicConfig(level=logging.INFO)
self.logger = logging.getLogger(__name__)
def get_last_sync_time(self) -> Optional[datetime]:
"""Get the last sync timestamp from a tracking file"""
sync_file = os.path.join(os.path.dirname(__file__), f"last_sync_user_{self.user_id}.txt")
try:
if os.path.exists(sync_file):
with open(sync_file, 'r') as f:
timestamp_str = f.read().strip()
return datetime.fromisoformat(timestamp_str)
except Exception as e:
self.logger.warning(f"Could not read last sync time: {e}")
return None
    def update_last_sync_time(self, timestamp: Optional[datetime] = None):
"""Update the last sync timestamp"""
if timestamp is None:
timestamp = datetime.now()
sync_file = os.path.join(os.path.dirname(__file__), f"last_sync_user_{self.user_id}.txt")
try:
with open(sync_file, 'w') as f:
f.write(timestamp.isoformat())
except Exception as e:
self.logger.warning(f"Could not update last sync time: {e}")
def get_changed_entries(self, since: Optional[datetime] = None) -> Dict[str, List]:
"""Get entries that changed since the last sync"""
if since is None:
since = self.get_last_sync_time()
if since is None:
since = datetime.now() - timedelta(days=7) # Default to last week
try:
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
# Get new/updated entries
since_str = since.strftime('%Y-%m-%d %H:%M:%S')
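            # created_at is compared as text here, so this query assumes timestamps
            # are stored in the same '%Y-%m-%d %H:%M:%S' format used for since_str.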
cursor.execute("""
SELECT id, date, content, created_at, tags
FROM diary_entries
WHERE user_id = ? AND created_at > ?
ORDER BY created_at DESC
""", (self.user_id, since_str))
new_entries = []
for row in cursor.fetchall():
new_entries.append({
'id': row[0],
'date': row[1],
'content': row[2],
'created_at': row[3],
'tags': row[4] or ''
})
conn.close()
return {
'new_entries': new_entries,
'deleted_entries': [] # TODO: Implement deletion tracking
}
except Exception as e:
self.logger.error(f"Error getting changed entries: {e}")
return {'new_entries': [], 'deleted_entries': []}
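    # TODO (sketch): deletion tracking is not implemented above. One possible
    # approach would be for the entry-delete handler to record removed IDs in an
    # audit table (e.g. a hypothetical 'deleted_entries' table) that this method
    # reads into 'deleted_entries' and clears after a successful sync.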
def auto_index_new_entries(self, entries: List[Dict]) -> bool:
"""Automatically index new entries"""
if not entries or not self.api_key:
return True
try:
# Run incremental indexing for new entries
config = {
"google_api_key": self.api_key,
"db_path": self.db_path,
"persist_directory": self.vector_db_path,
"collection_name": self.collection_name,
"embedding_model": "models/embedding-001",
"chunk_size": 800,
"chunk_overlap": 100,
"batch_size": 50
}
# Get date range for new entries
if entries:
dates = [entry['date'] for entry in entries]
start_date = min(dates)
end_date = max(dates)
# Use the USER-ISOLATED indexing approach
if UserIsolatedIndexingPipeline:
pipeline = UserIsolatedIndexingPipeline(
user_id=self.user_id,
google_api_key=config["google_api_key"],
base_db_path=os.path.dirname(config["db_path"]),
base_persist_directory=os.path.dirname(config["persist_directory"]),
embedding_model=config["embedding_model"],
chunk_size=config["chunk_size"],
chunk_overlap=config["chunk_overlap"],
batch_size=config["batch_size"]
)
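                # Note: run_incremental_indexing() takes no arguments, so the
                # pipeline determines what to index on its own (presumably by
                # re-scanning the user's database); the `entries` list is only
                # used here for logging counts.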
# Run incremental indexing
success = pipeline.run_incremental_indexing()
if success:
self.logger.info(f"Successfully indexed {len(entries)} new entries")
return True
else:
self.logger.warning(f"Indexing completed with warnings")
return False
else:
# Fallback to basic pipeline if UserIsolatedIndexingPipeline not available
self.logger.warning("UserIsolatedIndexingPipeline not available, falling back to basic pipeline")
if DiaryIndexingPipeline:
pipeline = DiaryIndexingPipeline()
pipeline.run()
return True
else:
self.logger.error("No indexing pipeline available")
return False
except Exception as e:
self.logger.error(f"Error auto-indexing new entries: {e}")
return False
def auto_remove_deleted_entries(self, deleted_entry_ids: List[int]) -> bool:
"""Automatically remove deleted entries from vector database"""
        if not deleted_entry_ids or not self.api_key:
            return True
        if DiaryEmbeddingAndStorage is None:
            self.logger.error("DiaryEmbeddingAndStorage is not available; cannot remove deleted entries")
            return False
try:
embedding_storage = DiaryEmbeddingAndStorage(
user_id=self.user_id,
api_key=self.api_key,
base_persist_directory=os.path.dirname(self.vector_db_path),
embedding_model="models/embedding-001"
)
# Remove each deleted entry
for entry_id in deleted_entry_ids:
filter_criteria = {"entry_id": str(entry_id)}
success = embedding_storage.delete_documents_by_metadata(filter_criteria)
self.logger.info(f"Removed entry {entry_id} from vector DB: {success}")
return True
except Exception as e:
self.logger.error(f"Error removing deleted entries: {e}")
return False
def run_sync(self) -> Dict[str, Any]:
"""
Run the complete synchronization process and return results.
This is the main entry point to be called from the UI.
"""
self.logger.info(f"🚀 Starting sync for user {self.user_id}...")
results = {'status': 'failed', 'indexed_count': 0, 'deleted_count': 0, 'error': None}
try:
# 1. Get changes from the database
last_sync_time = self.get_last_sync_time()
self.logger.info(f"Last sync time: {last_sync_time}")
changed_data = self.get_changed_entries(last_sync_time)
new_entries = changed_data.get('new_entries', [])
# deleted_ids = changed_data.get('deleted_entries', []) # Deletion not implemented yet
self.logger.info(f"Found {len(new_entries)} new entries to index.")
if not new_entries:
results['status'] = 'success'
results['message'] = "No new entries to index."
self.logger.info("✅ Sync finished: No new entries.")
self.update_last_sync_time() # Update sync time even if no changes
return results
# 2. Index new entries
index_success = self.auto_index_new_entries(new_entries)
if not index_success:
raise RuntimeError("Failed to index new entries.")
results['indexed_count'] = len(new_entries)
# 3. Update last sync time
self.update_last_sync_time()
results['status'] = 'success'
results['message'] = f"Successfully indexed {len(new_entries)} new entries."
self.logger.info(f"✅ Sync successful for user {self.user_id}.")
except Exception as e:
self.logger.error(f"❌ Sync failed for user {self.user_id}: {e}", exc_info=True)
results['error'] = str(e)
return results
def perform_auto_sync(self) -> Dict[str, Any]:
"""Perform automatic synchronization"""
try:
# Get changes since last sync
changes = self.get_changed_entries()
new_entries = changes['new_entries']
deleted_entries = changes['deleted_entries']
results = {
'success': True,
'new_entries_count': len(new_entries),
'deleted_entries_count': len(deleted_entries),
'errors': []
}
# Index new entries
if new_entries:
index_success = self.auto_index_new_entries(new_entries)
if not index_success:
results['errors'].append("Failed to index some new entries")
# Remove deleted entries
if deleted_entries:
delete_success = self.auto_remove_deleted_entries(deleted_entries)
if not delete_success:
results['errors'].append("Failed to remove some deleted entries")
# Update sync timestamp
self.update_last_sync_time()
results['success'] = len(results['errors']) == 0
return results
except Exception as e:
self.logger.error(f"Auto-sync failed: {e}")
return {
'success': False,
'new_entries_count': 0,
'deleted_entries_count': 0,
'errors': [str(e)]
}
# Streamlit helper functions
def run_auto_sync(user_id: Optional[int] = None) -> bool:
"""Run auto-sync and show results in Streamlit"""
if user_id is None:
user_id = getattr(st.session_state, 'current_user_id', 1)
try:
# Simple approach: call the indexing script directly
import subprocess
script_path = os.path.join(
os.path.dirname(__file__),
'..',
'Indexingstep',
'run_user_indexing.py'
)
if not os.path.exists(script_path):
return False
# Get virtual environment python
venv_python = os.path.join(
os.path.dirname(__file__),
'..',
'..',
'.venv',
'Scripts',
'python.exe'
)
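        # 'Scripts/python.exe' is the Windows virtualenv layout; on other platforms
        # this path won't exist and sys.executable is used as the fallback below.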
python_cmd = venv_python if os.path.exists(venv_python) else sys.executable
# Run incremental indexing for the user
result = subprocess.run(
[python_cmd, script_path, '--user-id', str(user_id)],
cwd=os.path.dirname(script_path),
capture_output=True,
text=True,
timeout=120 # 2 minutes timeout
)
        # A zero return code means the indexing script finished successfully
        return result.returncode == 0
    except Exception as e:
        logging.getLogger(__name__).error(f"Auto-sync failed: {e}")
        return False
def run_auto_sync_legacy(user_id: Optional[int] = None) -> bool:
"""Legacy auto-sync using the AutoSyncManager class"""
if user_id is None:
user_id = getattr(st.session_state, 'current_user_id', 1)
try:
sync_manager = AutoSyncManager(user_id)
results = sync_manager.perform_auto_sync()
if results['success']:
if results['new_entries_count'] > 0:
st.success(f"✅ Auto-sync: {results['new_entries_count']} new entries indexed")
return True
else:
st.warning(f"⚠️ Auto-sync completed with warnings: {', '.join(results['errors'])}")
return True
except Exception as e:
st.warning(f"⚠️ Auto-sync failed: {str(e)}")
return False
def schedule_auto_sync():
"""Schedule auto-sync to run periodically"""
# This could be enhanced with background tasks or scheduled jobs
# For now, we'll call it manually when entries are created/deleted
pass
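

# --- Illustrative usage (sketch, not part of the Streamlit app) ---
# The helper below shows one possible way to run auto-sync periodically, in the
# spirit of schedule_auto_sync() above. The 15-minute interval, the daemon
# threading.Timer approach, and the start_periodic_sync name are illustrative
# assumptions, not something the app currently uses.
def start_periodic_sync(user_id: int = 1, interval_seconds: int = 900) -> None:
    """Sketch: re-run AutoSyncManager.run_sync() for one user on a fixed interval."""
    import threading

    def _tick():
        try:
            AutoSyncManager(user_id).run_sync()
        finally:
            # Re-arm the timer so syncing continues until the process exits.
            timer = threading.Timer(interval_seconds, _tick)
            timer.daemon = True
            timer.start()

    first = threading.Timer(interval_seconds, _tick)
    first.daemon = True
    first.start()


if __name__ == "__main__":
    # Minimal manual run for debugging: sync user 1 once and print the result.
    # Assumes the user_1 database and the Indexingstep modules are available.
    print(AutoSyncManager(user_id=1).run_sync())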