Spaces:

baveshraam
/

open-notebook

Sleeping

App Files Files Community

open-notebook / api /routers /embedding_rebuild.py

baveshraam

FIX: SurrealDB 2.0 migration syntax and Frontend/CORS link

f871fed about 1 month ago

raw

history blame contribute delete

7.03 kB

	from fastapi import APIRouter, HTTPException
	from loguru import logger
	from surreal_commands import get_command_status

	from api.command_service import CommandService
	from api.models import (
	RebuildProgress,
	RebuildRequest,
	RebuildResponse,
	RebuildStats,
	RebuildStatusResponse,
	)
	from open_notebook.database.repository import repo_query

	router = APIRouter()


	@router.post("/rebuild", response_model=RebuildResponse)
	async def start_rebuild(request: RebuildRequest):
	"""
	Start a background job to rebuild embeddings.

	- mode: "existing" (re-embed items with embeddings) or "all" (embed everything)
	- include_sources: Include sources in rebuild (default: true)
	- include_notes: Include notes in rebuild (default: true)
	- include_insights: Include insights in rebuild (default: true)

	Returns command ID to track progress and estimated item count.
	"""
	try:
	logger.info(f"Starting rebuild request: mode={request.mode}")

	# Import commands to ensure they're registered
	import commands.embedding_commands # noqa: F401

	# Estimate total items (quick count query)
	# This is a rough estimate before the command runs
	total_estimate = 0

	if request.include_sources:
	if request.mode == "existing":
	# Count sources with embeddings
	result = await repo_query(
	"""
	SELECT VALUE count(array::distinct(
	SELECT VALUE source.id
	FROM source_embedding
	WHERE embedding != none AND array::len(embedding) > 0
	)) as count FROM {}
	"""
	)
	else:
	# Count all sources with content
	result = await repo_query(
	"SELECT VALUE count() as count FROM source WHERE full_text != none GROUP ALL"
	)

	if result and isinstance(result[0], dict):
	total_estimate += result[0].get("count", 0)
	elif result:
	total_estimate += result[0] if isinstance(result[0], int) else 0

	if request.include_notes:
	if request.mode == "existing":
	result = await repo_query(
	"SELECT VALUE count() as count FROM note WHERE embedding != none AND array::len(embedding) > 0 GROUP ALL"
	)
	else:
	result = await repo_query(
	"SELECT VALUE count() as count FROM note WHERE content != none GROUP ALL"
	)

	if result and isinstance(result[0], dict):
	total_estimate += result[0].get("count", 0)
	elif result:
	total_estimate += result[0] if isinstance(result[0], int) else 0

	if request.include_insights:
	if request.mode == "existing":
	result = await repo_query(
	"SELECT VALUE count() as count FROM source_insight WHERE embedding != none AND array::len(embedding) > 0 GROUP ALL"
	)
	else:
	result = await repo_query(
	"SELECT VALUE count() as count FROM source_insight GROUP ALL"
	)

	if result and isinstance(result[0], dict):
	total_estimate += result[0].get("count", 0)
	elif result:
	total_estimate += result[0] if isinstance(result[0], int) else 0

	logger.info(f"Estimated {total_estimate} items to process")

	# Submit command
	command_id = await CommandService.submit_command_job(
	"open_notebook",
	"rebuild_embeddings",
	{
	"mode": request.mode,
	"include_sources": request.include_sources,
	"include_notes": request.include_notes,
	"include_insights": request.include_insights,
	},
	)

	logger.info(f"Submitted rebuild command: {command_id}")

	return RebuildResponse(
	command_id=command_id,
	total_items=total_estimate,
	message=f"Rebuild operation started. Estimated {total_estimate} items to process.",
	)

	except Exception as e:
	logger.error(f"Failed to start rebuild: {e}")
	logger.exception(e)
	raise HTTPException(
	status_code=500, detail=f"Failed to start rebuild operation: {str(e)}"
	)


	@router.get("/rebuild/{command_id}/status", response_model=RebuildStatusResponse)
	async def get_rebuild_status(command_id: str):
	"""
	Get the status of a rebuild operation.

	Returns:
	- status: queued, running, completed, failed
	- progress: processed count, total count, percentage
	- stats: breakdown by type (sources, notes, insights, failed)
	- timestamps: started_at, completed_at
	"""
	try:
	# Get command status from surreal_commands
	status = await get_command_status(command_id)

	if not status:
	raise HTTPException(status_code=404, detail="Rebuild command not found")

	# Build response based on status
	response = RebuildStatusResponse(
	command_id=command_id,
	status=status.status,
	)

	# Extract metadata from command result
	if status.result and isinstance(status.result, dict):
	result = status.result

	# Build progress info
	if "total_items" in result and "processed_items" in result:
	total = result["total_items"]
	processed = result["processed_items"]
	response.progress = RebuildProgress(
	processed=processed,
	total=total,
	percentage=round((processed / total * 100) if total > 0 else 0, 2),
	)

	# Build stats
	response.stats = RebuildStats(
	sources=result.get("sources_processed", 0),
	notes=result.get("notes_processed", 0),
	insights=result.get("insights_processed", 0),
	failed=result.get("failed_items", 0),
	)

	# Add timestamps
	if hasattr(status, "created") and status.created:
	response.started_at = str(status.created)
	if hasattr(status, "updated") and status.updated:
	response.completed_at = str(status.updated)

	# Add error message if failed
	if status.status == "failed" and status.result and isinstance(status.result, dict):
	response.error_message = status.result.get(
	"error_message", "Unknown error"
	)

	return response

	except HTTPException:
	raise
	except Exception as e:
	logger.error(f"Failed to get rebuild status: {e}")
	logger.exception(e)
	raise HTTPException(
	status_code=500, detail=f"Failed to get rebuild status: {str(e)}"
	)