# feat: Introduce new backend architecture with notebooks, sources, chat, and
# CLaRa models, alongside database schema and updated deployment scripts, while
# removing old frontend, deployment files, and previous backend components.
# (commit 88f8604)
"""
Antigravity Notebook - Sources Router
API endpoints for source management (PDF upload, URL scraping, text ingestion).
"""
from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File, Form
from sqlalchemy.orm import Session
from typing import List
from uuid import UUID
from backend.database import get_db, Notebook, Source, LatentTensor
from backend.models.schemas import SourceResponse, SourceURL, SourceText, IngestionStatus
from backend.models.clara import get_clara_model
from backend.services.storage import get_storage_service
from backend.services.ingestion import get_ingestion_service
router = APIRouter(prefix="/sources", tags=["sources"])
@router.post("/notebooks/{notebook_id}/sources/upload", response_model=SourceResponse)
async def upload_pdf(
    notebook_id: UUID,
    file: UploadFile = File(...),
    db: Session = Depends(get_db)
):
    """Upload and ingest a PDF file.

    Args:
        notebook_id: Notebook the source is attached to.
        file: Uploaded file; must carry a ``.pdf`` filename.
        db: SQLAlchemy session (injected).

    Returns:
        SourceResponse including tensor count and total token count.

    Raises:
        HTTPException: 404 if the notebook is missing, 400 for a non-PDF
            upload or an ingestion validation error, 500 for anything else.
    """
    # Verify notebook exists before touching the upload at all.
    notebook = db.query(Notebook).filter(Notebook.id == notebook_id).first()
    if not notebook:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Notebook {notebook_id} not found"
        )
    # Validate file type. UploadFile.filename may be None (e.g. a raw
    # multipart part without a filename), so guard before calling .lower()
    # — otherwise this line raised AttributeError and surfaced as a 500.
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Only PDF files are supported"
        )
    try:
        # Read the whole upload into memory; ingestion consumes raw bytes.
        file_content = await file.read()
        # Get services
        clara = get_clara_model()
        storage = get_storage_service()
        ingestion = get_ingestion_service(clara, storage)
        # Ingest PDF
        source = ingestion.ingest_pdf(
            db=db,
            notebook_id=notebook_id,
            file_content=file_content,
            filename=file.filename
        )
        # Tensor stats for the response payload.
        tensors = db.query(LatentTensor).filter(LatentTensor.source_id == source.id).all()
        tensor_count = len(tensors)
        total_tokens = sum(t.token_count for t in tensors)
        return SourceResponse(
            id=source.id,
            notebook_id=source.notebook_id,
            source_type=source.source_type,
            filename=source.filename,
            url=source.url,
            created_at=source.created_at,
            metadata=source.metadata or {},
            tensor_count=tensor_count,
            total_tokens=total_tokens
        )
    except HTTPException:
        # Don't let the catch-all below rewrap deliberate HTTP errors as 500s.
        raise
    except ValueError as e:
        # Ingestion-level validation problems surface as client errors.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(e)
        )
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to process PDF: {str(e)}"
        )
@router.post("/notebooks/{notebook_id}/sources/url", response_model=SourceResponse)
def add_url_source(
    notebook_id: UUID,
    url_data: SourceURL,
    db: Session = Depends(get_db)
):
    """Add a URL as a source"""
    # Guard: the target notebook must exist.
    if db.query(Notebook).filter(Notebook.id == notebook_id).first() is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Notebook {notebook_id} not found"
        )
    try:
        # Wire up the services the ingestion pipeline needs.
        clara_model = get_clara_model()
        storage_service = get_storage_service()
        ingestion_service = get_ingestion_service(clara_model, storage_service)
        # Scrape and ingest the URL content.
        source = ingestion_service.ingest_url(
            db=db,
            notebook_id=notebook_id,
            url=str(url_data.url),
            title=url_data.title
        )
        # Collect tensor stats for the response payload.
        tensor_rows = (
            db.query(LatentTensor)
            .filter(LatentTensor.source_id == source.id)
            .all()
        )
        token_total = sum(row.token_count for row in tensor_rows)
        return SourceResponse(
            id=source.id,
            notebook_id=source.notebook_id,
            source_type=source.source_type,
            filename=source.filename,
            url=source.url,
            created_at=source.created_at,
            metadata=source.metadata or {},
            tensor_count=len(tensor_rows),
            total_tokens=token_total
        )
    except ValueError as exc:
        # Validation failures from ingestion map to client errors.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(exc)
        )
    except Exception as exc:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to process URL: {str(exc)}"
        )
@router.post("/notebooks/{notebook_id}/sources/text", response_model=SourceResponse)
def add_text_source(
    notebook_id: UUID,
    text_data: SourceText,
    db: Session = Depends(get_db)
):
    """Add plain text as a source"""
    # Guard: the target notebook must exist.
    if db.query(Notebook).filter(Notebook.id == notebook_id).first() is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Notebook {notebook_id} not found"
        )
    try:
        # Wire up the services the ingestion pipeline needs.
        clara_model = get_clara_model()
        storage_service = get_storage_service()
        ingestion_service = get_ingestion_service(clara_model, storage_service)
        # Ingest the raw text body.
        source = ingestion_service.ingest_text(
            db=db,
            notebook_id=notebook_id,
            content=text_data.content,
            title=text_data.title
        )
        # Collect tensor stats for the response payload.
        tensor_rows = (
            db.query(LatentTensor)
            .filter(LatentTensor.source_id == source.id)
            .all()
        )
        token_total = sum(row.token_count for row in tensor_rows)
        return SourceResponse(
            id=source.id,
            notebook_id=source.notebook_id,
            source_type=source.source_type,
            filename=source.filename,
            url=source.url,
            created_at=source.created_at,
            metadata=source.metadata or {},
            tensor_count=len(tensor_rows),
            total_tokens=token_total
        )
    except ValueError as exc:
        # Validation failures from ingestion map to client errors.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(exc)
        )
    except Exception as exc:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to process text: {str(exc)}"
        )
@router.get("/notebooks/{notebook_id}/sources", response_model=List[SourceResponse])
def list_sources(notebook_id: UUID, db: Session = Depends(get_db)):
    """List all sources in a notebook with per-source tensor statistics.

    Args:
        notebook_id: Notebook whose sources are listed.
        db: SQLAlchemy session (injected).

    Returns:
        List of SourceResponse, one per source, in query order.

    Raises:
        HTTPException: 404 if the notebook does not exist.
    """
    # Verify notebook exists
    notebook = db.query(Notebook).filter(Notebook.id == notebook_id).first()
    if not notebook:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Notebook {notebook_id} not found"
        )
    sources = db.query(Source).filter(Source.notebook_id == notebook_id).all()
    # Fetch tensors for every source in ONE query instead of one query per
    # source (the previous loop was an N+1 pattern), then aggregate in Python.
    source_ids = [source.id for source in sources]
    stats = {sid: [0, 0] for sid in source_ids}  # source_id -> [count, tokens]
    if source_ids:
        for tensor in db.query(LatentTensor).filter(
            LatentTensor.source_id.in_(source_ids)
        ):
            entry = stats[tensor.source_id]
            entry[0] += 1
            entry[1] += tensor.token_count
    return [
        SourceResponse(
            id=source.id,
            notebook_id=source.notebook_id,
            source_type=source.source_type,
            filename=source.filename,
            url=source.url,
            created_at=source.created_at,
            metadata=source.metadata or {},
            tensor_count=stats[source.id][0],
            total_tokens=stats[source.id][1]
        )
        for source in sources
    ]
@router.get("/sources/{source_id}", response_model=SourceResponse)
def get_source(source_id: UUID, db: Session = Depends(get_db)):
    """Get source details"""
    # Look up the source; 404 when it doesn't exist.
    source = db.query(Source).filter(Source.id == source_id).first()
    if source is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Source {source_id} not found"
        )
    # Aggregate tensor stats for the response payload.
    rows = db.query(LatentTensor).filter(LatentTensor.source_id == source_id).all()
    token_total = sum(row.token_count for row in rows)
    return SourceResponse(
        id=source.id,
        notebook_id=source.notebook_id,
        source_type=source.source_type,
        filename=source.filename,
        url=source.url,
        created_at=source.created_at,
        metadata=source.metadata or {},
        tensor_count=len(rows),
        total_tokens=token_total
    )
@router.delete("/sources/{source_id}", status_code=status.HTTP_204_NO_CONTENT)
def delete_source(source_id: UUID, db: Session = Depends(get_db)):
    """Delete a source and all associated tensors"""
    # Look up the source; 404 when it doesn't exist.
    target = db.query(Source).filter(Source.id == source_id).first()
    if target is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Source {source_id} not found"
        )
    # Remove tensor files from disk first, then delete the source row —
    # the DB cascade takes care of the latent_tensors rows.
    get_storage_service().delete_source_tensors(db, source_id, target.notebook_id)
    db.delete(target)
    db.commit()
    return None