Youngger9765
fix: Python 3.9 compatibility for HuggingFace Spaces
b830e05
"""API endpoints for database statistics"""
from typing import List
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
from backend.database import get_db
router = APIRouter(prefix="/api/stats", tags=["stats"])
class DocumentStats(BaseModel):
id: int
title: str
pages: int
bytes: int
chunks_count: int
created_at: str
class DatabaseStats(BaseModel):
total_datasources: int
total_documents: int
total_chunks: int
total_embeddings: int
total_bytes: int
documents: List[DocumentStats]
@router.get("/", response_model=DatabaseStats)
async def get_database_stats(db: AsyncSession = Depends(get_db)):
"""
Get database statistics including counts and document details
Returns:
DatabaseStats with all statistics
"""
# Get total counts
datasources_count = await db.scalar(text("SELECT COUNT(*) FROM datasources"))
documents_count = await db.scalar(text("SELECT COUNT(*) FROM documents"))
chunks_count = await db.scalar(text("SELECT COUNT(*) FROM chunks"))
embeddings_count = await db.scalar(text("SELECT COUNT(*) FROM embeddings"))
total_bytes = await db.scalar(text("SELECT COALESCE(SUM(bytes), 0) FROM documents"))
# Get document details with chunk counts
query = text("""
SELECT
d.id,
d.title,
d.pages,
d.bytes,
d.created_at,
COUNT(c.id) as chunks_count
FROM documents d
LEFT JOIN chunks c ON d.id = c.doc_id
GROUP BY d.id, d.title, d.pages, d.bytes, d.created_at
ORDER BY d.created_at DESC
""")
result = await db.execute(query)
rows = result.fetchall()
documents = [
DocumentStats(
id=row.id,
title=row.title,
pages=row.pages or 0,
bytes=row.bytes or 0,
chunks_count=row.chunks_count or 0,
created_at=str(row.created_at),
)
for row in rows
]
return DatabaseStats(
total_datasources=datasources_count or 0,
total_documents=documents_count or 0,
total_chunks=chunks_count or 0,
total_embeddings=embeddings_count or 0,
total_bytes=total_bytes or 0,
documents=documents,
)
class ChunkDetail(BaseModel):
id: int
ordinal: int
text: str
text_length: int
document_title: str
@router.get("/chunks/{doc_id}", response_model=List[ChunkDetail])
async def get_document_chunks(doc_id: int, db: AsyncSession = Depends(get_db)):
"""
Get all chunks for a specific document
Args:
doc_id: Document ID
Returns:
List of chunks with details
"""
query = text("""
SELECT
c.id,
c.ordinal,
c.text,
LENGTH(c.text) as text_length,
d.title as document_title
FROM chunks c
JOIN documents d ON c.doc_id = d.id
WHERE c.doc_id = :doc_id
ORDER BY c.ordinal
""")
result = await db.execute(query, {"doc_id": doc_id})
rows = result.fetchall()
return [
ChunkDetail(
id=row.id,
ordinal=row.ordinal,
text=row.text,
text_length=row.text_length,
document_title=row.document_title,
)
for row in rows
]
@router.delete("/documents/{doc_id}")
async def delete_document(doc_id: int, db: AsyncSession = Depends(get_db)):
"""
Delete a document and all its related chunks and embeddings
This will cascade delete:
- All chunks associated with the document
- All embeddings associated with those chunks
- The document itself
Args:
doc_id: Document ID to delete
Returns:
Success message with deletion counts
"""
# Get chunk count before deletion
chunks_result = await db.execute(
text("SELECT COUNT(*) FROM chunks WHERE doc_id = :doc_id"), {"doc_id": doc_id}
)
chunks_count = chunks_result.scalar() or 0
# Get embeddings count before deletion
embeddings_result = await db.execute(
text("""
SELECT COUNT(*) FROM embeddings
WHERE chunk_id IN (SELECT id FROM chunks WHERE doc_id = :doc_id)
"""),
{"doc_id": doc_id},
)
embeddings_count = embeddings_result.scalar() or 0
# Delete embeddings first (FK constraint)
await db.execute(
text("""
DELETE FROM embeddings
WHERE chunk_id IN (SELECT id FROM chunks WHERE doc_id = :doc_id)
"""),
{"doc_id": doc_id},
)
# Delete chunks
await db.execute(text("DELETE FROM chunks WHERE doc_id = :doc_id"), {"doc_id": doc_id})
# Delete document
await db.execute(text("DELETE FROM documents WHERE id = :doc_id"), {"doc_id": doc_id})
await db.commit()
return {
"success": True,
"message": f"成功刪除文檔 ID {doc_id}",
"deleted": {"document": 1, "chunks": chunks_count, "embeddings": embeddings_count},
}