ai-chatbot / app /api /documents.py
appQQQ's picture
chore: upload app/api/documents.py
a0cc0b1 verified
Raw
History Blame Contribute Delete
3.52 kB
"""文档管理 API. 阶段 2: 完整 upload / list / delete / get / chunks."""
from __future__ import annotations
import logging
from fastapi import APIRouter, File, HTTPException, Query, UploadFile
from app.core.errors import AppError, DocumentNotFoundError
from app.models import db
from app.models.schemas import DocumentChunk, IngestResult
from app.services.ingestion import (
delete_document,
get_document,
ingest_bytes,
list_documents,
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Router for every endpoint in this file; paths are prefixed with /documents.
router = APIRouter(tags=["documents"], prefix="/documents")
@router.get("")
async def list_docs(
    limit: int = Query(default=100, ge=1, le=500),
    offset: int = Query(default=0, ge=0),
) -> dict:
    """Return one page of documents.

    Args:
        limit: Maximum number of documents in the returned page (1-500).
        offset: Number of leading documents to skip before the page starts.

    Returns:
        A dict with two keys: ``documents`` is the requested page, and
        ``total`` is the number of rows fetched for this request. ``total``
        is therefore capped at ``offset + limit`` and is not guaranteed to be
        the count of all stored documents.
    """
    # Fetch enough rows to cover the skipped prefix plus the page itself.
    # Requesting only `limit` rows and then slicing from `offset` returned an
    # empty page whenever offset >= limit.
    # NOTE(review): assumes list_documents accepts limit values up to
    # offset + limit and returns rows in a stable order - confirm.
    docs = list_documents(limit=limit + offset)
    return {
        "documents": docs[offset : offset + limit],
        "total": len(docs),
    }
@router.get("/{doc_id}")
async def get_doc(doc_id: str) -> dict:
    """Return the document record for ``doc_id``.

    Raises:
        DocumentNotFoundError: If ``get_document`` returns ``None``.
    """
    record = get_document(doc_id)
    if record is not None:
        return record
    raise DocumentNotFoundError(f"Document {doc_id} not found", code="doc_not_found")
@router.get("/{doc_id}/chunks")
async def list_chunks(
    doc_id: str,
    limit: int = Query(default=200, ge=1, le=1000),
    offset: int = Query(default=0, ge=0),
) -> dict:
    """Return a page of a document's chunks so the frontend can preview the split.

    Per the original author's note, chunks come back ordered by chunk_index
    ascending and include text / token_count / page_no / heading /
    context_prefix. Both are determined by ``db.chunk_get_by_doc`` and
    ``DocumentChunk``, which are not visible in this file.

    Args:
        doc_id: Identifier of the document whose chunks are listed.
        limit: Maximum number of chunks in the page (1-1000).
        offset: Number of leading chunks to skip.

    Returns:
        A dict with ``doc_id``, ``chunks`` (serialized ``DocumentChunk``
        models), ``total`` (the document's ``chunk_count`` if present,
        otherwise the page size) and ``returned`` (the page size).

    Raises:
        DocumentNotFoundError: If the document does not exist.
    """
    parent = get_document(doc_id)
    if parent is None:
        raise DocumentNotFoundError(f"Document {doc_id} not found", code="doc_not_found")

    # Fetch everything up to the end of the requested window, then drop the
    # skipped prefix by slicing.
    window_end = offset + limit
    fetched = db.chunk_get_by_doc(doc_id, limit=window_end)
    page = fetched[offset:window_end]

    serialized = [DocumentChunk(**row).model_dump() for row in page]
    return {
        "doc_id": doc_id,
        "chunks": serialized,
        "total": parent.get("chunk_count", len(page)),
        "returned": len(page),
    }
@router.post("/upload", response_model=IngestResult)
async def upload(file: UploadFile = File(...)) -> IngestResult:
    """Upload and ingest a document. Supports PDF / DOCX / images.

    Flow (per the original author's note): read bytes -> ingestion
    orchestrator (internally SHA256 dedup + parse + chunk + embed + store).

    Args:
        file: The uploaded file from the multipart request.

    Returns:
        The result produced by ``ingest_bytes``.

    Raises:
        HTTPException: 400 if the filename is missing or empty, 413 if the
            upload exceeds the size cap, 502 if ingestion fails with an
            unexpected exception.
        AppError: Re-raised unchanged when ingestion fails with a business
            error.
    """
    # Reject an empty filename as well as a missing one; previously only
    # None was caught and "" was passed on to ingest_bytes.
    if not file.filename:
        raise HTTPException(status_code=400, detail="filename missing")

    # Size cap: 50 MB (original note: sufficient for a free HF Space with 16 GB RAM).
    max_bytes = 50 * 1024 * 1024
    # Read at most one byte more than the cap - enough to detect an oversized upload.
    content = await file.read(max_bytes + 1)
    if len(content) > max_bytes:
        raise HTTPException(
            status_code=413,
            detail=f"File too large (> {max_bytes // 1024 // 1024}MB). "
            "Consider chunked upload via S3/R2 (TODO 阶段 5).",
        )

    try:
        result = await ingest_bytes(content=content, filename=file.filename)
    except AppError:
        # Business errors (e.g. IngestionFailedError) are re-raised untouched so
        # the global AppError handler processes them, preserving
        # .code / .message / .detail (including diagnostics such as last_traceback).
        logger.exception("Upload failed (AppError)")
        raise
    except Exception as e:  # noqa: BLE001
        # Fallback: wrap any unknown exception in an HTTP 502.
        logger.exception("Upload failed (unexpected)")
        raise HTTPException(status_code=502, detail=f"Ingest failed: {e}") from e
    return result
@router.delete("/{doc_id}")
async def delete_doc(doc_id: str) -> dict:
    """Delete the document identified by ``doc_id``.

    Returns:
        ``{"doc_id": <id>, "deleted": True}`` when ``delete_document``
        returns a truthy result.

    Raises:
        DocumentNotFoundError: If ``delete_document`` returns a falsy result.
    """
    removed = await delete_document(doc_id)
    if removed:
        return {"doc_id": doc_id, "deleted": True}
    raise DocumentNotFoundError(f"Document {doc_id} not found", code="doc_not_found")