Spaces:

appQQQ
/

ai-chatbot

Sleeping

App Files Files Community

ai-chatbot / app /api /documents.py

appQQQ

chore: upload app/api/documents.py

a0cc0b1 verified 19 days ago

Raw

History Blame Contribute Delete

3.52 kB

	"""文档管理 API. 阶段 2: 完整 upload / list / delete / get / chunks."""
	from __future__ import annotations

	import logging

	from fastapi import APIRouter, File, HTTPException, Query, UploadFile

	from app.core.errors import AppError, DocumentNotFoundError
	from app.models import db
	from app.models.schemas import DocumentChunk, IngestResult
	from app.services.ingestion import (
	delete_document,
	get_document,
	ingest_bytes,
	list_documents,
	)

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)
	# Router collecting the endpoints declared below; each route path is
	# prefixed with "/documents" and tagged "documents".
	router = APIRouter(prefix="/documents", tags=["documents"])


	@router.get("")
	async def list_docs(
	    limit: int = Query(default=100, ge=1, le=500),
	    offset: int = Query(default=0, ge=0),
	) -> dict:
	    """Return one page of documents.

	    Args:
	        limit: Maximum number of documents in the page (1-500).
	        offset: Number of leading documents to skip (>= 0).

	    Returns:
	        A dict with ``documents`` (the requested page) and ``total`` (the
	        number of documents fetched from ``list_documents`` for this call).
	    """
	    # Fetch enough rows to cover the skipped prefix plus the page itself.
	    # Requesting only ``limit`` rows and then slicing from ``offset`` would
	    # truncate (or empty) every page after the first.
	    docs = list_documents(limit=limit + offset)
	    return {
	        "documents": docs[offset : offset + limit],
	        # NOTE(review): this is the fetched count (at most limit + offset),
	        # not necessarily the number of stored documents - confirm whether
	        # the service exposes a real count.
	        "total": len(docs),
	    }


	@router.get("/{doc_id}")
	async def get_doc(doc_id: str) -> dict:
	    """Return the document identified by ``doc_id``.

	    Raises:
	        DocumentNotFoundError: If ``get_document`` returns ``None``.
	    """
	    found = get_document(doc_id)
	    if found is not None:
	        return found
	    raise DocumentNotFoundError(f"Document {doc_id} not found", code="doc_not_found")


	@router.get("/{doc_id}/chunks")
	async def list_chunks(
	    doc_id: str,
	    limit: int = Query(default=200, ge=1, le=1000),
	    offset: int = Query(default=0, ge=0),
	) -> dict:
	    """Return a page of a document's chunks, e.g. to preview the split result.

	    The original notes say rows come back ordered by chunk_index ascending
	    and carry text / token_count / page_no / heading / context_prefix; that
	    is up to ``db.chunk_get_by_doc`` and ``DocumentChunk`` and is not
	    enforced here - TODO confirm.

	    Args:
	        doc_id: Identifier of the document whose chunks are listed.
	        limit: Maximum number of chunks in the page (1-1000).
	        offset: Number of leading chunks to skip (>= 0).

	    Returns:
	        A dict with ``doc_id``, ``chunks`` (serialized ``DocumentChunk``
	        entries), ``total`` (the document's ``chunk_count``, falling back
	        to the page length when that key is absent) and ``returned`` (the
	        page length).

	    Raises:
	        DocumentNotFoundError: If ``get_document`` returns ``None``.
	    """
	    document = get_document(doc_id)
	    if document is None:
	        raise DocumentNotFoundError(f"Document {doc_id} not found", code="doc_not_found")

	    # Fetch through the end of the requested window, then drop the prefix
	    # that ``offset`` asks us to skip.
	    window_end = offset + limit
	    page = db.chunk_get_by_doc(doc_id, limit=window_end)[offset:window_end]
	    serialized = [DocumentChunk(**row).model_dump() for row in page]

	    return {
	        "doc_id": doc_id,
	        "chunks": serialized,
	        "total": document.get("chunk_count", len(page)),
	        "returned": len(page),
	    }


	@router.post("/upload", response_model=IngestResult)
	async def upload(file: UploadFile = File(...)) -> IngestResult:
	    """Upload a file and hand its bytes to the ingestion service.

	    Per the original notes, PDF / DOCX / image inputs are supported and
	    ``ingest_bytes`` performs SHA256 de-duplication, parsing, chunking,
	    embedding and storage internally; none of that is visible in this
	    function - TODO confirm against ``app.services.ingestion``.

	    Args:
	        file: The uploaded file; it must carry a non-empty filename.

	    Returns:
	        Whatever ``ingest_bytes`` returns for the uploaded content.

	    Raises:
	        HTTPException: 400 when the filename is missing or empty, 413 when
	            the payload exceeds 50 MB, 502 when ingestion fails with an
	            unexpected exception.
	        AppError: Re-raised unchanged after logging.
	    """
	    # Reject both a missing and an empty filename; an empty string would
	    # otherwise slip past an ``is None`` check and reach the ingestion step.
	    if not file.filename:
	        raise HTTPException(status_code=400, detail="filename missing")

	    # Size cap: 50 MB (the original note considers this fine for a free
	    # HF Space with 16 GB RAM).
	    MAX_BYTES = 50 * 1024 * 1024
	    # Read one byte past the cap so an oversized upload is detectable
	    # without reading the whole payload into memory.
	    content = await file.read(MAX_BYTES + 1)
	    if len(content) > MAX_BYTES:
	        raise HTTPException(
	            status_code=413,
	            detail=f"File too large (> {MAX_BYTES // 1024 // 1024}MB). "
	            "Consider chunked upload via S3/R2 (TODO 阶段 5).",
	        )

	    try:
	        result = await ingest_bytes(content=content, filename=file.filename)
	    except AppError:
	        # Business errors are logged and re-raised untouched so their own
	        # attributes survive; the original note says a global AppError
	        # handler turns them into the response.
	        logger.exception("Upload failed (AppError)")
	        raise
	    except Exception as e:  # noqa: BLE001
	        # Fallback for anything unexpected: wrap it in an HTTP 502.
	        # NOTE(review): the detail echoes the raw exception text to the
	        # client - confirm that is acceptable.
	        logger.exception("Upload failed (unexpected)")
	        raise HTTPException(status_code=502, detail=f"Ingest failed: {e}") from e

	    return result


	@router.delete("/{doc_id}")
	async def delete_doc(doc_id: str) -> dict:
	    """Delete the document identified by ``doc_id``.

	    Returns:
	        A dict echoing ``doc_id`` with ``deleted`` set to ``True``.

	    Raises:
	        DocumentNotFoundError: If ``delete_document`` returns a falsy result.
	    """
	    removed = await delete_document(doc_id)
	    if removed:
	        return {"doc_id": doc_id, "deleted": True}
	    raise DocumentNotFoundError(f"Document {doc_id} not found", code="doc_not_found")