# tungtung-chatbot/app/api/ingest.py
# diminch's picture
# Deploy chatbot version 1
# eaf961b
import hmac
import io
import json

from fastapi import APIRouter, UploadFile, File, Header, HTTPException

from app.core.config import INTERNAL_API_KEY
from app.rag.chunker import chunk_text
from app.rag.store import save_vector_store
# pypdf and python-docx are optional at import time. Each package is imported
# under its own guard so that a missing one does not prevent the other from
# being imported; the name of a missing parser (PdfReader / Document) simply
# stays undefined. The warning is printed once if either import failed.
_parser_import_failed = False
try:
    from pypdf import PdfReader
except ImportError:
    _parser_import_failed = True
try:
    from docx import Document
except ImportError:
    _parser_import_failed = True
if _parser_import_failed:
    print("CRITICAL WARNING: Thiếu thư viện 'pypdf' hoặc 'python-docx'. Chức năng đọc file sẽ lỗi.")
# Router object for this module; the /upload endpoint below registers on it.
router = APIRouter()
# File extensions the upload endpoint can turn into plain text.
_SUPPORTED_EXTENSIONS = (".pdf", ".docx", ".json", ".txt", ".md")


def _api_key_matches(provided, expected) -> bool:
    """Return True when *provided* equals the configured key *expected*.

    The comparison is constant-time so that response timing does not reveal
    how much of a guessed key was correct. A value that is not a string
    (for example an unset key) never matches.
    """
    if not isinstance(provided, str) or not isinstance(expected, str):
        return False
    return hmac.compare_digest(provided.encode("utf-8"), expected.encode("utf-8"))


def _extract_text(filename: str, content_bytes: bytes) -> str:
    """Convert the raw bytes of a supported upload into plain text.

    *filename* must already be lower-cased and end with one of
    ``_SUPPORTED_EXTENSIONS``. Any parser or decoding error propagates to
    the caller.
    """
    if filename.endswith(".pdf"):
        reader = PdfReader(io.BytesIO(content_bytes))
        page_texts = (page.extract_text() for page in reader.pages)
        # Pages for which extract_text() returns nothing are skipped; every
        # kept page is terminated by a newline.
        return "".join(f"{text}\n" for text in page_texts if text)
    if filename.endswith(".docx"):
        doc = Document(io.BytesIO(content_bytes))
        # Keep only paragraphs that contain non-whitespace text.
        return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
    if filename.endswith(".json"):
        # Parse and re-serialise: rejects invalid JSON and yields an indented,
        # non-ASCII-escaped text form.
        return json.dumps(json.loads(content_bytes), ensure_ascii=False, indent=2)
    # .txt / .md: "utf-8-sig" decodes plain UTF-8 and also drops a leading
    # byte-order mark so it does not end up inside the first chunk.
    return content_bytes.decode("utf-8-sig")


@router.post("/upload")
async def upload_document(
    file: UploadFile = File(...),
    x_api_key: str = Header(...),
):
    """Ingest an uploaded document: extract its text, chunk it and store it.

    Args:
        file: The uploaded document (PDF, DOCX, JSON, TXT or MD, chosen by
            file extension).
        x_api_key: Value of the API-key request header; must equal
            ``INTERNAL_API_KEY``.

    Returns:
        A dict with ``status`` ``"success"`` plus the original filename and
        number of chunks, or ``status`` ``"error"`` with a message when the
        format is unsupported or fewer than 10 characters of text were read.

    Raises:
        HTTPException: 403 when the API key does not match; 500 when
            extraction, chunking or storing fails.
    """
    if not _api_key_matches(x_api_key, INTERNAL_API_KEY):
        raise HTTPException(status_code=403, detail="Sai mã bảo mật (API Key).")

    # The filename of an upload may be absent; treat that as an empty name so
    # it falls through to the "unsupported format" response instead of
    # crashing on None.lower().
    filename = (file.filename or "").lower()

    # Reject unsupported formats before reading the body into memory.
    if not filename.endswith(_SUPPORTED_EXTENSIONS):
        return {"status": "error", "message": "Định dạng file không hỗ trợ. Chỉ nhận: PDF, DOCX, JSON, TXT."}

    content_bytes = await file.read()

    try:
        text_content = _extract_text(filename, content_bytes)

        # Validate content: require at least 10 non-whitespace-trimmed characters.
        if len(text_content.strip()) < 10:
            return {"status": "error", "message": "File không có nội dung văn bản đọc được."}

        # Processing pipeline: split into chunks, then persist them.
        chunks = chunk_text(text_content)
        save_vector_store(chunks)

        return {
            "status": "success",
            "filename": file.filename,
            "chunks_created": len(chunks),
            "message": "Đã học xong tài liệu mới!"
        }
    except Exception as e:
        # Boundary handler: log and surface any processing failure as a 500,
        # keeping the original exception chained for debugging.
        print(f"Ingest Error: {e}")
        raise HTTPException(status_code=500, detail=f"Lỗi xử lý file: {str(e)}") from e