Spaces:
Sleeping
Sleeping
| from fastapi import APIRouter, UploadFile, File, Header, HTTPException | |
| from app.core.config import INTERNAL_API_KEY | |
| from app.rag.chunker import chunk_text | |
| from app.rag.store import save_vector_store | |
| import io | |
| import json | |
| try: | |
| from pypdf import PdfReader | |
| from docx import Document | |
| except ImportError: | |
| print("CRITICAL WARNING: Thiếu thư viện 'pypdf' hoặc 'python-docx'. Chức năng đọc file sẽ lỗi.") | |
| router = APIRouter() | |
_SUPPORTED_EXTENSIONS = (".pdf", ".docx", ".json", ".txt", ".md")


def _extract_text(filename: str, content_bytes: bytes) -> str:
    """Return the plain text contained in an uploaded document.

    Args:
        filename: Lower-cased upload name; its extension selects the parser.
        content_bytes: Raw bytes of the uploaded file.

    Returns:
        The text extracted from the document.

    Raises:
        ValueError: If the extension is not one of ``_SUPPORTED_EXTENSIONS``.
        Exception: Whatever the underlying parser raises on malformed input
            (for example ``json.JSONDecodeError`` or ``UnicodeDecodeError``).
    """
    if filename.endswith(".pdf"):
        reader = PdfReader(io.BytesIO(content_bytes))
        # extract_text() can come back empty for a page; skip those pages.
        page_texts = (page.extract_text() for page in reader.pages)
        return "".join(f"{text}\n" for text in page_texts if text)

    if filename.endswith(".docx"):
        doc = Document(io.BytesIO(content_bytes))
        return "\n".join(
            para.text for para in doc.paragraphs if para.text.strip()
        )

    if filename.endswith(".json"):
        # Round-trip through the parser so invalid JSON is rejected and the
        # stored text is consistently pretty-printed.
        data = json.loads(content_bytes)
        return json.dumps(data, ensure_ascii=False, indent=2)

    if filename.endswith((".txt", ".md")):
        # "utf-8-sig" decodes plain UTF-8 identically but drops a leading
        # BOM, which would otherwise end up inside the first chunk.
        return content_bytes.decode("utf-8-sig")

    raise ValueError(f"Unsupported file type: {filename}")


async def upload_document(
    file: UploadFile = File(...),
    x_api_key: str = Header(...)
):
    """Ingest an uploaded document: extract its text, chunk it and store it.

    Args:
        file: The uploaded document. Supported extensions are ``.pdf``,
            ``.docx``, ``.json``, ``.txt`` and ``.md`` (case-insensitive).
        x_api_key: Value of the API-key request header; must equal
            ``INTERNAL_API_KEY``.

    Returns:
        A dict with ``"status": "success"``, the original filename, the number
        of chunks created and a message; or a dict with ``"status": "error"``
        and a message when the file type is unsupported or fewer than 10
        characters of text could be extracted.

    Raises:
        HTTPException: 403 when the API key does not match; 500 when parsing,
            chunking or storing the document fails.
    """
    # Local import keeps this block self-contained (no file-level import edit).
    import hmac

    # Constant-time comparison avoids leaking key contents through timing.
    # Comparing encoded bytes is required because compare_digest rejects
    # non-ASCII str; a non-string configured key can never match, as before.
    expected_key = INTERNAL_API_KEY
    key_matches = isinstance(expected_key, str) and hmac.compare_digest(
        x_api_key.encode("utf-8"), expected_key.encode("utf-8")
    )
    if not key_matches:
        raise HTTPException(status_code=403, detail="Sai mã bảo mật (API Key).")

    # An upload may arrive without a filename; treat it as an unsupported type
    # rather than failing on None.lower().
    filename = (file.filename or "").lower()

    # Reject unsupported types before pulling the whole upload into memory.
    if not filename.endswith(_SUPPORTED_EXTENSIONS):
        return {"status": "error", "message": "Định dạng file không hỗ trợ. Chỉ nhận: PDF, DOCX, JSON, TXT."}

    content_bytes = await file.read()

    try:
        text_content = _extract_text(filename, content_bytes)

        # Validate content: require at least 10 non-whitespace-trimmed chars.
        if len(text_content.strip()) < 10:
            return {"status": "error", "message": "File không có nội dung văn bản đọc được."}

        # Processing pipeline: split into chunks, then persist them.
        chunks = chunk_text(text_content)
        save_vector_store(chunks)

        return {
            "status": "success",
            "filename": file.filename,
            "chunks_created": len(chunks),
            "message": "Đã học xong tài liệu mới!"
        }
    except HTTPException:
        # Let deliberate HTTP errors from the pipeline through unchanged
        # instead of rewrapping them as a generic 500.
        raise
    except Exception as e:
        print(f"Ingest Error: {e}")
        raise HTTPException(status_code=500, detail=f"Lỗi xử lý file: {e}") from e