Paramjit Singh committed on
Commit
8d1a352
·
unverified ·
2 Parent(s): 0232eea 6f5def0

Merge pull request #137 from saurabhhhcodes/perf/background-document-processing-116

Browse files
README.md CHANGED
@@ -437,8 +437,9 @@ docker compose up --build
437
  | `POST` | `/api/v1/auth/register` | ❌ | Create a new user account |
438
  | `POST` | `/api/v1/auth/login` | ❌ | Login and receive JWT token |
439
  | `GET` | `/api/v1/auth/me` | ✅ | Get current user profile |
440
- | `POST` | `/api/v1/documents/upload` | ✅ | Upload PDF/DOCX and trigger indexing |
441
  | `GET` | `/api/v1/documents` | ✅ | List all documents for current user |
 
442
  | `DELETE` | `/api/v1/documents/{id}` | ✅ | Delete a document and its vector data |
443
  | `POST` | `/api/v1/chat/ask/stream` | ✅ | Ask a question (SSE streaming response) |
444
  | `GET` | `/api/v1/chat/history/{doc_id}` | ✅ | Get chat history for a document |
 
437
  | `POST` | `/api/v1/auth/register` | ❌ | Create a new user account |
438
  | `POST` | `/api/v1/auth/login` | ❌ | Login and receive JWT token |
439
  | `GET` | `/api/v1/auth/me` | ✅ | Get current user profile |
440
+ | `POST` | `/api/v1/documents/upload` | ✅ | Upload PDF/DOCX and enqueue background indexing (`202 Accepted`) |
441
  | `GET` | `/api/v1/documents` | ✅ | List all documents for current user |
442
+ | `GET` | `/api/v1/documents/{id}/status` | ✅ | Poll background document processing status |
443
  | `DELETE` | `/api/v1/documents/{id}` | ✅ | Delete a document and its vector data |
444
  | `POST` | `/api/v1/chat/ask/stream` | ✅ | Ask a question (SSE streaming response) |
445
  | `GET` | `/api/v1/chat/history/{doc_id}` | ✅ | Get chat history for a document |
backend/app/routes/documents.py CHANGED
@@ -16,7 +16,7 @@ from sqlalchemy.orm import Session
16
 
17
  from app.database import get_db
18
  from app.models import User, Document
19
- from app.schemas import DocumentResponse, DocumentListResponse
20
  from app.auth import get_current_user
21
  from app.config import get_settings
22
  from app.rag.chunker import chunk_document, get_page_count
@@ -191,7 +191,7 @@ def _ingest_document(document_id: str, filepath: str, original_name: str, user_i
191
  db.close()
192
 
193
 
194
- @router.post("/upload", response_model=DocumentResponse, status_code=status.HTTP_201_CREATED)
195
  async def upload_document(
196
  background_tasks: BackgroundTasks,
197
  file: UploadFile = File(...),
@@ -199,11 +199,13 @@ async def upload_document(
199
  db: Session = Depends(get_db),
200
  ):
201
  """
202
- Upload a document for RAG processing.
203
 
204
  Validates the uploaded file (extension, size, MIME type, integrity),
205
  saves it to the user's directory, creates a database record with status
206
- 'pending', and schedules a background task for chunking and embedding.
 
 
207
 
208
  Args:
209
  background_tasks: FastAPI BackgroundTasks instance to run the ingestion process asynchronously.
@@ -272,6 +274,30 @@ async def upload_document(
272
  return DocumentResponse.model_validate(document)
273
 
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  @router.get("/", response_model=DocumentListResponse)
276
  def list_documents(
277
  page: int = Query(1, ge=1),
 
16
 
17
  from app.database import get_db
18
  from app.models import User, Document
19
+ from app.schemas import DocumentResponse, DocumentListResponse, DocumentStatusResponse
20
  from app.auth import get_current_user
21
  from app.config import get_settings
22
  from app.rag.chunker import chunk_document, get_page_count
 
191
  db.close()
192
 
193
 
194
+ @router.post("/upload", response_model=DocumentResponse, status_code=status.HTTP_202_ACCEPTED)
195
  async def upload_document(
196
  background_tasks: BackgroundTasks,
197
  file: UploadFile = File(...),
 
199
  db: Session = Depends(get_db),
200
  ):
201
  """
202
+ Upload a document and enqueue RAG processing.
203
 
204
  Validates the uploaded file (extension, size, MIME type, integrity),
205
  saves it to the user's directory, creates a database record with status
206
+ 'pending', schedules a background task for chunking and embedding, and
207
+ returns 202 Accepted immediately so large documents do not block the API
208
+ request while embeddings are generated.
209
 
210
  Args:
211
  background_tasks: FastAPI BackgroundTasks instance to run the ingestion process asynchronously.
 
274
  return DocumentResponse.model_validate(document)
275
 
276
 
277
+ @router.get("/{document_id}/status", response_model=DocumentStatusResponse)
278
+ def get_document_status(
279
+ document_id: str,
280
+ user: User = Depends(get_current_user),
281
+ db: Session = Depends(get_db),
282
+ ):
283
+ """
284
+ Poll processing status for a single uploaded document.
285
+
286
+ This endpoint lets clients refresh the upload lifecycle without fetching
287
+ the entire document list. The returned status is one of the existing
288
+ document states: pending, processing, ready, or failed.
289
+ """
290
+ doc = db.query(Document).filter(
291
+ Document.id == document_id,
292
+ Document.user_id == user.id,
293
+ ).first()
294
+
295
+ if not doc:
296
+ raise HTTPException(status_code=404, detail="Document not found")
297
+
298
+ return DocumentStatusResponse.model_validate(doc)
299
+
300
+
301
  @router.get("/", response_model=DocumentListResponse)
302
  def list_documents(
303
  page: int = Query(1, ge=1),
backend/app/schemas.py CHANGED
@@ -75,6 +75,17 @@ class DocumentResponse(BaseModel):
75
  from_attributes = True
76
 
77
 
 
 
 
 
 
 
 
 
 
 
 
78
  class DocumentListResponse(BaseModel):
79
  items: List[DocumentResponse]
80
  total: int
 
75
  from_attributes = True
76
 
77
 
78
+ class DocumentStatusResponse(BaseModel):
79
+ id: str
80
+ status: str
81
+ page_count: int
82
+ chunk_count: int
83
+ error_message: Optional[str] = None
84
+
85
+ class Config:
86
+ from_attributes = True
87
+
88
+
89
  class DocumentListResponse(BaseModel):
90
  items: List[DocumentResponse]
91
  total: int