Spaces:

DataEyond
/

Agentic-Service-Data-Eyond

Sleeping

[NOTICKET] [document]: add doctypes endpoint

by sofhiaazzhr - opened 21 days ago

←

Files changed (3) hide show

.gitignore CHANGED Viewed

@@ -36,4 +36,5 @@ playground_flush_cache.py
 playground_create_user.py
 API_CONTRACT.md
 context_engineering/
-sample_file/

 playground_create_user.py
 API_CONTRACT.md
 context_engineering/
+sample_file/
+test_tesseract.py

src/api/v1/document.py CHANGED Viewed

@@ -24,6 +24,27 @@ class DocumentResponse(BaseModel):
     created_at: str
 @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
 @log_execution(logger)
 async def list_documents(

     created_at: str
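+# NOTE: Keep in sync with SUPPORTED_FILE_TYPES and MAX_FILE_SIZE_BYTES in src/pipeline/document_pipeline/document_pipeline.py ("max_size" is presumably in MB, matching the 10 MB limit enforced there - confirm).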
+_DOC_TYPES = [
+    {"doc_type": "pdf", "max_size": 10, "status": "active", "message": None},
+    {"doc_type": "docx", "max_size": 10, "status": "active", "message": None},
+    {"doc_type": "txt", "max_size": 10, "status": "active", "message": None},
+    {"doc_type": "csv", "max_size": 10, "status": "active", "message": None},
+    {"doc_type": "xlsx", "max_size": 10, "status": "active", "message": None},
+]
+@router.get(
+    "/documents/doctypes",
+    summary="List supported document types",
+    response_description="All document types supported by DataEyond with their size limits and status.",
+)
+@log_execution(logger)
+async def get_document_types():
+    """Return every document type DataEyond can process, with max file size and active/inactive status."""
+    return {"status": "success", "data": _DOC_TYPES}
 @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
 @log_execution(logger)
 async def list_documents(

src/pipeline/document_pipeline/document_pipeline.py CHANGED Viewed

@@ -10,7 +10,9 @@ from src.storage.az_blob.az_blob import blob_storage
 logger = get_logger("document_pipeline")
 SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt", "csv", "xlsx"]
 class DocumentPipeline:
@@ -21,6 +23,12 @@ class DocumentPipeline:
         content = await file.read()
         file_type = file.filename.split(".")[-1].lower() if "." in file.filename else "txt"
         if file_type not in SUPPORTED_FILE_TYPES:
             raise HTTPException(
                 status_code=400,

 logger = get_logger("document_pipeline")
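+# NOTE: Keep SUPPORTED_FILE_TYPES and MAX_FILE_SIZE_BYTES in sync with _DOC_TYPES in src/api/v1/document.py (its "doc_type" and "max_size" entries are what the doctypes endpoint returns to clients).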
 SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt", "csv", "xlsx"]
+MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024  # 10 MB
 class DocumentPipeline:
         content = await file.read()
         file_type = file.filename.split(".")[-1].lower() if "." in file.filename else "txt"
+        if len(content) > MAX_FILE_SIZE_BYTES:
+            raise HTTPException(
+                status_code=400,
+                detail="File size exceeds maximum allowed size of 10 MB.",
+            )
         if file_type not in SUPPORTED_FILE_TYPES:
             raise HTTPException(
                 status_code=400,