[NOTICKET] [document]: add doctypes endpoint

#7
.gitignore CHANGED
@@ -36,4 +36,5 @@ playground_flush_cache.py
36
  playground_create_user.py
37
  API_CONTRACT.md
38
  context_engineering/
39
- sample_file/
 
 
36
  playground_create_user.py
37
  API_CONTRACT.md
38
  context_engineering/
39
+ sample_file/
40
+ test_tesseract.py
src/api/v1/document.py CHANGED
@@ -24,6 +24,27 @@ class DocumentResponse(BaseModel):
24
  created_at: str
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
28
  @log_execution(logger)
29
  async def list_documents(
 
24
  created_at: str
25
 
26
 
27
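+ # NOTE: Keep in sync with SUPPORTED_FILE_TYPES and MAX_FILE_SIZE_BYTES in src/pipeline/document_pipeline/document_pipeline.py (max_size is expressed in MB, mirroring the 10 MB upload limit enforced there)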
28
+ _DOC_TYPES = [
29
+ {"doc_type": "pdf", "max_size": 10, "status": "active", "message": None},
30
+ {"doc_type": "docx", "max_size": 10, "status": "active", "message": None},
31
+ {"doc_type": "txt", "max_size": 10, "status": "active", "message": None},
32
+ {"doc_type": "csv", "max_size": 10, "status": "active", "message": None},
33
+ {"doc_type": "xlsx", "max_size": 10, "status": "active", "message": None},
34
+ ]
35
+
36
+
37
+ @router.get(
38
+ "/documents/doctypes",
39
+ summary="List supported document types",
40
+ response_description="All document types supported by DataEyond with their size limits and status.",
41
+ )
42
+ @log_execution(logger)
43
+ async def get_document_types():
44
+ """Return every document type DataEyond can process, with max file size and active/inactive status."""
45
+ return {"status": "success", "data": _DOC_TYPES}
46
+
47
+
48
  @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
49
  @log_execution(logger)
50
  async def list_documents(
src/pipeline/document_pipeline/document_pipeline.py CHANGED
@@ -10,7 +10,9 @@ from src.storage.az_blob.az_blob import blob_storage
10
 
11
  logger = get_logger("document_pipeline")
12
 
 
13
  SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt", "csv", "xlsx"]
 
14
 
15
 
16
  class DocumentPipeline:
@@ -21,6 +23,12 @@ class DocumentPipeline:
21
  content = await file.read()
22
  file_type = file.filename.split(".")[-1].lower() if "." in file.filename else "txt"
23
 
 
 
 
 
 
 
24
  if file_type not in SUPPORTED_FILE_TYPES:
25
  raise HTTPException(
26
  status_code=400,
 
10
 
11
  logger = get_logger("document_pipeline")
12
 
13
+ # NOTE: Keep SUPPORTED_FILE_TYPES and MAX_FILE_SIZE_BYTES in sync with _DOC_TYPES (doc_type / max_size, in MB) in src/api/v1/document.py
14
  SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt", "csv", "xlsx"]
15
+ MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024 # 10 MB
16
 
17
 
18
  class DocumentPipeline:
 
23
  content = await file.read()
24
  file_type = file.filename.split(".")[-1].lower() if "." in file.filename else "txt"
25
 
26
+ if len(content) > MAX_FILE_SIZE_BYTES:
27
+ raise HTTPException(
28
+ status_code=400,
29
+ detail="File size exceeds maximum allowed size of 10 MB.",
30
+ )
31
+
32
  if file_type not in SUPPORTED_FILE_TYPES:
33
  raise HTTPException(
34
  status_code=400,