""" SPARKNET Document API Routes Endpoints for document upload, processing, and management. """ from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Depends, BackgroundTasks from fastapi.responses import StreamingResponse from typing import List, Optional from pathlib import Path from datetime import datetime import hashlib import shutil import uuid import io import sys # Add project root to path PROJECT_ROOT = Path(__file__).parent.parent.parent sys.path.insert(0, str(PROJECT_ROOT)) from api.schemas import ( DocumentUploadResponse, DocumentResponse, DocumentMetadata, DocumentDetailResponse, ChunksResponse, ChunkInfo, OCRRegionInfo, LayoutRegionInfo, DocumentStatus, IndexRequest, IndexResponse, BatchIndexRequest, BatchIndexResponse ) from loguru import logger router = APIRouter() # In-memory document store (replace with database in production) _documents = {} _processing_tasks = {} # Supported file types SUPPORTED_EXTENSIONS = { '.pdf': 'application/pdf', '.png': 'image/png', '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.tiff': 'image/tiff', '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', '.txt': 'text/plain', '.md': 'text/markdown', } UPLOAD_DIR = PROJECT_ROOT / "uploads" / "documents" UPLOAD_DIR.mkdir(parents=True, exist_ok=True) def generate_doc_id(filename: str, content: bytes) -> str: """Generate unique document ID from filename and content hash.""" content_hash = hashlib.md5(content[:4096]).hexdigest()[:8] timestamp = datetime.now().strftime("%Y%m%d%H%M%S") return f"doc_{timestamp}_{content_hash}" async def process_document_task(doc_id: str, file_path: Path, file_type: str): """Background task to process a document.""" try: logger.info(f"Processing document: {doc_id}") _documents[doc_id]["status"] = DocumentStatus.PROCESSING # Try to use actual document processor try: from src.document.pipeline.processor import DocumentProcessor, PipelineConfig config = PipelineConfig( ocr_enabled=True, layout_enabled=True, chunking_enabled=True, ) processor = DocumentProcessor(config) result = processor.process(str(file_path)) # Extract data from result chunks = [] for i, chunk in enumerate(getattr(result, 'chunks', [])): chunks.append({ "chunk_id": f"{doc_id}_chunk_{i}", "doc_id": doc_id, "text": getattr(chunk, 'text', str(chunk)), "chunk_type": getattr(chunk, 'chunk_type', 'text'), "page_num": getattr(chunk, 'page', 0), "confidence": getattr(chunk, 'confidence', 1.0), "bbox": getattr(chunk, 'bbox', None), }) _documents[doc_id].update({ "status": DocumentStatus.COMPLETED, "raw_text": getattr(result, 'raw_text', ''), "chunks": chunks, "page_count": getattr(result, 'page_count', 1), "ocr_regions": getattr(result, 'ocr_regions', []), "layout_regions": getattr(result, 'layout_regions', []), "processing_time": getattr(result, 'processing_time', 0.0), "updated_at": datetime.now(), }) logger.success(f"Document {doc_id} processed successfully: {len(chunks)} chunks") except Exception as proc_error: logger.warning(f"Full processor unavailable: {proc_error}, using fallback") # Fallback: simple text extraction raw_text = "" if file_type in ['.pdf']: try: import fitz doc = fitz.open(str(file_path)) for page in doc: raw_text += page.get_text() + "\n" page_count = len(doc) doc.close() except Exception as e: logger.error(f"PDF extraction failed: {e}") page_count = 1 elif file_type in ['.txt', 
async def process_document_task(doc_id: str, file_path: Path, file_type: str):
    """Background task to process a document."""
    try:
        logger.info(f"Processing document: {doc_id}")
        _documents[doc_id]["status"] = DocumentStatus.PROCESSING

        # Try to use the actual document processor
        try:
            from src.document.pipeline.processor import DocumentProcessor, PipelineConfig

            config = PipelineConfig(
                ocr_enabled=True,
                layout_enabled=True,
                chunking_enabled=True,
            )
            processor = DocumentProcessor(config)
            result = processor.process(str(file_path))

            # Extract data from the result
            chunks = []
            for i, chunk in enumerate(getattr(result, 'chunks', [])):
                chunks.append({
                    "chunk_id": f"{doc_id}_chunk_{i}",
                    "doc_id": doc_id,
                    "text": getattr(chunk, 'text', str(chunk)),
                    "chunk_type": getattr(chunk, 'chunk_type', 'text'),
                    "page_num": getattr(chunk, 'page', 0),
                    "confidence": getattr(chunk, 'confidence', 1.0),
                    "bbox": getattr(chunk, 'bbox', None),
                })

            _documents[doc_id].update({
                "status": DocumentStatus.COMPLETED,
                "raw_text": getattr(result, 'raw_text', ''),
                "chunks": chunks,
                "page_count": getattr(result, 'page_count', 1),
                "ocr_regions": getattr(result, 'ocr_regions', []),
                "layout_regions": getattr(result, 'layout_regions', []),
                "processing_time": getattr(result, 'processing_time', 0.0),
                "updated_at": datetime.now(),
            })
            logger.success(f"Document {doc_id} processed successfully: {len(chunks)} chunks")

        except Exception as proc_error:
            logger.warning(f"Full processor unavailable: {proc_error}, using fallback")

            # Fallback: simple text extraction per file type
            raw_text = ""
            page_count = 1  # default, so image types and failed extractions stay valid

            if file_type == '.pdf':
                try:
                    import fitz  # PyMuPDF
                    doc = fitz.open(str(file_path))
                    for page in doc:
                        raw_text += page.get_text() + "\n"
                    page_count = len(doc)
                    doc.close()
                except Exception as e:
                    logger.error(f"PDF extraction failed: {e}")
            elif file_type in ['.txt', '.md']:
                raw_text = file_path.read_text(errors='ignore')
            elif file_type == '.docx':
                try:
                    from docx import Document
                    doc = Document(str(file_path))
                    raw_text = "\n".join([p.text for p in doc.paragraphs])
                    page_count = max(1, len(raw_text) // 3000)
                except Exception as e:
                    logger.error(f"DOCX extraction failed: {e}")
            elif file_type == '.xlsx':
                try:
                    import pandas as pd
                    df_dict = pd.read_excel(str(file_path), sheet_name=None)
                    for sheet_name, df in df_dict.items():
                        raw_text += f"\n=== Sheet: {sheet_name} ===\n"
                        raw_text += df.to_string() + "\n"
                    page_count = len(df_dict)
                except Exception as e:
                    logger.error(f"XLSX extraction failed: {e}")
            elif file_type == '.pptx':
                try:
                    from pptx import Presentation
                    prs = Presentation(str(file_path))
                    for i, slide in enumerate(prs.slides):
                        raw_text += f"\n=== Slide {i + 1} ===\n"
                        for shape in slide.shapes:
                            if hasattr(shape, "text"):
                                raw_text += shape.text + "\n"
                    page_count = len(prs.slides)
                except Exception as e:
                    logger.error(f"PPTX extraction failed: {e}")

            # Create simple fixed-size chunks (1000 chars, 100-char overlap)
            chunks = []
            chunk_size = 1000
            text_chunks = [raw_text[i:i + chunk_size] for i in range(0, len(raw_text), chunk_size - 100)]
            for i, text in enumerate(text_chunks):
                if text.strip():
                    chunks.append({
                        "chunk_id": f"{doc_id}_chunk_{i}",
                        "doc_id": doc_id,
                        "text": text.strip(),
                        "chunk_type": "text",
                        "page_num": min(i * chunk_size // 3000 + 1, page_count),
                        "confidence": 1.0,
                        "bbox": None,
                    })

            _documents[doc_id].update({
                "status": DocumentStatus.COMPLETED,
                "raw_text": raw_text,
                "chunks": chunks,
                "page_count": page_count,
                "ocr_regions": [],
                "layout_regions": [],
                "processing_time": 0.0,
                "updated_at": datetime.now(),
            })
            logger.info(f"Document {doc_id} processed with fallback: {len(chunks)} chunks")

    except Exception as e:
        logger.error(f"Document processing failed for {doc_id}: {e}")
        _documents[doc_id]["status"] = DocumentStatus.ERROR
        _documents[doc_id]["error"] = str(e)


@router.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    auto_process: bool = Query(True, description="Automatically process after upload"),
    auto_index: bool = Query(False, description="Automatically index to RAG after processing"),
):
    """
    Upload a document for processing.

    Supported formats: PDF, PNG, JPG/JPEG, TIFF, DOCX, XLSX, PPTX, TXT, MD.
    """
    # Validate file extension
    file_ext = Path(file.filename).suffix.lower()
    if file_ext not in SUPPORTED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {file_ext}. Supported: {list(SUPPORTED_EXTENSIONS.keys())}"
        )

    # Read file content
    content = await file.read()
    if len(content) == 0:
        raise HTTPException(status_code=400, detail="Empty file uploaded")

    # Generate document ID
    doc_id = generate_doc_id(file.filename, content)

    # Save file
    file_path = UPLOAD_DIR / f"{doc_id}{file_ext}"
    with open(file_path, "wb") as f:
        f.write(content)

    # Create document record
    _documents[doc_id] = {
        "doc_id": doc_id,
        "filename": file.filename,
        "file_type": file_ext,
        "file_path": str(file_path),
        "status": DocumentStatus.PENDING,
        "raw_text": "",
        "chunks": [],
        "page_count": 0,
        "ocr_regions": [],
        "layout_regions": [],
        "indexed": False,
        "indexed_chunks": 0,
        "processing_time": None,
        "created_at": datetime.now(),
        "updated_at": None,
        "auto_index": auto_index,
    }

    # Start processing in background
    if auto_process:
        background_tasks.add_task(process_document_task, doc_id, file_path, file_ext)
        status = DocumentStatus.PROCESSING
        message = "Document uploaded and processing started"
    else:
        status = DocumentStatus.PENDING
        message = "Document uploaded successfully. Call /process to begin processing."

    _documents[doc_id]["status"] = status

    return DocumentUploadResponse(
        doc_id=doc_id,
        filename=file.filename,
        status=status,
        message=message,
        created_at=_documents[doc_id]["created_at"]
    )
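# Illustrative client call (not part of this module; the host, port, and
# "/documents" prefix are assumptions about how the router is mounted):
#
#     import httpx
#     with open("report.pdf", "rb") as fh:
#         resp = httpx.post(
#             "http://localhost:8000/documents/upload",
#             params={"auto_process": True},
#             files={"file": ("report.pdf", fh, "application/pdf")},
#         )
#     doc_id = resp.json()["doc_id"]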
def _doc_metadata(d: dict) -> DocumentMetadata:
    """Build the DocumentMetadata view shared by the list/get/detail endpoints."""
    return DocumentMetadata(
        doc_id=d["doc_id"],
        filename=d["filename"],
        file_type=d["file_type"],
        page_count=d.get("page_count", 0),
        chunk_count=len(d.get("chunks", [])),
        text_length=len(d.get("raw_text", "")),
        status=d["status"],
        indexed=d.get("indexed", False),
        indexed_chunks=d.get("indexed_chunks", 0),
        processing_time=d.get("processing_time"),
        created_at=d["created_at"],
        updated_at=d.get("updated_at"),
    )


@router.get("", response_model=List[DocumentMetadata])
async def list_documents(
    status: Optional[DocumentStatus] = Query(None, description="Filter by status"),
    indexed: Optional[bool] = Query(None, description="Filter by indexed status"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
):
    """List all documents with optional filtering."""
    docs = list(_documents.values())

    # Apply filters
    if status:
        docs = [d for d in docs if d["status"] == status]
    if indexed is not None:
        docs = [d for d in docs if d.get("indexed", False) == indexed]

    # Apply pagination
    docs = docs[offset:offset + limit]

    return [_doc_metadata(d) for d in docs]


@router.get("/{doc_id}", response_model=DocumentResponse)
async def get_document(
    doc_id: str,
    include_text: bool = Query(False, description="Include full raw text"),
):
    """Get a document by ID."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]
    return DocumentResponse(
        doc_id=d["doc_id"],
        filename=d["filename"],
        file_type=d["file_type"],
        status=d["status"],
        metadata=_doc_metadata(d),
        raw_text=d.get("raw_text") if include_text else None,
        preview=d.get("raw_text", "")[:500] if d.get("raw_text") else None,
    )
@router.get("/{doc_id}/detail", response_model=DocumentDetailResponse)
async def get_document_detail(doc_id: str):
    """Get detailed document information, including chunks and regions."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]
    return DocumentDetailResponse(
        doc_id=d["doc_id"],
        filename=d["filename"],
        status=d["status"],
        metadata=_doc_metadata(d),
        chunks=[ChunkInfo(**c) for c in d.get("chunks", [])],
        ocr_regions=[OCRRegionInfo(**r) for r in d.get("ocr_regions", []) if isinstance(r, dict)],
        layout_regions=[LayoutRegionInfo(**r) for r in d.get("layout_regions", []) if isinstance(r, dict)],
    )


@router.get("/{doc_id}/chunks", response_model=ChunksResponse)
async def get_document_chunks(
    doc_id: str,
    page: Optional[int] = Query(None, description="Filter by page number"),
    chunk_type: Optional[str] = Query(None, description="Filter by chunk type"),
):
    """Get all chunks for a document."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]
    chunks = d.get("chunks", [])

    # Apply filters
    if page is not None:
        chunks = [c for c in chunks if c.get("page_num") == page]
    if chunk_type:
        chunks = [c for c in chunks if c.get("chunk_type") == chunk_type]

    return ChunksResponse(
        doc_id=doc_id,
        total_chunks=len(chunks),
        chunks=[ChunkInfo(**c) for c in chunks],
    )


@router.post("/{doc_id}/process")
async def process_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    force: bool = Query(False, description="Force reprocessing"),
):
    """Trigger document processing."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]
    if d["status"] == DocumentStatus.PROCESSING:
        raise HTTPException(status_code=400, detail="Document is already being processed")
    if d["status"] == DocumentStatus.COMPLETED and not force:
        raise HTTPException(
            status_code=400,
            detail="Document already processed. Use force=true to reprocess."
        )

    file_path = Path(d["file_path"])
    if not file_path.exists():
        raise HTTPException(status_code=404, detail="Document file not found")

    background_tasks.add_task(process_document_task, doc_id, file_path, d["file_type"])
    _documents[doc_id]["status"] = DocumentStatus.PROCESSING

    return {"doc_id": doc_id, "status": "processing", "message": "Processing started"}


@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete a document."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]

    # Delete file
    file_path = Path(d["file_path"])
    if file_path.exists():
        file_path.unlink()

    # Remove from store
    del _documents[doc_id]

    return {"doc_id": doc_id, "status": "deleted", "message": "Document deleted successfully"}
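# Typical document lifecycle against the endpoints above and below
# (illustrative; the "/documents" prefix depends on how the router is mounted):
#
#     POST   /documents/upload             -> upload, returns doc_id
#     GET    /documents/{doc_id}           -> poll processing status
#     POST   /documents/{doc_id}/process   -> (re)trigger processing
#     POST   /documents/{doc_id}/index     -> push chunks to the vector store
#     DELETE /documents/{doc_id}           -> remove file and record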
@router.post("/{doc_id}/index", response_model=IndexResponse)
async def index_document(doc_id: str, force_reindex: bool = Query(False)):
    """Index a document to the RAG vector store."""
    if doc_id not in _documents:
        raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}")

    d = _documents[doc_id]
    # Accept INDEXED here as well, so force_reindex can rerun a finished index
    if d["status"] not in (DocumentStatus.COMPLETED, DocumentStatus.INDEXED):
        raise HTTPException(
            status_code=400,
            detail=f"Document not ready for indexing. Current status: {d['status']}"
        )
    if d.get("indexed") and not force_reindex:
        return IndexResponse(
            doc_id=doc_id,
            status="already_indexed",
            chunks_indexed=d.get("indexed_chunks", 0),
            message="Document already indexed. Use force_reindex=true to reindex."
        )

    try:
        # Try to use the actual indexer
        from src.rag.indexer import DocumentIndexer
        from src.rag.embeddings import get_embedding_model
        from src.rag.store import get_vector_store

        embeddings = get_embedding_model()
        store = get_vector_store()
        indexer = DocumentIndexer(embeddings, store)

        # Index chunks one by one, logging and skipping failures
        chunks_to_index = d.get("chunks", [])
        indexed_count = 0
        for chunk in chunks_to_index:
            try:
                indexer.index_chunk(
                    text=chunk["text"],
                    document_id=doc_id,
                    chunk_id=chunk["chunk_id"],
                    metadata={
                        "filename": d["filename"],
                        "page_num": chunk.get("page_num"),
                        "chunk_type": chunk.get("chunk_type", "text"),
                    }
                )
                indexed_count += 1
            except Exception as e:
                logger.warning(f"Failed to index chunk {chunk['chunk_id']}: {e}")

        _documents[doc_id]["indexed"] = True
        _documents[doc_id]["indexed_chunks"] = indexed_count
        _documents[doc_id]["status"] = DocumentStatus.INDEXED

        return IndexResponse(
            doc_id=doc_id,
            status="indexed",
            chunks_indexed=indexed_count,
            message=f"Successfully indexed {indexed_count} chunks"
        )
    except Exception as e:
        logger.error(f"Indexing failed for {doc_id}: {e}")
        raise HTTPException(status_code=500, detail=f"Indexing failed: {str(e)}")


@router.post("/batch-index", response_model=BatchIndexResponse)
async def batch_index_documents(request: BatchIndexRequest):
    """Batch index multiple documents."""
    results = []
    successful = 0
    failed = 0

    for doc_id in request.doc_ids:
        try:
            result = await index_document(doc_id, request.force_reindex)
            results.append(result)
            if result.status in ["indexed", "already_indexed"]:
                successful += 1
            else:
                failed += 1
        except HTTPException as e:
            results.append(IndexResponse(
                doc_id=doc_id,
                status="error",
                chunks_indexed=0,
                message=e.detail
            ))
            failed += 1

    return BatchIndexResponse(
        total_requested=len(request.doc_ids),
        successful=successful,
        failed=failed,
        results=results
    )


# Export document store for other modules
def get_document_store():
    """Get the in-memory document store."""
    return _documents
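# Example wiring (illustrative, not executed here): how this router might be
# mounted in the application entrypoint. The module path "api.routes.documents"
# and the "/documents" prefix are assumptions about the project layout.
#
#     from fastapi import FastAPI
#     from api.routes.documents import router as documents_router
#
#     app = FastAPI(title="SPARKNET API")
#     app.include_router(documents_router, prefix="/documents", tags=["documents"])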