| | """ |
| | SPARKNET Document API Routes |
| | Endpoints for document upload, processing, and management. |
| | """ |
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Depends, BackgroundTasks
from fastapi.responses import StreamingResponse
from typing import List, Optional
from pathlib import Path
from datetime import datetime
import hashlib
import shutil
import uuid
import io
import sys

# Make the project root importable so the `api.*` and `src.*` packages used
# below resolve when this module is loaded.
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
from api.schemas import (
    DocumentUploadResponse, DocumentResponse, DocumentMetadata,
    DocumentDetailResponse, ChunksResponse, ChunkInfo,
    OCRRegionInfo, LayoutRegionInfo, DocumentStatus,
    IndexRequest, IndexResponse, BatchIndexRequest, BatchIndexResponse
)
from loguru import logger

router = APIRouter()

# In-memory stores for document metadata and processing state.
# They are process-local and not persisted: contents are lost on restart and
# are not shared across worker processes.
_documents = {}
_processing_tasks = {}
SUPPORTED_EXTENSIONS = {
    '.pdf': 'application/pdf',
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.tiff': 'image/tiff',
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    '.txt': 'text/plain',
    '.md': 'text/markdown',
}

UPLOAD_DIR = PROJECT_ROOT / "uploads" / "documents"
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
def generate_doc_id(filename: str, content: bytes) -> str:
    """Generate a unique document ID from a timestamp and a content hash."""
    # Only the first 4 KB are hashed, for speed; the timestamp helps keep IDs
    # for repeated uploads of the same file distinct.
    content_hash = hashlib.md5(content[:4096]).hexdigest()[:8]
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    return f"doc_{timestamp}_{content_hash}"
async def process_document_task(doc_id: str, file_path: Path, file_type: str):
    """Background task to process a document."""
    try:
        logger.info(f"Processing document: {doc_id}")
        _documents[doc_id]["status"] = DocumentStatus.PROCESSING
        # Preferred path: the full document pipeline (OCR, layout analysis, chunking).
        try:
            from src.document.pipeline.processor import DocumentProcessor, PipelineConfig

            config = PipelineConfig(
                ocr_enabled=True,
                layout_enabled=True,
                chunking_enabled=True,
            )
            processor = DocumentProcessor(config)
            result = processor.process(str(file_path))
            chunks = []
            for i, chunk in enumerate(getattr(result, 'chunks', [])):
                chunks.append({
                    "chunk_id": f"{doc_id}_chunk_{i}",
                    "doc_id": doc_id,
                    "text": getattr(chunk, 'text', str(chunk)),
                    "chunk_type": getattr(chunk, 'chunk_type', 'text'),
                    "page_num": getattr(chunk, 'page', 0),
                    "confidence": getattr(chunk, 'confidence', 1.0),
                    "bbox": getattr(chunk, 'bbox', None),
                })

            _documents[doc_id].update({
                "status": DocumentStatus.COMPLETED,
                "raw_text": getattr(result, 'raw_text', ''),
                "chunks": chunks,
                "page_count": getattr(result, 'page_count', 1),
                "ocr_regions": getattr(result, 'ocr_regions', []),
                "layout_regions": getattr(result, 'layout_regions', []),
                "processing_time": getattr(result, 'processing_time', 0.0),
                "updated_at": datetime.now(),
            })

            logger.success(f"Document {doc_id} processed successfully: {len(chunks)} chunks")
        except Exception as proc_error:
            logger.warning(f"Full processor unavailable: {proc_error}, using fallback")

            # Fallback path: lightweight text extraction per file type.
            # page_count is initialised up front so it is always defined, even
            # for file types (e.g. images) with no fallback extractor below.
            raw_text = ""
            page_count = 1
            if file_type == '.pdf':
                try:
                    import fitz
                    doc = fitz.open(str(file_path))
                    for page in doc:
                        raw_text += page.get_text() + "\n"
                    page_count = len(doc)
                    doc.close()
                except Exception as e:
                    logger.error(f"PDF extraction failed: {e}")
                    page_count = 1

            elif file_type in ['.txt', '.md']:
                raw_text = file_path.read_text(errors='ignore')
                page_count = 1
            elif file_type == '.docx':
                try:
                    from docx import Document
                    doc = Document(str(file_path))
                    raw_text = "\n".join([p.text for p in doc.paragraphs])
                    # Rough page estimate: about 3000 characters per page.
                    page_count = max(1, len(raw_text) // 3000)
                except Exception as e:
                    logger.error(f"DOCX extraction failed: {e}")
                    page_count = 1
            elif file_type == '.xlsx':
                try:
                    import pandas as pd
                    df_dict = pd.read_excel(str(file_path), sheet_name=None)
                    for sheet_name, df in df_dict.items():
                        raw_text += f"\n=== Sheet: {sheet_name} ===\n"
                        raw_text += df.to_string() + "\n"
                    page_count = len(df_dict)
                except Exception as e:
                    logger.error(f"XLSX extraction failed: {e}")
                    page_count = 1
            elif file_type == '.pptx':
                try:
                    from pptx import Presentation
                    prs = Presentation(str(file_path))
                    for i, slide in enumerate(prs.slides):
                        raw_text += f"\n=== Slide {i+1} ===\n"
                        for shape in slide.shapes:
                            if hasattr(shape, "text"):
                                raw_text += shape.text + "\n"
                    page_count = len(prs.slides)
                except Exception as e:
                    logger.error(f"PPTX extraction failed: {e}")
                    page_count = 1

            else:
                # Image formats (and anything else) have no text-only fallback;
                # they require the full OCR pipeline above.
                logger.warning(f"No fallback text extraction for {file_type}")
            # Naive fixed-size chunking: 1000-character windows with a
            # 100-character overlap between consecutive chunks.
            chunks = []
            chunk_size = 1000
            text_chunks = [raw_text[i:i + chunk_size] for i in range(0, len(raw_text), chunk_size - 100)]
            for i, text in enumerate(text_chunks):
                if text.strip():
                    chunks.append({
                        "chunk_id": f"{doc_id}_chunk_{i}",
                        "doc_id": doc_id,
                        "text": text.strip(),
                        "chunk_type": "text",
                        "page_num": min(i * chunk_size // 3000 + 1, page_count),
                        "confidence": 1.0,
                        "bbox": None,
                    })
            _documents[doc_id].update({
                "status": DocumentStatus.COMPLETED,
                "raw_text": raw_text,
                "chunks": chunks,
                "page_count": page_count,
                "ocr_regions": [],
                "layout_regions": [],
                "processing_time": 0.0,
                "updated_at": datetime.now(),
            })

            logger.info(f"Document {doc_id} processed with fallback: {len(chunks)} chunks")

    except Exception as e:
        logger.error(f"Document processing failed for {doc_id}: {e}")
        _documents[doc_id]["status"] = DocumentStatus.ERROR
        _documents[doc_id]["error"] = str(e)
| | @router.post("/upload", response_model=DocumentUploadResponse) |
| | async def upload_document( |
| | background_tasks: BackgroundTasks, |
| | file: UploadFile = File(...), |
| | auto_process: bool = Query(True, description="Automatically process after upload"), |
| | auto_index: bool = Query(False, description="Automatically index to RAG after processing"), |
| | ): |
| | """ |
| | Upload a document for processing. |
| | |
| | Supported formats: PDF, PNG, JPG, DOCX, XLSX, PPTX, TXT, MD |
| | """ |
| | |
    if not file.filename:
        raise HTTPException(status_code=400, detail="Uploaded file has no filename")

    file_ext = Path(file.filename).suffix.lower()
    if file_ext not in SUPPORTED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {file_ext}. Supported: {list(SUPPORTED_EXTENSIONS.keys())}"
        )
    content = await file.read()
    if len(content) == 0:
        raise HTTPException(status_code=400, detail="Empty file uploaded")

    doc_id = generate_doc_id(file.filename, content)

    file_path = UPLOAD_DIR / f"{doc_id}{file_ext}"
    with open(file_path, "wb") as f:
        f.write(content)

    _documents[doc_id] = {
        "doc_id": doc_id,
        "filename": file.filename,
        "file_type": file_ext,
        "file_path": str(file_path),
        "status": DocumentStatus.PENDING,
        "raw_text": "",
        "chunks": [],
        "page_count": 0,
        "ocr_regions": [],
        "layout_regions": [],
        "indexed": False,
        "indexed_chunks": 0,
        "processing_time": None,
        "created_at": datetime.now(),
        "updated_at": None,
        "auto_index": auto_index,
    }

    if auto_process:
        background_tasks.add_task(process_document_task, doc_id, file_path, file_ext)
        status = DocumentStatus.PROCESSING
        message = "Document uploaded and processing started"
    else:
        status = DocumentStatus.PENDING
        message = "Document uploaded successfully. Call /process to begin processing."

    _documents[doc_id]["status"] = status
    return DocumentUploadResponse(
        doc_id=doc_id,
        filename=file.filename,
        status=status,
        message=message,
        created_at=_documents[doc_id]["created_at"]
    )
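# Example request (illustrative; host, port, and the router prefix depend on
# how the application mounts this router):
#
#     curl -X POST "http://localhost:8000/documents/upload?auto_process=true" \
#          -F "file=@report.pdf"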
| | @router.get("", response_model=List[DocumentMetadata]) |
| | async def list_documents( |
| | status: Optional[DocumentStatus] = Query(None, description="Filter by status"), |
| | indexed: Optional[bool] = Query(None, description="Filter by indexed status"), |
| | limit: int = Query(50, ge=1, le=200), |
| | offset: int = Query(0, ge=0), |
| | ): |
| | """List all documents with optional filtering.""" |
| | docs = list(_documents.values()) |
| |
|
| | |
| | if status: |
| | docs = [d for d in docs if d["status"] == status] |
| | if indexed is not None: |
| | docs = [d for d in docs if d.get("indexed", False) == indexed] |
| |
|
| | |
| | docs = docs[offset:offset + limit] |
| |
|
| | return [ |
| | DocumentMetadata( |
| | doc_id=d["doc_id"], |
| | filename=d["filename"], |
| | file_type=d["file_type"], |
| | page_count=d.get("page_count", 0), |
| | chunk_count=len(d.get("chunks", [])), |
| | text_length=len(d.get("raw_text", "")), |
| | status=d["status"], |
| | indexed=d.get("indexed", False), |
| | indexed_chunks=d.get("indexed_chunks", 0), |
| | processing_time=d.get("processing_time"), |
| | created_at=d["created_at"], |
| | updated_at=d.get("updated_at"), |
| | ) |
| | for d in docs |
| | ] |
| |
|
| |
|
| | @router.get("/{doc_id}", response_model=DocumentResponse) |
| | async def get_document( |
| | doc_id: str, |
| | include_text: bool = Query(False, description="Include full raw text"), |
| | ): |
| | """Get document by ID.""" |
| | if doc_id not in _documents: |
| | raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}") |
| |
|
| | d = _documents[doc_id] |
| |
|
| | return DocumentResponse( |
| | doc_id=d["doc_id"], |
| | filename=d["filename"], |
| | file_type=d["file_type"], |
| | status=d["status"], |
| | metadata=DocumentMetadata( |
| | doc_id=d["doc_id"], |
| | filename=d["filename"], |
| | file_type=d["file_type"], |
| | page_count=d.get("page_count", 0), |
| | chunk_count=len(d.get("chunks", [])), |
| | text_length=len(d.get("raw_text", "")), |
| | status=d["status"], |
| | indexed=d.get("indexed", False), |
| | indexed_chunks=d.get("indexed_chunks", 0), |
| | processing_time=d.get("processing_time"), |
| | created_at=d["created_at"], |
| | updated_at=d.get("updated_at"), |
| | ), |
| | raw_text=d.get("raw_text") if include_text else None, |
| | preview=d.get("raw_text", "")[:500] if d.get("raw_text") else None, |
| | ) |
| |
|
| |
|
| | @router.get("/{doc_id}/detail", response_model=DocumentDetailResponse) |
| | async def get_document_detail(doc_id: str): |
| | """Get detailed document information including chunks and regions.""" |
| | if doc_id not in _documents: |
| | raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}") |
| |
|
| | d = _documents[doc_id] |
| |
|
| | return DocumentDetailResponse( |
| | doc_id=d["doc_id"], |
| | filename=d["filename"], |
| | status=d["status"], |
| | metadata=DocumentMetadata( |
| | doc_id=d["doc_id"], |
| | filename=d["filename"], |
| | file_type=d["file_type"], |
| | page_count=d.get("page_count", 0), |
| | chunk_count=len(d.get("chunks", [])), |
| | text_length=len(d.get("raw_text", "")), |
| | status=d["status"], |
| | indexed=d.get("indexed", False), |
| | indexed_chunks=d.get("indexed_chunks", 0), |
| | processing_time=d.get("processing_time"), |
| | created_at=d["created_at"], |
| | updated_at=d.get("updated_at"), |
| | ), |
| | chunks=[ChunkInfo(**c) for c in d.get("chunks", [])], |
| | ocr_regions=[OCRRegionInfo(**r) for r in d.get("ocr_regions", []) if isinstance(r, dict)], |
| | layout_regions=[LayoutRegionInfo(**r) for r in d.get("layout_regions", []) if isinstance(r, dict)], |
| | ) |
| |
|
| |
|
| | @router.get("/{doc_id}/chunks", response_model=ChunksResponse) |
| | async def get_document_chunks( |
| | doc_id: str, |
| | page: Optional[int] = Query(None, description="Filter by page number"), |
| | chunk_type: Optional[str] = Query(None, description="Filter by chunk type"), |
| | ): |
| | """Get all chunks for a document.""" |
| | if doc_id not in _documents: |
| | raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}") |
| |
|
| | d = _documents[doc_id] |
| | chunks = d.get("chunks", []) |
| |
|
| | |
| | if page is not None: |
| | chunks = [c for c in chunks if c.get("page_num") == page] |
| | if chunk_type: |
| | chunks = [c for c in chunks if c.get("chunk_type") == chunk_type] |
| |
|
| | return ChunksResponse( |
| | doc_id=doc_id, |
| | total_chunks=len(chunks), |
| | chunks=[ChunkInfo(**c) for c in chunks], |
| | ) |
| |
|
| |
|
| | @router.post("/{doc_id}/process") |
| | async def process_document( |
| | doc_id: str, |
| | background_tasks: BackgroundTasks, |
| | force: bool = Query(False, description="Force reprocessing"), |
| | ): |
| | """Trigger document processing.""" |
| | if doc_id not in _documents: |
| | raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}") |
| |
|
| | d = _documents[doc_id] |
| |
|
| | if d["status"] == DocumentStatus.PROCESSING: |
| | raise HTTPException(status_code=400, detail="Document is already being processed") |
| |
|
| | if d["status"] == DocumentStatus.COMPLETED and not force: |
| | raise HTTPException( |
| | status_code=400, |
| | detail="Document already processed. Use force=true to reprocess." |
| | ) |
| |
|
| | file_path = Path(d["file_path"]) |
| | if not file_path.exists(): |
| | raise HTTPException(status_code=404, detail="Document file not found") |
| |
|
| | background_tasks.add_task(process_document_task, doc_id, file_path, d["file_type"]) |
| | _documents[doc_id]["status"] = DocumentStatus.PROCESSING |
| |
|
| | return {"doc_id": doc_id, "status": "processing", "message": "Processing started"} |
| |
|
| |
|
| | @router.delete("/{doc_id}") |
| | async def delete_document(doc_id: str): |
| | """Delete a document.""" |
| | if doc_id not in _documents: |
| | raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}") |
| |
|
| | d = _documents[doc_id] |
| |
|
| | |
| | file_path = Path(d["file_path"]) |
| | if file_path.exists(): |
| | file_path.unlink() |
| |
|
| | |
| | del _documents[doc_id] |
| |
|
| | return {"doc_id": doc_id, "status": "deleted", "message": "Document deleted successfully"} |
| |
|
| |
|
| | @router.post("/{doc_id}/index", response_model=IndexResponse) |
| | async def index_document(doc_id: str, force_reindex: bool = Query(False)): |
| | """Index a document to the RAG vector store.""" |
| | if doc_id not in _documents: |
| | raise HTTPException(status_code=404, detail=f"Document not found: {doc_id}") |
| |
|
| | d = _documents[doc_id] |
| |
|
| | if d["status"] != DocumentStatus.COMPLETED: |
| | raise HTTPException( |
| | status_code=400, |
| | detail=f"Document not ready for indexing. Current status: {d['status']}" |
| | ) |
| |
|
| | if d.get("indexed") and not force_reindex: |
| | return IndexResponse( |
| | doc_id=doc_id, |
| | status="already_indexed", |
| | chunks_indexed=d.get("indexed_chunks", 0), |
| | message="Document already indexed. Use force_reindex=true to reindex." |
| | ) |
| |
|
    try:
        from src.rag.indexer import DocumentIndexer
        from src.rag.embeddings import get_embedding_model
        from src.rag.store import get_vector_store

        embeddings = get_embedding_model()
        store = get_vector_store()
        indexer = DocumentIndexer(embeddings, store)

        chunks_to_index = d.get("chunks", [])
        indexed_count = 0

        for chunk in chunks_to_index:
            try:
                indexer.index_chunk(
                    text=chunk["text"],
                    document_id=doc_id,
                    chunk_id=chunk["chunk_id"],
                    metadata={
                        "filename": d["filename"],
                        "page_num": chunk.get("page_num"),
                        "chunk_type": chunk.get("chunk_type", "text"),
                    }
                )
                indexed_count += 1
            except Exception as e:
                logger.warning(f"Failed to index chunk {chunk['chunk_id']}: {e}")
| | _documents[doc_id]["indexed"] = True |
| | _documents[doc_id]["indexed_chunks"] = indexed_count |
| | _documents[doc_id]["status"] = DocumentStatus.INDEXED |
| |
|
| | return IndexResponse( |
| | doc_id=doc_id, |
| | status="indexed", |
| | chunks_indexed=indexed_count, |
| | message=f"Successfully indexed {indexed_count} chunks" |
| | ) |
| |
|
| | except Exception as e: |
| | logger.error(f"Indexing failed for {doc_id}: {e}") |
| | raise HTTPException(status_code=500, detail=f"Indexing failed: {str(e)}") |
| |
|
| |
|
| | @router.post("/batch-index", response_model=BatchIndexResponse) |
| | async def batch_index_documents(request: BatchIndexRequest): |
| | """Batch index multiple documents.""" |
| | results = [] |
| | successful = 0 |
| | failed = 0 |
| |
|
| | for doc_id in request.doc_ids: |
| | try: |
| | result = await index_document(doc_id, request.force_reindex) |
| | results.append(result) |
| | if result.status in ["indexed", "already_indexed"]: |
| | successful += 1 |
| | else: |
| | failed += 1 |
        except HTTPException as e:
            results.append(IndexResponse(
                doc_id=doc_id,
                status="error",
                chunks_indexed=0,
                message=e.detail
            ))
            failed += 1

    return BatchIndexResponse(
        total_requested=len(request.doc_ids),
        successful=successful,
        failed=failed,
        results=results
    )
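# Example request body for /batch-index (illustrative; the field names follow
# how BatchIndexRequest is used above):
#
#     {
#         "doc_ids": ["doc_20240101120000_a1b2c3d4"],
#         "force_reindex": false
#     }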
def get_document_store():
    """Get the in-memory document store."""
    return _documents