SanskarModi's picture
added limit checks for groq freee tier api
965240e
"""Routes for uploading and processing documents."""
import uuid
from pathlib import Path
from typing import Dict, List
from app.config import settings
from app.ingestion.pipeline import ingest_pdf
from app.models.ingestion import Chunk
from app.retrieval.chunk_registry import get_chunks
from fastapi import APIRouter, File, HTTPException, UploadFile
router = APIRouter()
DOC_STORAGE = Path(settings.docs_path)
DOC_STORAGE.mkdir(parents=True, exist_ok=True)
@router.post("/upload", response_model=Dict[str, List[Chunk]])
def upload_documents(
files: List[UploadFile] = File(...),
) -> Dict[str, List[Chunk]]:
"""Upload one or more PDF documents.
Args:
files: Uploaded PDF files.
Returns:
A mapping of doc_id to raw extracted page segments.
"""
results: Dict[str, List[Chunk]] = {}
for file in files:
if not file.filename.lower().endswith(".pdf"):
msg = "Only PDF files are supported."
raise HTTPException(status_code=400, detail=msg)
# Generate a unique document ID
doc_id = str(uuid.uuid4())
save_path = DOC_STORAGE / f"{doc_id}.pdf"
with save_path.open("wb") as f:
f.write(file.file.read())
chunks = ingest_pdf(save_path, doc_id)
results[doc_id] = chunks
return results
@router.delete("/remove/{doc_id}")
def remove_document(doc_id: str) -> dict:
"""Remove a document and its chunks from the system.
Args:
doc_id: Document ID to remove
Returns:
Status message
"""
from app.ingestion.indexing import COLLECTION_NAME, get_qdrant_client
from app.retrieval.chunk_registry import _CHUNKS
# Remove chunks from registry
chunks_to_remove = [cid for cid, chunk in _CHUNKS.items() if chunk.doc_id == doc_id]
for chunk_id in chunks_to_remove:
_CHUNKS.pop(chunk_id, None)
# Remove from Qdrant
if chunks_to_remove:
try:
client = get_qdrant_client()
if client.collection_exists(COLLECTION_NAME):
client.delete(
collection_name=COLLECTION_NAME,
points_selector=chunks_to_remove,
)
except Exception as e:
print(f"Error removing from Qdrant: {e}")
# Remove PDF file
pdf_path = DOC_STORAGE / f"{doc_id}.pdf"
if pdf_path.exists():
pdf_path.unlink()
return {
"status": "success",
"message": f"Removed document {doc_id}",
"chunks_removed": len(chunks_to_remove),
}
@router.get("/list")
def list_documents() -> dict:
"""List all currently loaded documents.
Returns:
Dictionary with document information
"""
chunks = get_chunks()
doc_ids = list(set(chunk.doc_id for chunk in chunks))
return {
"total_documents": len(doc_ids),
"total_chunks": len(chunks),
"doc_ids": doc_ids,
}
@router.get("/token-counts")
def get_document_token_counts() -> dict:
"""Get approximate token counts for all documents.
Returns:
Dictionary with doc_id -> token_count mapping and max limit
"""
from app.retrieval.chunk_registry import get_chunks
chunks = get_chunks()
doc_token_counts = {}
for chunk in chunks:
# Rough estimate: 1 token ≈ 4 characters
tokens = len(chunk.text) // 4
doc_token_counts[chunk.doc_id] = doc_token_counts.get(chunk.doc_id, 0) + tokens
return {
"doc_token_counts": doc_token_counts,
"max_summary_tokens": settings.max_summary_tokens,
}