Jay-10020 committed on
Commit
c766e4c
·
1 Parent(s): 877b342

chunking of documents

Browse files
Files changed (1) hide show
  1. api/main.py +46 -0
api/main.py CHANGED
@@ -61,6 +61,13 @@ class DocumentUploadResponse(BaseModel):
61
  status: str
62
 
63
 
 
 
 
 
 
 
 
64
  class MCQGenerateRequest(BaseModel):
65
  source_type: str # "text", "document", "topic"
66
  source: str # text content, document name, or topic
@@ -264,6 +271,45 @@ async def upload_document(
264
  raise HTTPException(status_code=500, detail=str(e))
265
 
266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  @app.post("/rag/ingest-text")
268
  async def ingest_text_to_rag(
269
  text: str = Form(...),
 
61
  status: str
62
 
63
 
64
class DocumentChunksResponse(BaseModel):
    """Response body for the per-document chunk listing endpoint."""

    # Document name the chunks were looked up for.
    filename: str
    # One dict per stored chunk.
    chunks: List[dict]
    # Name of the embedding model reported for the store.
    embedding_model: str
    # Number of entries in ``chunks``.
    total_chunks: int
69
+
70
+
71
  class MCQGenerateRequest(BaseModel):
72
  source_type: str # "text", "document", "topic"
73
  source: str # text content, document name, or topic
 
271
  raise HTTPException(status_code=500, detail=str(e))
272
 
273
 
274
def _serialize_chunk(doc):
    """Convert one stored document entry into a JSON-friendly chunk dict."""
    embedding = doc['embedding']
    # Array-like embeddings expose ``tolist()``; anything else is passed
    # through unchanged.
    if hasattr(embedding, 'tolist'):
        embedding = embedding.tolist()
    return {
        'text': doc['text'],
        'embedding': embedding,
        'metadata': doc.get('metadata', {}),
    }


@app.get("/documents/{filename}/chunks", response_model=DocumentChunksResponse)
async def get_document_chunks(filename: str):
    """Get all chunks and embeddings for a specific document.

    A stored entry is treated as belonging to ``filename`` when its ``id``
    starts with ``"<filename>_"``.

    Args:
        filename: Document name taken from the URL path.

    Returns:
        A ``DocumentChunksResponse`` holding the matching chunks (text,
        embedding, metadata), the embedding model name from the store
        metadata (``"unknown"`` when not recorded) and the chunk count.

    Raises:
        HTTPException: 404 when no stored id matches ``filename``; 500 for
            any other error raised while reading the store.
    """
    try:
        vector_store = get_vector_store()
        id_prefix = f"{filename}_"

        # NOTE(review): prefix matching also selects chunks of any other
        # document whose name itself begins with "<filename>_" -- confirm
        # against the id scheme used at ingestion time.
        # Entries whose id is missing or not a string are skipped instead of
        # failing the whole request.
        chunks = [
            _serialize_chunk(doc)
            for doc in vector_store.data['documents']
            if isinstance(doc.get('id'), str) and doc['id'].startswith(id_prefix)
        ]

        if not chunks:
            raise HTTPException(status_code=404, detail=f"No chunks found for {filename}")

        return DocumentChunksResponse(
            filename=filename,
            chunks=chunks,
            embedding_model=vector_store.data['metadata'].get('embedding_model', 'unknown'),
            total_chunks=len(chunks),
        )
    except HTTPException:
        # Let the deliberate 404 propagate untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
311
+
312
+
313
  @app.post("/rag/ingest-text")
314
  async def ingest_text_to_rag(
315
  text: str = Form(...),