Jay-10020 committed on
Commit
32c68ab
·
1 Parent(s): 33dfbb4

rag/ingest-text api for audio

Browse files
Files changed (1) hide show
  1. api/main.py +54 -0
api/main.py CHANGED
@@ -266,6 +266,60 @@ async def upload_document(
266
  raise HTTPException(status_code=500, detail=str(e))
267
 
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  @app.post("/query", response_model=QueryResponse)
270
  async def query_documents(request: QueryRequest):
271
  """Query RAG system with semantic search"""
 
266
  raise HTTPException(status_code=500, detail=str(e))
267
 
268
 
269
@app.post("/rag/ingest-text")
async def ingest_text_to_rag(
    text: str = Form(...),
    lecture_title: str = Form("Transcript"),
    institution_id: Optional[str] = Form(None),
    course_id: Optional[str] = Form(None),
    teacher_id: Optional[str] = Form(None),
    recording_id: Optional[str] = Form(None),
):
    """Ingest edited plain text directly into the RAG knowledge base.

    Used when a teacher corrects a lecture transcript in the app after the
    initial auto-transcription — ensures the corrected text is what students
    search against, not the original version.

    Args:
        text: The transcript text to chunk and store. Must contain at least
            one non-whitespace character.
        lecture_title: Stored in each chunk's metadata as ``lecture_title``.
        institution_id: Stored in each chunk's metadata (may be None).
        course_id: Stored in each chunk's metadata (may be None).
        teacher_id: Stored in each chunk's metadata (may be None).
        recording_id: Stored in each chunk's metadata and, when provided,
            used as the prefix of the generated chunk ids.

    Returns:
        ``{"status": "success", "chunks_added": <int>}``.

    Raises:
        HTTPException: 400 when ``text`` is blank; 500 (with the underlying
            error message as detail) when processing or storage fails.
    """
    import tempfile
    import time as _time
    import uuid

    # Reject blank input up front instead of pushing an empty document
    # through the processor and the vector store.
    if not text.strip():
        raise HTTPException(status_code=400, detail="text must not be empty")

    try:
        doc_processor = get_doc_processor()
        vector_store = get_vector_store()

        metadata = {
            "institution_id": institution_id,
            "course_id": course_id,
            "lecture_title": lecture_title,
            "teacher_id": teacher_id,
            "content_type": "lecture_transcript",
            "recording_id": recording_id,
        }

        # Write the text to a temporary file so doc_processor can chunk it.
        # The file is created inside the try/finally so it is removed even
        # when the write itself fails (e.g. disk full), and it is closed
        # before being handed to the processor.
        tmp_name = None
        try:
            with tempfile.NamedTemporaryFile(
                mode="w", suffix=".txt", delete=False, encoding="utf-8"
            ) as tmp:
                tmp_name = tmp.name
                tmp.write(text)
            chunks = doc_processor.process_document(tmp_name, metadata)
        finally:
            if tmp_name is not None:
                Path(tmp_name).unlink(missing_ok=True)

        # Nothing to store: skip the vector-store call rather than sending
        # it three empty lists.
        if not chunks:
            return {"status": "success", "chunks_added": 0}

        texts = [c.text for c in chunks]
        metadatas = [c.metadata for c in chunks]

        # Without a recording_id, a timestamp alone collides when two
        # requests arrive within the same second; the random suffix keeps
        # the generated chunk ids unique.
        doc_id = recording_id or f"text_{int(_time.time())}_{uuid.uuid4().hex[:8]}"
        # NOTE(review): with a recording_id the chunk ids are reused on
        # re-ingest. If the corrected text produces fewer chunks than the
        # previous version, the surplus old chunks are not removed here —
        # confirm how vector_store.add_documents treats existing ids.
        ids = [f"{doc_id}_chunk_{i}" for i in range(len(chunks))]

        vector_store.add_documents(texts, metadatas, ids)
        return {"status": "success", "chunks_added": len(chunks)}
    except HTTPException:
        # Preserve deliberate HTTP errors instead of rewrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
321
+
322
+
323
  @app.post("/query", response_model=QueryResponse)
324
  async def query_documents(request: QueryRequest):
325
  """Query RAG system with semantic search"""