SRA25 committed
Commit 028d4a9 · verified · 1 Parent(s): 07c9a11

removed UploadDocs
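
This change drops the UploadDocs class wrapper and promotes upload_documents to a module-level function; the body is unchanged apart from one level of dedent. The wrapper added nothing: the method declared no self parameter, so it was only callable as UploadDocs.upload_documents(files), never on an instance. A minimal sketch of the call-site change; the caller and get_uploaded_files are hypothetical and not part of this diff:

    files = get_uploaded_files()                # hypothetical source of uploads
    # msg = UploadDocs.upload_documents(files)  # before: class-qualified call only
    msg = upload_documents(files)               # after: plain function call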

Files changed (1):
  1. langgraph_init.py (+69, −70)
langgraph_init.py CHANGED
@@ -359,82 +359,81 @@ def process_docx(file):
     return docx_content
 
 
-class UploadDocs:
-    def upload_documents(files):
-        global vectorstore_retriever
-
-        embedding_model = init_embed()
-
-        all_documents = []
-        for uploaded_file in files:
-
-            if uploaded_file.type == "text/plain":
-                # string_data = ( uploaded_file.read()).decode("utf-8")
-                string_data = process_text(uploaded_file)
-                all_documents.append(Document(page_content=string_data, metadata={"source": uploaded_file.name}))
-            elif uploaded_file.type == "application/pdf":
-                pdf_text = process_pdf(uploaded_file)
-
-                # pdf_bytes = io.BytesIO( uploaded_file.read())
-                # reader = PyPDF2.PdfReader(pdf_bytes)
-                # pdf_text = "".join([page.extract_text() + "\n" for page in reader.pages])
-                all_documents.append(Document(page_content=pdf_text, metadata={"source": uploaded_file.name}))
-
-            elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-                docx_content = process_docx(uploaded_file)
-
-                # docx_bytes = io.BytesIO( uploaded_file.read())
-                # docx_docs = dx(docx_bytes)
-                # docx_content = "\n".join([para.text for para in docx_docs.paragraphs])
-                all_documents.append(Document(page_content=docx_content, metadata={"source": uploaded_file.name}))
-            else:
-                raise Exception(status_code=400, detail=f"Unsupported file type: {uploaded_file.name} ({uploaded_file.type})")
-
-        if not all_documents:
-            raise Exception(status_code=400, detail="No supported documents uploaded.")
-
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-        text_chunks = text_splitter.split_documents(all_documents)
-        print("text_chucks: ", text_chunks[:100])
-
-        processed_chunks_with_ids = []
-        for i, chunk in enumerate(text_chunks):
-            # Generate a unique ID for each chunk
-            # Option 1 (Recommended): Using UUID for global uniqueness
-            # chunk_id = str(uuid.uuid4())
-
-            # Option 2 (Alternative): Combining source file path with chunk index
-            # This is good if you want IDs to be deterministic based on file/chunk.
-            # You might need to make the file path more robust (e.g., hash it or normalize it).
-            file_source = chunk.metadata.get('source', 'unknown_source')
-            chunk_id = f"{file_source.replace('.','_')}_chunk_{i}"
-
-            # Add the unique ID to the chunk's metadata
-            # It's good practice to keep original metadata and just add your custom ID.
-            chunk.metadata['doc_id'] = chunk_id
-
-
-            processed_chunks_with_ids.append(chunk)
-        # embeddings = [embedding_model.encode(doc_chunks.page_content, convert_to_numpy=True) for doc_chunks in processed_chunks_with_ids]
-
-        print(f"Split {len(processed_chunks_with_ids)} chunks.")
-        print(f"Assigned unique 'doc_id' to each chunk in metadata.")
-        # dimension = 768
-        # # hnsw_m = 32
-        # # index = faiss.IndexHNSWFlat(dimension, hnsw_m, faiss.METRIC_INNER_PRODUCT)
-        # index = faiss.IndexFlatL2(dimension)
-        # vector_store = FAISS(
-        #     embedding_function=embedding_model.embed_query,
-        #     index=index,
-        #     docstore= InMemoryDocstore(),
-        #     index_to_docstore_id={}
-        # )
-        vectorstore = FAISS.from_documents(documents=processed_chunks_with_ids, embedding=embedding_model)
-        vectorstore.add_documents(processed_chunks_with_ids, ids = [cid.metadata['doc_id'] for cid in processed_chunks_with_ids])
-        # vectorstore_retriever = vectorstore.as_retriever(search_kwargs={'k': 5})
-        vectorstore_retriever = vectorstore
-        msg = f"Successfully processed {len(files)} documents and created knowledge base."
-        return msg
+def upload_documents(files):
+    global vectorstore_retriever
+
+    embedding_model = init_embed()
+
+    all_documents = []
+    for uploaded_file in files:
+
+        if uploaded_file.type == "text/plain":
+            # string_data = ( uploaded_file.read()).decode("utf-8")
+            string_data = process_text(uploaded_file)
+            all_documents.append(Document(page_content=string_data, metadata={"source": uploaded_file.name}))
+        elif uploaded_file.type == "application/pdf":
+            pdf_text = process_pdf(uploaded_file)
+
+            # pdf_bytes = io.BytesIO( uploaded_file.read())
+            # reader = PyPDF2.PdfReader(pdf_bytes)
+            # pdf_text = "".join([page.extract_text() + "\n" for page in reader.pages])
+            all_documents.append(Document(page_content=pdf_text, metadata={"source": uploaded_file.name}))
+
+        elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+            docx_content = process_docx(uploaded_file)
+
+            # docx_bytes = io.BytesIO( uploaded_file.read())
+            # docx_docs = dx(docx_bytes)
+            # docx_content = "\n".join([para.text for para in docx_docs.paragraphs])
+            all_documents.append(Document(page_content=docx_content, metadata={"source": uploaded_file.name}))
+        else:
+            raise Exception(status_code=400, detail=f"Unsupported file type: {uploaded_file.name} ({uploaded_file.type})")
+
+    if not all_documents:
+        raise Exception(status_code=400, detail="No supported documents uploaded.")
+
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    text_chunks = text_splitter.split_documents(all_documents)
+    print("text_chucks: ", text_chunks[:100])
+
+    processed_chunks_with_ids = []
+    for i, chunk in enumerate(text_chunks):
+        # Generate a unique ID for each chunk
+        # Option 1 (Recommended): Using UUID for global uniqueness
+        # chunk_id = str(uuid.uuid4())
+
+        # Option 2 (Alternative): Combining source file path with chunk index
+        # This is good if you want IDs to be deterministic based on file/chunk.
+        # You might need to make the file path more robust (e.g., hash it or normalize it).
+        file_source = chunk.metadata.get('source', 'unknown_source')
+        chunk_id = f"{file_source.replace('.','_')}_chunk_{i}"
+
+        # Add the unique ID to the chunk's metadata
+        # It's good practice to keep original metadata and just add your custom ID.
+        chunk.metadata['doc_id'] = chunk_id
+
+
+        processed_chunks_with_ids.append(chunk)
+    # embeddings = [embedding_model.encode(doc_chunks.page_content, convert_to_numpy=True) for doc_chunks in processed_chunks_with_ids]
+
+    print(f"Split {len(processed_chunks_with_ids)} chunks.")
+    print(f"Assigned unique 'doc_id' to each chunk in metadata.")
+    # dimension = 768
+    # # hnsw_m = 32
+    # # index = faiss.IndexHNSWFlat(dimension, hnsw_m, faiss.METRIC_INNER_PRODUCT)
+    # index = faiss.IndexFlatL2(dimension)
+    # vector_store = FAISS(
+    #     embedding_function=embedding_model.embed_query,
+    #     index=index,
+    #     docstore= InMemoryDocstore(),
+    #     index_to_docstore_id={}
+    # )
+    vectorstore = FAISS.from_documents(documents=processed_chunks_with_ids, embedding=embedding_model)
+    vectorstore.add_documents(processed_chunks_with_ids, ids = [cid.metadata['doc_id'] for cid in processed_chunks_with_ids])
+    # vectorstore_retriever = vectorstore.as_retriever(search_kwargs={'k': 5})
+    vectorstore_retriever = vectorstore
+    msg = f"Successfully processed {len(files)} documents and created knowledge base."
+    return msg
 
 # @app.post("/chat", response_model=ChatResponse)
 def chat_with_rag(chatdata):
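
Two pre-existing issues survive the refactor unchanged. First, raise Exception(status_code=400, detail=...) cannot work as written: the builtin Exception rejects keyword arguments, so the raise statement itself fails with a TypeError. The keywords match FastAPI's HTTPException, which the commented-out @app.post decorator suggests was meant. Second, FAISS.from_documents already embeds and indexes processed_chunks_with_ids, so the follow-up add_documents call stores every chunk a second time under a different id. A minimal corrected sketch, assuming FastAPI and a langchain_community FAISS whose from_documents forwards the ids keyword to from_texts; the two helper names are hypothetical:

    from fastapi import HTTPException
    from langchain_community.vectorstores import FAISS

    def reject_unsupported(uploaded_file):
        # Unlike the builtin Exception, HTTPException accepts these keywords.
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {uploaded_file.name} ({uploaded_file.type})",
        )

    def build_vectorstore(chunks, embedding_model):
        # Build the index in one pass; passing ids up front replaces the
        # separate add_documents() call that would duplicate every chunk.
        return FAISS.from_documents(
            documents=chunks,
            embedding=embedding_model,
            ids=[c.metadata["doc_id"] for c in chunks],
        )

Note also that vectorstore_retriever ends up holding the vector store itself; restoring the commented-out as_retriever(search_kwargs={'k': 5}) call is what would actually yield a retriever.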