Commit: removed UploadDocs

langgraph_init.py  CHANGED  (+69 −70)
@@ -359,82 +359,81 @@ def process_docx(file):
     return docx_content


-[removed: the previous version of this upload-handling block, largely
- identical to the added block below; not every removed line survived the
- page extraction]
+def upload_documents(files):
+    global vectorstore_retriever
+
+    embedding_model = init_embed()
+
+    all_documents = []
+    for uploaded_file in files:
+
+        if uploaded_file.type == "text/plain":
+            # string_data = ( uploaded_file.read()).decode("utf-8")
+            string_data = process_text(uploaded_file)
+            all_documents.append(Document(page_content=string_data, metadata={"source": uploaded_file.name}))
+        elif uploaded_file.type == "application/pdf":
+            pdf_text = process_pdf(uploaded_file)
+
+            # pdf_bytes = io.BytesIO( uploaded_file.read())
+            # reader = PyPDF2.PdfReader(pdf_bytes)
+            # pdf_text = "".join([page.extract_text() + "\n" for page in reader.pages])
+            all_documents.append(Document(page_content=pdf_text, metadata={"source": uploaded_file.name}))
+
+        elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+            docx_content = process_docx(uploaded_file)
+
+            # docx_bytes = io.BytesIO( uploaded_file.read())
+            # docx_docs = dx(docx_bytes)
+            # docx_content = "\n".join([para.text for para in docx_docs.paragraphs])
+            all_documents.append(Document(page_content=docx_content, metadata={"source": uploaded_file.name}))
+        else:
+            raise Exception(status_code=400, detail=f"Unsupported file type: {uploaded_file.name} ({uploaded_file.type})")
+
+    if not all_documents:
+        raise Exception(status_code=400, detail="No supported documents uploaded.")
+
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    text_chunks = text_splitter.split_documents(all_documents)
+    print("text_chucks: ", text_chunks[:100])
+
+    processed_chunks_with_ids = []
+    for i, chunk in enumerate(text_chunks):
+        # Generate a unique ID for each chunk
+        # Option 1 (Recommended): Using UUID for global uniqueness
+        # chunk_id = str(uuid.uuid4())
+
+        # Option 2 (Alternative): Combining source file path with chunk index
+        # This is good if you want IDs to be deterministic based on file/chunk.
+        # You might need to make the file path more robust (e.g., hash it or normalize it).
+        file_source = chunk.metadata.get('source', 'unknown_source')
+        chunk_id = f"{file_source.replace('.','_')}_chunk_{i}"
+
+        # Add the unique ID to the chunk's metadata
+        # It's good practice to keep original metadata and just add your custom ID.
+        chunk.metadata['doc_id'] = chunk_id
+
+
+        processed_chunks_with_ids.append(chunk)
+    # embeddings = [embedding_model.encode(doc_chunks.page_content, convert_to_numpy=True) for doc_chunks in processed_chunks_with_ids]
+
+    print(f"Split {len(processed_chunks_with_ids)} chunks.")
+    print(f"Assigned unique 'doc_id' to each chunk in metadata.")
+    # dimension = 768
+    # # hnsw_m = 32
+    # # index = faiss.IndexHNSWFlat(dimension, hnsw_m, faiss.METRIC_INNER_PRODUCT)
+    # index = faiss.IndexFlatL2(dimension)
+    # vector_store = FAISS(
+    #     embedding_function=embedding_model.embed_query,
+    #     index=index,
+    #     docstore= InMemoryDocstore(),
+    #     index_to_docstore_id={}
+    # )
+    vectorstore = FAISS.from_documents(documents=processed_chunks_with_ids, embedding=embedding_model)
+    vectorstore.add_documents(processed_chunks_with_ids, ids = [cid.metadata['doc_id'] for cid in processed_chunks_with_ids])
+    # vectorstore_retriever = vectorstore.as_retriever(search_kwargs={'k': 5})
+    vectorstore_retriever = vectorstore
+    msg = f"Successfully processed {len(files)} documents and created knowledge base."
+    return msg
 
 # @app.post("/chat", response_model=ChatResponse)
 def chat_with_rag(chatdata):
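A few details of the added upload_documents are worth flagging. Exception() takes no keyword arguments, so raise Exception(status_code=400, detail=...) would itself fail with a TypeError (status_code/detail belong to FastAPI's HTTPException, and the FastAPI decorators in this file are commented out). FAISS.from_documents() already embeds and indexes the chunks it is given, so the follow-up add_documents() call on the same list stores every chunk twice. And the function leaves vectorstore_retriever holding the raw vector store rather than a retriever. The sketch below shows one way to address these points; it assumes the file's existing helpers (init_embed, process_text, process_pdf, process_docx), that files holds Streamlit UploadedFile objects (consistent with the .type/.name attributes used), and current langchain-core/langchain-community import paths, which may differ from those actually in langgraph_init.py. It is an illustration, not the committed code.

from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

def upload_documents(files):
    """Build the in-memory FAISS knowledge base from uploaded files."""
    global vectorstore_retriever
    embedding_model = init_embed()  # existing helper in langgraph_init.py

    all_documents = []
    for uploaded_file in files:  # Streamlit UploadedFile-like objects (assumed)
        if uploaded_file.type == "text/plain":
            content = process_text(uploaded_file)
        elif uploaded_file.type == "application/pdf":
            content = process_pdf(uploaded_file)
        elif uploaded_file.type == DOCX_MIME:
            content = process_docx(uploaded_file)
        else:
            # Exception() rejects status_code=/detail= kwargs; raise a plain
            # error here (or fastapi.HTTPException when served as an API).
            raise ValueError(f"Unsupported file type: {uploaded_file.name} ({uploaded_file.type})")
        all_documents.append(Document(page_content=content, metadata={"source": uploaded_file.name}))

    if not all_documents:
        raise ValueError("No supported documents uploaded.")

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(all_documents)

    # Deterministic per-chunk ids from source file + chunk index, stored
    # both in the chunk metadata and as the docstore ids.
    ids = []
    for i, chunk in enumerate(chunks):
        source = chunk.metadata.get("source", "unknown_source")
        chunk_id = f"{source.replace('.', '_')}_chunk_{i}"
        chunk.metadata["doc_id"] = chunk_id
        ids.append(chunk_id)

    # from_documents() embeds and indexes the chunks in one pass; calling
    # add_documents() on the same chunks afterwards would duplicate them.
    vectorstore = FAISS.from_documents(chunks, embedding_model, ids=ids)
    vectorstore_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    return f"Successfully processed {len(files)} documents and created knowledge base."

Hypothetical wiring from a Streamlit page, for context (the UI code is outside this hunk): uploads = st.file_uploader("Documents", accept_multiple_files=True); if uploads: st.write(upload_documents(uploads)).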