Commit: removed UploadDocs

langgraph_init.py  CHANGED  (+69 −70)
@@ -359,82 +359,81 @@ def process_docx(file):
     return docx_content


-[removed: the previous version of this upload-handling block, largely
- identical to the added block below; not every removed line survived the
- page extraction]
+def upload_documents(files):
+    global vectorstore_retriever
+
+    embedding_model = init_embed()
+
+    all_documents = []
+    for uploaded_file in files:
+
+        if uploaded_file.type == "text/plain":
+            # string_data = ( uploaded_file.read()).decode("utf-8")
+            string_data = process_text(uploaded_file)
+            all_documents.append(Document(page_content=string_data, metadata={"source": uploaded_file.name}))
+        elif uploaded_file.type == "application/pdf":
+            pdf_text = process_pdf(uploaded_file)
+
+            # pdf_bytes = io.BytesIO( uploaded_file.read())
+            # reader = PyPDF2.PdfReader(pdf_bytes)
+            # pdf_text = "".join([page.extract_text() + "\n" for page in reader.pages])
+            all_documents.append(Document(page_content=pdf_text, metadata={"source": uploaded_file.name}))
+
+        elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+            docx_content = process_docx(uploaded_file)
+
+            # docx_bytes = io.BytesIO( uploaded_file.read())
+            # docx_docs = dx(docx_bytes)
+            # docx_content = "\n".join([para.text for para in docx_docs.paragraphs])
+            all_documents.append(Document(page_content=docx_content, metadata={"source": uploaded_file.name}))
+        else:
+            raise Exception(status_code=400, detail=f"Unsupported file type: {uploaded_file.name} ({uploaded_file.type})")
+
+    if not all_documents:
+        raise Exception(status_code=400, detail="No supported documents uploaded.")
+
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    text_chunks = text_splitter.split_documents(all_documents)
+    print("text_chucks: ", text_chunks[:100])
+
+    processed_chunks_with_ids = []
+    for i, chunk in enumerate(text_chunks):
+        # Generate a unique ID for each chunk
+        # Option 1 (Recommended): Using UUID for global uniqueness
+        # chunk_id = str(uuid.uuid4())
+
+        # Option 2 (Alternative): Combining source file path with chunk index
+        # This is good if you want IDs to be deterministic based on file/chunk.
+        # You might need to make the file path more robust (e.g., hash it or normalize it).
+        file_source = chunk.metadata.get('source', 'unknown_source')
+        chunk_id = f"{file_source.replace('.','_')}_chunk_{i}"
+
+        # Add the unique ID to the chunk's metadata
+        # It's good practice to keep original metadata and just add your custom ID.
+        chunk.metadata['doc_id'] = chunk_id
+
+
+        processed_chunks_with_ids.append(chunk)
+    # embeddings = [embedding_model.encode(doc_chunks.page_content, convert_to_numpy=True) for doc_chunks in processed_chunks_with_ids]
+
+    print(f"Split {len(processed_chunks_with_ids)} chunks.")
+    print(f"Assigned unique 'doc_id' to each chunk in metadata.")
+    # dimension = 768
+    # # hnsw_m = 32
+    # # index = faiss.IndexHNSWFlat(dimension, hnsw_m, faiss.METRIC_INNER_PRODUCT)
+    # index = faiss.IndexFlatL2(dimension)
+    # vector_store = FAISS(
+    #     embedding_function=embedding_model.embed_query,
+    #     index=index,
+    #     docstore= InMemoryDocstore(),
+    #     index_to_docstore_id={}
+    # )
+    vectorstore = FAISS.from_documents(documents=processed_chunks_with_ids, embedding=embedding_model)
+    vectorstore.add_documents(processed_chunks_with_ids, ids = [cid.metadata['doc_id'] for cid in processed_chunks_with_ids])
+    # vectorstore_retriever = vectorstore.as_retriever(search_kwargs={'k': 5})
+    vectorstore_retriever = vectorstore
+    msg = f"Successfully processed {len(files)} documents and created knowledge base."
+    return msg
 
 # @app.post("/chat", response_model=ChatResponse)
 def chat_with_rag(chatdata):
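A few details of the added upload_documents are worth flagging. Exception() takes no keyword arguments, so raise Exception(status_code=400, detail=...) would itself fail with a TypeError (status_code/detail belong to FastAPI's HTTPException, and the FastAPI decorators in this file are commented out). FAISS.from_documents() already embeds and indexes the chunks it is given, so the follow-up add_documents() call on the same list stores every chunk twice. And the function leaves vectorstore_retriever holding the raw vector store rather than a retriever. The sketch below shows one way to address these points; it assumes the file's existing helpers (init_embed, process_text, process_pdf, process_docx), that files holds Streamlit UploadedFile objects (consistent with the .type/.name attributes used), and current langchain-core/langchain-community import paths, which may differ from those actually in langgraph_init.py. It is an illustration, not the committed code.

from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

def upload_documents(files):
    """Build the in-memory FAISS knowledge base from uploaded files."""
    global vectorstore_retriever
    embedding_model = init_embed()  # existing helper in langgraph_init.py

    all_documents = []
    for uploaded_file in files:  # Streamlit UploadedFile-like objects (assumed)
        if uploaded_file.type == "text/plain":
            content = process_text(uploaded_file)
        elif uploaded_file.type == "application/pdf":
            content = process_pdf(uploaded_file)
        elif uploaded_file.type == DOCX_MIME:
            content = process_docx(uploaded_file)
        else:
            # Exception() rejects status_code=/detail= kwargs; raise a plain
            # error here (or fastapi.HTTPException when served as an API).
            raise ValueError(f"Unsupported file type: {uploaded_file.name} ({uploaded_file.type})")
        all_documents.append(Document(page_content=content, metadata={"source": uploaded_file.name}))

    if not all_documents:
        raise ValueError("No supported documents uploaded.")

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(all_documents)

    # Deterministic per-chunk ids from source file + chunk index, stored
    # both in the chunk metadata and as the docstore ids.
    ids = []
    for i, chunk in enumerate(chunks):
        source = chunk.metadata.get("source", "unknown_source")
        chunk_id = f"{source.replace('.', '_')}_chunk_{i}"
        chunk.metadata["doc_id"] = chunk_id
        ids.append(chunk_id)

    # from_documents() embeds and indexes the chunks in one pass; calling
    # add_documents() on the same chunks afterwards would duplicate them.
    vectorstore = FAISS.from_documents(chunks, embedding_model, ids=ids)
    vectorstore_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    return f"Successfully processed {len(files)} documents and created knowledge base."

Hypothetical wiring from a Streamlit page, for context (the UI code is outside this hunk): uploads = st.file_uploader("Documents", accept_multiple_files=True); if uploads: st.write(upload_documents(uploads)).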