Spaces:
Sleeping
Sleeping
| from langchain_google_genai import ChatGoogleGenerativeAI | |
| from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredHTMLLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_google_genai import GoogleGenerativeAIEmbeddings | |
| from langchain_chroma import Chroma | |
| from typing import List | |
| from langchain_core.documents import Document | |
| import os | |
| from langchain_google_genai import GoogleGenerativeAIEmbeddings | |
# Re-export GOOGLE_API_KEY only when it is actually set: os.environ accepts
# only str values, so assigning the None that os.getenv() returns for a
# missing variable would raise TypeError at import time.
_google_api_key = os.getenv("GOOGLE_API_KEY")
if _google_api_key is not None:
    os.environ["GOOGLE_API_KEY"] = _google_api_key

# Shared splitter: target chunk size of 1000 and overlap of 200, both measured
# with len() (i.e. in characters).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)

# Embedding model used for every document added to the vector store.
embedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Chroma vector store configured with ./chroma_db as its persist directory.
vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)
def load_and_split_document(file_path: str) -> List[Document]:
    """Load a PDF, DOCX, or HTML file and split it into chunks.

    The loader is selected from the file extension. The extension is compared
    case-insensitively, so names such as ``REPORT.PDF`` are accepted as well.

    Args:
        file_path: Path of the document to load.

    Returns:
        The chunks produced by the module-level ``text_splitter`` from the
        loaded document(s).

    Raises:
        ValueError: If the path does not end in ``.pdf``, ``.docx`` or
            ``.html`` (in any letter case).
    """
    # Lower-case only the copy used for matching; the loader still receives
    # the path exactly as the caller supplied it.
    normalized_path = file_path.lower()
    if normalized_path.endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    elif normalized_path.endswith('.docx'):
        loader = Docx2txtLoader(file_path)
    elif normalized_path.endswith('.html'):
        loader = UnstructuredHTMLLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_path}")
    documents = loader.load()
    return text_splitter.split_documents(documents)
def index_document_to_chroma(file_path: str, file_id: int) -> bool:
    """Split the document at ``file_path`` and add its chunks to the vector store.

    Every chunk gets ``file_id`` written into its metadata under the
    ``'file_id'`` key before being added.

    Args:
        file_path: Path of the document to index.
        file_id: Identifier stored in each chunk's metadata.

    Returns:
        True when loading, tagging and adding all completed; False when any
        of those steps raised (the error is printed, not propagated).
    """
    try:
        chunks = load_and_split_document(file_path)
        # Tag each chunk with the id of the file it came from.
        for chunk in chunks:
            chunk.metadata['file_id'] = file_id
        vectorstore.add_documents(chunks)
    except Exception as err:
        print(f"Error indexing document: {err}")
        return False
    else:
        return True
def delete_doc_from_chroma(file_id: int):
    """Delete every chunk whose metadata ``file_id`` equals ``file_id``.

    The matching chunks are fetched first so their count can be printed, then
    removed with a metadata-filtered delete.

    Args:
        file_id: Identifier used as the metadata filter.

    Returns:
        True when the lookup and delete completed; False when either raised
        (the error is printed, not propagated).
    """
    file_filter = {"file_id": file_id}
    try:
        matches = vectorstore.get(where=file_filter)
        chunk_count = len(matches['ids'])
        print(f"Found {chunk_count} document chunks for file_id {file_id}")
        # NOTE(review): this reaches into the private ``_collection`` attribute
        # of the store; confirm whether a public delete API accepting a
        # ``where`` filter is available before changing it.
        vectorstore._collection.delete(where=file_filter)
        print(f"Deleted all documents with file_id {file_id}")
    except Exception as err:
        reason = str(err)
        print(f"Error deleting document with file_id {file_id} from Chroma: {reason}")
        return False
    else:
        return True