import datetime import os import gradio as gr from chain import get_new_chain1 import os import langchain # logging.basicConfig(stream=sys.stdout, level=logging.INFO) # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) from langchain.embeddings.openai import OpenAIEmbeddings from langchain.vectorstores import Chroma from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, PythonCodeTextSplitter from langchain.document_loaders import TextLoader from abc import ABC from typing import List, Optional, Any import chromadb from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings from langchain.vectorstores import Chroma class CachedChroma(Chroma, ABC): """ Wrapper around Chroma to make caching embeddings easier. It automatically uses a cached version of a specified collection, if available. Example: .. code-block:: python from langchain.vectorstores import Chroma from langchain.embeddings.openai import OpenAIEmbeddings embeddings = OpenAIEmbeddings() vectorstore = CachedChroma.from_documents_with_cache( ".persisted_data", texts, embeddings, collection_name="fun_experiement" ) """ @classmethod def from_documents_with_cache( cls, persist_directory: str, documents: List[Document], embedding: Optional[Embeddings] = None, ids: Optional[List[str]] = None, collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME, client_settings: Optional[chromadb.config.Settings] = None, **kwargs: Any, ) -> Chroma: settings = chromadb.config.Settings( chroma_db_impl="duckdb+parquet", persist_directory=persist_directory ) client = chromadb.Client(settings) collection_names = [c.name for c in client.list_collections()] if collection_name in collection_names: return Chroma( collection_name=collection_name, embedding_function=embedding, persist_directory=persist_directory, client_settings=client_settings, ) return Chroma.from_documents( documents=documents, embedding=embedding, ids=ids, collection_name=collection_name, persist_directory=persist_directory, client_settings=client_settings, **kwargs ) def get_docs(): local_repo_path_1 = "pycbc/" loaders = [] docs = [] for root, dirs, files in os.walk(local_repo_path_1): for file in files: file_path = os.path.join(root, file) rel_file_path = os.path.relpath(file_path, local_repo_path_1) # Filter by file extension if any(rel_file_path.endswith(ext) for ext in [".py", ".sh"]): # Filter by directory if any(rel_file_path.startswith(d) for d in ["pycbc/", "examples/"]): docs.append(rel_file_path) if any(rel_file_path.startswith(d) for d in ["bin/"]): docs.append(rel_file_path) loaders.extend([TextLoader(os.path.join(local_repo_path_1, doc)).load() for doc in docs]) py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0) text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0) documents = [] for load in loaders: try: if load[0].metadata['source'][-3:] == ".py" == "" or "pycbc/bin/" in load[0].metadata['source']: documents.extend(py_splitter.split_documents(load)) except Exception as e: documents.extend(text_splitter.split_documents(load)) return documents def set_chain_up(api_key, model_selector, k_textbox, agent): if api_key: os.environ["OPENAI_API_KEY"] = api_key documents = get_docs() embeddings = OpenAIEmbeddings() vectorstore = CachedChroma.from_documents_with_cache(".persisted_data", documents, embedding=embeddings, collection_name="pycbc") if not model_selector: model_selector = "gpt-3.5-turbo" if not k_textbox: k_textbox = 10 else: k_textbox = int(k_textbox) qa_chain = get_new_chain1(vectorstore, model_selector, k_textbox) os.environ["OPENAI_API_KEY"] = "" return qa_chain def chat(inp, history, agent): history = history or [] if agent is None: history.append((inp, "Please paste your OpenAI key to use")) return history, history print("\n==== date/time: " + str(datetime.datetime.now()) + " ====") print("inp: " + inp) history = history or [] output = agent({"question": inp, "chat_history": history}) answer = output["answer"] history.append((inp, answer)) print(history) return history, history block = gr.Blocks(css=".gradio-container {background-color: lightgray}") with block: with gr.Row(): gr.Markdown("