Spaces:

JustusI
/

data_roles

Sleeping

App Files Community

JustusI committed on Jul 12, 2024

Commit

61caa6e

verified ·

1 Parent(s): 2285b10

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -63

app.py CHANGED Viewed

@@ -10,52 +10,38 @@ from langchain_core.messages import HumanMessage, SystemMessage
 from langchain_openai import ChatOpenAI
 from transformers import AutoTokenizer, AutoModelForCausalLM
-# Function to load and process data
-def load_data(file_path):
-    df = pd.read_csv(file_path)
-    return df
-# Function to load documents from DataFrame
-def load_documents(df, content_column):
-    docs = DataFrameLoader(df, page_content_column=content_column).load()
-    return docs
-# Function to tokenize documents
-# def tokenize_documents(docs):
-#     encoder = tiktoken.get_encoding("cl100k_base")
-#     tokens_per_docs = [len(encoder.encode(doc.page_content)) for doc in docs]
-#     total_tokens = sum(tokens_per_docs)
-#     cost_per_1000_tokens = 0.0001
-#     cost = (total_tokens / 1000) * cost_per_1000_tokens
-#     return tokens_per_docs, cost
-# Function to create vector database
-def create_vector_db(docs):
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    texts = text_splitter.split_documents(docs)
     embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-    vectordb = Chroma.from_documents(docs, embedding_function,persist_directory='./chroma_db')
-    vectordb.persist()
-    vectordb = None
-    vectordb = Chroma(persist_directory=vectordb, embedding_function=embedding_function)
     return vectordb
 # Function to augment prompt
 def augment_prompt(query, vectordb):
     results = vectordb.similarity_search(query, k=3)
     source_knowledge = "\n".join([x.page_content for x in results])
-    augmented_prompt = f"""Using the contexts below, answer the query. If some information is not provided within
-    the contexts below, do not include, and if the query cannot be answered with the below information, say "I don't know".
-    Contexts:
     {source_knowledge}
-    Query: {query}"""
     return augmented_prompt
-# Function to handle chat
-def chat_with_ai(query, vectordb,openai_api_key):
-    chat = ChatOpenAI(model_name="gpt-3.5-turbo",openai_api_key=openai_api_key)
     augmented_query = augment_prompt(query, vectordb)
     prompt = HumanMessage(content=augmented_query)
     messages = [
@@ -68,33 +54,17 @@ def chat_with_ai(query, vectordb,openai_api_key):
 # Streamlit UI
 st.title("Document Processing and AI Chat with LangChain")
-# File upload
-uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-if uploaded_file is not None:
-    # Load and process data
-    df = load_data(uploaded_file)
-    st.write("Data loaded successfully!")
-    # Load documents
-    docs = load_documents(df, 'page_content')
-    st.write(f"Loaded {len(docs)} documents")
-    # Tokenize documents
-    # tokens_per_docs, cost = tokenize_documents(docs)
-    # st.write(f"Total tokens: {sum(tokens_per_docs)}")
-    # st.write(f"Estimated cost: ${cost:.4f}")
-    # Create vector database
-    vectordb = create_vector_db(docs)
-    st.write("Vector database created and persisted successfully!")
-    # Query input
-    query = st.text_input("Enter your query", "Recommend a company to work as a data scientist in the health sector")
-    if st.button("Get Answer"):
-        # Chat with AI
-        openai_api_key = os.getenv("OPENAI_API_KEY")
-        response = chat_with_ai(query, vectordb, openai_api_key)
-        st.write("Response from AI:")
-        st.write(response)

 from langchain_openai import ChatOpenAI
 from transformers import AutoTokenizer, AutoModelForCausalLM
+# Function to load vector database
+def load_vector_db(zip_file_path, extract_path):
+    with st.spinner("Loading vector store..."):
+        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
+            zip_ref.extractall(extract_path)
     embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+    vectordb = Chroma(
+        persist_directory=extract_path,
+        embedding_function=embedding_function
+    )
+    st.success("Vector store loaded")
     return vectordb
 # Function to augment prompt
 def augment_prompt(query, vectordb):
     results = vectordb.similarity_search(query, k=3)
     source_knowledge = "\n".join([x.page_content for x in results])
+    augmented_prompt = f"""
+    You are an AI assistant. Use the context provided below to answer the question as comprehensively as possible.
+    If the answer is not contained within the context, respond with "I don't know".
+    Context:
     {source_knowledge}
+    Question: {query}
+    """
     return augmented_prompt
+# Function to handle chat with OpenAI
+def chat_with_openai(query, vectordb, openai_api_key):
+    chat = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=openai_api_key)
     augmented_query = augment_prompt(query, vectordb)
     prompt = HumanMessage(content=augmented_query)
     messages = [
 # Streamlit UI
 st.title("Document Processing and AI Chat with LangChain")
+# Load vector database
+zip_file_path = "chroma_db_compressed_.zip"
+extract_path = "./chroma_db_extracted"
+vectordb = load_vector_db(zip_file_path, extract_path)
+# Query input
+query = st.text_input("Enter your query", "Recommend a company to work as a data scientist in the health sector")
+if st.button("Get Answer"):
+    # Chat with OpenAI
+    openai_api_key = st.secrets["OPENAI_API_KEY"]
+    response = chat_with_openai(query, vectordb, openai_api_key)
+    st.write("Response from AI:")
+    st.write(response)