Spaces:

intoxication
/

WbRules

Runtime error

App Files Files Community

intoxication committed on Sep 11, 2023

Commit

df118cf

1 Parent(s): 209df17

Update utils/haystack.py

Browse files

Files changed (1) hide show

utils/haystack.py +66 -16

utils/haystack.py CHANGED Viewed

@@ -1,22 +1,72 @@
 import streamlit as st
-from haystack import Pipeline
-from haystack.schema import Answer
-#Use this file to set up your Haystack pipeline and querying
-# cached to make index and models load only at start
-@st.cache_resource(show_spinner=False)
-def start_haystack():
-    #Use this function to contruct a pipeline
-    pipe = Pipeline()
-    return pipe
-pipe = start_haystack()
-@st.cache_data(show_spinner=True)
-def query(question):
-    print("Received question")
-    params = {}
-    # results = pipe.run(question, params=params)
-    return [Answer(answer="results", context="Call  pipe.run(question, params=params) and return results in /utils/haystack.py query()")]

 import streamlit as st
+import logging
+import pandas as pd
+from haystack.utils import print_answers
+from haystack.pipelines import Pipeline
+from haystack.document_stores import ElasticsearchDocumentStore
+from haystack.nodes import EmbeddingRetriever
+from haystack.nodes.other.docs2answers import Docs2Answers
+from haystack.utils import launch_es, fetch_archive_from_http
+# Initialize logging
+logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
+logging.getLogger("haystack").setLevel(logging.INFO)
+# Launch Elasticsearch
+launch_es()
+# Initialize the Haystack pipeline and document store
+document_store = ElasticsearchDocumentStore(
+    host="localhost",
+    username="",
+    password="",
+    index="document",
+    embedding_field="question_emb",
+    embedding_dim=384,
+    excluded_meta_data=["question_emb"],
+    similarity="cosine",
+)
+retriever = EmbeddingRetriever(
+    document_store=document_store,
+    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+    use_gpu=True,
+    scale_score=False,
+)
+doc_to_answers = Docs2Answers()
+doc_dir = "data/basic_faq_pipeline"
+s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/small_faq_covid.csv1.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv")
+# Minimal cleaning
+df.fillna(value="", inplace=True)
+df["question"] = df["question"].apply(lambda x: x.strip())
+# Get embeddings for our questions from the FAQs
+questions = list(df["question"].values)
+df["question_emb"] = retriever.embed_queries(queries=questions).tolist()
+df = df.rename(columns={"question": "content"})
+# Convert Dataframe to list of dicts and index them in our DocumentStore
+docs_to_index = df.to_dict(orient="records")
+document_store.write_documents(docs_to_index)
+# Initialize a Pipeline (this time without a reader) and ask questions
+pipeline = Pipeline()
+pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
+pipeline.add_node(component=doc_to_answers, name="Docs2Answers", inputs=["Retriever"])
+# Create the Streamlit app
+st.title("FAQ Search")
+question = st.text_input("Ask a question:")
+if question:
+    params = {"Retriever": {"top_k": 10}}  # Modify parameters as needed
+    prediction = pipeline.run(query=question, params=params)
+    st.subheader("Answers:")
+    print_answers(prediction, details="medium")