Spaces:

ugaray96
/

neural-search

Runtime error

App Files Community

ugmSorcero committed on Sep 23, 2022

Commit

cfc1673

2 Parent(s): 42468fb c9524e4

Merge branch 'main' into feature/audio_output

Browse files

Files changed (9) hide show

.gitignore +1 -3
.streamlit/config.toml +1 -1
app.py +1 -1
core/pipelines.py +13 -4
core/search_index.py +7 -1
interface/components.py +21 -24
interface/config.py +5 -1
interface/pages.py +6 -3
requirements.txt +1 -5

.gitignore CHANGED Viewed

@@ -128,6 +128,4 @@ dmypy.json
 # Pyre type checker
 .pyre/
-.vscode/
-data/audio/

 # Pyre type checker
 .pyre/
+.vscode/

.streamlit/config.toml CHANGED Viewed

@@ -1,5 +1,5 @@
 [theme]
-primaryColor="#ffbf00"
 backgroundColor="#0e1117"
 secondaryBackgroundColor="#282929"
 textColor = "#ffffff"

 [theme]
+primaryColor="#e5ab00"
 backgroundColor="#0e1117"
 secondaryBackgroundColor="#282929"
 textColor = "#ffffff"

app.py CHANGED Viewed

@@ -29,7 +29,7 @@ def run_demo():
     with navigation:
         selected_page = option_menu(
-            menu_title="Navigation",
             options=list(pages.keys()),
             icons=[f[1] for f in pages.values()],
             menu_icon="cast",

     with navigation:
         selected_page = option_menu(
+            menu_title=None,
             options=list(pages.keys()),
             icons=[f[1] for f in pages.values()],
             menu_icon="cast",

core/pipelines.py CHANGED Viewed

@@ -25,6 +25,8 @@ def keyword_search(index="documents", split_word_length=100, audio_output=False)
       - Documents that have more lexical overlap with the query are more likely to be relevant
       - Words that occur in fewer documents are more significant than words that occur in many documents
     """
     document_store = InMemoryDocumentStore(index=index)
     keyword_retriever = TfidfRetriever(document_store=(document_store))
@@ -45,10 +47,7 @@ def keyword_search(index="documents", split_word_length=100, audio_output=False)
     index_pipeline = Pipeline()
     index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
     index_pipeline.add_node(
-        keyword_retriever, name="TfidfRetriever", inputs=["Preprocessor"]
-    )
-    index_pipeline.add_node(
-        document_store, name="DocumentStore", inputs=["TfidfRetriever"]
     )
     if audio_output:
@@ -68,6 +67,7 @@ def dense_passage_retrieval(
     split_word_length=100,
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
 ):
     """
     **Dense Passage Retrieval Pipeline**
@@ -104,6 +104,15 @@ def dense_passage_retrieval(
     index_pipeline.add_node(
         document_store, name="DocumentStore", inputs=["DPRRetriever"]
     )
     return search_pipeline, index_pipeline

       - Documents that have more lexical overlap with the query are more likely to be relevant
       - Words that occur in fewer documents are more significant than words that occur in many documents
+    :warning: **(HAYSTACK BUG) Keyword Search doesn't work if you reindex:** Please refresh page in order to reindex
     """
     document_store = InMemoryDocumentStore(index=index)
     keyword_retriever = TfidfRetriever(document_store=(document_store))
     index_pipeline = Pipeline()
     index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
     index_pipeline.add_node(
+        document_store, name="DocumentStore", inputs=["Preprocessor"]
     )
     if audio_output:
     split_word_length=100,
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+    audio_output=False
 ):
     """
     **Dense Passage Retrieval Pipeline**
     index_pipeline.add_node(
         document_store, name="DocumentStore", inputs=["DPRRetriever"]
     )
+    if audio_output:
+        doc2speech = DocumentToSpeech(
+            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
+            generated_audio_dir=Path(data_path + "audio"),
+        )
+        search_pipeline.add_node(
+            doc2speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
+        )
     return search_pipeline, index_pipeline

core/search_index.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from haystack.schema import Document
 import uuid
@@ -17,8 +18,12 @@ def format_docs(documents):
     return db_docs, [doc.meta["id"] for doc in db_docs]
-def index(documents, pipeline):
     documents, doc_ids = format_docs(documents)
     pipeline.run(documents=documents)
     return doc_ids
@@ -36,6 +41,7 @@ def search(queries, pipeline):
                 "text": res.content,
                 "id": res.meta["id"],
                 "fragment_id": res.id,
             }
             if not score_is_empty:
                 match.update({"score": res.score})

 from haystack.schema import Document
+from haystack.document_stores import BaseDocumentStore
 import uuid
     return db_docs, [doc.meta["id"] for doc in db_docs]
+def index(documents, pipeline, clear_index=True):
     documents, doc_ids = format_docs(documents)
+    if clear_index:
+        document_stores = pipeline.get_nodes_by_class(class_type=BaseDocumentStore)
+        for docstore in document_stores:
+            docstore.delete_index(docstore.index)
     pipeline.run(documents=documents)
     return doc_ids
                 "text": res.content,
                 "id": res.meta["id"],
                 "fragment_id": res.id,
+                "meta": res.meta,
             }
             if not score_is_empty:
                 match.update({"score": res.score})

interface/components.py CHANGED Viewed

@@ -42,11 +42,15 @@ def component_select_pipeline(container):
                 "index_pipeline": index_pipeline,
                 "doc": pipeline_funcs[index_pipe].__doc__,
             }
 def component_show_pipeline(pipeline, pipeline_name):
     """Draw the pipeline"""
-    with st.expander("Show pipeline"):
         if pipeline["doc"] is not None:
             st.markdown(pipeline["doc"])
         fig = get_pipeline_graph(pipeline[pipeline_name])
@@ -59,6 +63,8 @@ def component_show_search_result(container, results):
             st.markdown(f"### Match {idx+1}")
             st.markdown(f"**Text**: {document['text']}")
             st.markdown(f"**Document**: {document['id']}")
             if "score" in document:
                 st.markdown(f"**Score**: {document['score']:.3f}")
             if "content_audio" in document:
@@ -66,36 +72,32 @@ def component_show_search_result(container, results):
             st.markdown("---")
-def component_text_input(container):
     """Draw the Text Input widget"""
     with container:
         texts = []
-        doc_id = 1
         with st.expander("Enter documents"):
             while True:
                 text = st.text_input(f"Document {doc_id}", key=doc_id)
                 if text != "":
-                    texts.append({"text": text})
                     doc_id += 1
                     st.markdown("---")
                 else:
                     break
-        corpus = [
-            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(texts)
-        ]
-        return corpus
-def component_article_url(container):
     """Draw the Article URL widget"""
     with container:
         urls = []
-        doc_id = 1
         with st.expander("Enter URLs"):
             while True:
                 url = st.text_input(f"URL {doc_id}", key=doc_id)
                 if url != "":
-                    urls.append({"text": extract_text_from_url(url)})
                     doc_id += 1
                     st.markdown("---")
                 else:
@@ -103,19 +105,16 @@ def component_article_url(container):
         for idx, doc in enumerate(urls):
             with st.expander(f"Preview URL {idx}"):
-                st.write(doc)
-        corpus = [
-            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
-        ]
-        return corpus
-def component_file_input(container):
     """Draw the extract text from file widget"""
     with container:
         files = []
-        doc_id = 1
         with st.expander("Enter Files"):
             while True:
                 file = st.file_uploader(
@@ -124,7 +123,7 @@ def component_file_input(container):
                 if file != None:
                     extracted_text = extract_text_from_file(file)
                     if extracted_text != None:
-                        files.append({"text": extracted_text})
                         doc_id += 1
                         st.markdown("---")
                     else:
@@ -134,9 +133,7 @@ def component_file_input(container):
         for idx, doc in enumerate(files):
             with st.expander(f"Preview File {idx}"):
-                st.write(doc)
-        corpus = [
-            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
-        ]
-        return corpus

                 "index_pipeline": index_pipeline,
                 "doc": pipeline_funcs[index_pipe].__doc__,
             }
+            st.session_state["doc_id"] = 0
 def component_show_pipeline(pipeline, pipeline_name):
     """Draw the pipeline"""
+    expander_text = "Show pipeline"
+    if pipeline["doc"] is not None and "BUG" in pipeline["doc"]:
+        expander_text += "  ⚠️"
+    with st.expander(expander_text):
         if pipeline["doc"] is not None:
             st.markdown(pipeline["doc"])
         fig = get_pipeline_graph(pipeline[pipeline_name])
             st.markdown(f"### Match {idx+1}")
             st.markdown(f"**Text**: {document['text']}")
             st.markdown(f"**Document**: {document['id']}")
+            if "_split_id" in document["meta"]:
+                st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
             if "score" in document:
                 st.markdown(f"**Score**: {document['score']:.3f}")
             if "content_audio" in document:
             st.markdown("---")
+def component_text_input(container, doc_id):
     """Draw the Text Input widget"""
     with container:
         texts = []
         with st.expander("Enter documents"):
             while True:
                 text = st.text_input(f"Document {doc_id}", key=doc_id)
                 if text != "":
+                    texts.append({"text": text, "doc_id": doc_id})
                     doc_id += 1
                     st.markdown("---")
                 else:
                     break
+        corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in texts]
+        return corpus, doc_id
+def component_article_url(container, doc_id):
     """Draw the Article URL widget"""
     with container:
         urls = []
         with st.expander("Enter URLs"):
             while True:
                 url = st.text_input(f"URL {doc_id}", key=doc_id)
                 if url != "":
+                    urls.append({"text": extract_text_from_url(url), "doc_id": doc_id})
                     doc_id += 1
                     st.markdown("---")
                 else:
         for idx, doc in enumerate(urls):
             with st.expander(f"Preview URL {idx}"):
+                st.write(doc["text"])
+        corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in urls]
+        return corpus, doc_id
+def component_file_input(container, doc_id):
     """Draw the extract text from file widget"""
     with container:
         files = []
         with st.expander("Enter Files"):
             while True:
                 file = st.file_uploader(
                 if file != None:
                     extracted_text = extract_text_from_file(file)
                     if extracted_text != None:
+                        files.append({"text": extracted_text, "doc_id": doc_id})
                         doc_id += 1
                         st.markdown("---")
                     else:
         for idx, doc in enumerate(files):
             with st.expander(f"Preview File {idx}"):
+                st.write(doc["text"])
+        corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in files]
+        return corpus, doc_id

interface/config.py CHANGED Viewed

@@ -1,7 +1,11 @@
 from interface.pages import page_landing_page, page_search, page_index
 # Define default Session Variables over the whole session.
-session_state_variables = {"pipeline": None, "pipeline_func_parameters": []}
 # Define Pages for the demo
 pages = {

 from interface.pages import page_landing_page, page_search, page_index
 # Define default Session Variables over the whole session.
+session_state_variables = {
+    "pipeline": None,
+    "pipeline_func_parameters": [],
+    "doc_id": 0,
+}
 # Define Pages for the demo
 pages = {

interface/pages.py CHANGED Viewed

@@ -79,14 +79,17 @@ def page_index(container):
             orientation="horizontal",
         )
-        corpus = input_funcs[selected_input][0](container)
         if len(corpus) > 0:
             index_results = None
             if st.button("Index"):
                 index_results = index(
-                    corpus,
-                    st.session_state["pipeline"]["index_pipeline"],
                 )
             if index_results:
                 st.write(index_results)

             orientation="horizontal",
         )
+        clear_index = st.sidebar.checkbox("Clear Index", True)
+        doc_id = st.session_state["doc_id"]
+        corpus, doc_id = input_funcs[selected_input][0](container, doc_id)
         if len(corpus) > 0:
             index_results = None
             if st.button("Index"):
                 index_results = index(
+                    corpus, st.session_state["pipeline"]["index_pipeline"], clear_index
                 )
+                st.session_state["doc_id"] = doc_id
             if index_results:
                 st.write(index_results)

requirements.txt CHANGED Viewed

@@ -5,8 +5,4 @@ black==22.8.0
 plotly==5.10.0
 newspaper3k==0.2.8
 PyPDF2==2.10.7
-pytesseract==0.3.10
-soundfile==0.10.3.post1
-espnet
-pydub==0.25.1
-espnet_model_zoo==0.1.7

 plotly==5.10.0
 newspaper3k==0.2.8
 PyPDF2==2.10.7
+pytesseract==0.3.10