Spaces:

NeuML
/

txtai

Running

App Files Files Community

davidmezzetti committed on Jan 17, 2022

Commit

5ade8fe

1 Parent(s): f689d08

Update app.py

Browse files

Files changed (1) hide show

app.py +283 -217

app.py CHANGED Viewed

@@ -5,8 +5,6 @@ Based on this example: https://github.com/neuml/txtai/blob/master/examples/workf
 """
 import os
-import re
-import uuid
 import nltk
 import yaml
@@ -19,18 +17,36 @@ from txtai.pipeline import Segmentation, Summary, Tabular, Textractor, Translati
 from txtai.workflow import ServiceTask, Task, UrlTask, Workflow
-class Application:
     """
-    Main application.
     """
-    def __init__(self, directory):
         """
-        Creates a new application.
         """
-        # Workflow configuration directory
-        self.directory = directory
         # Component options
         self.components = {}
@@ -46,38 +62,199 @@ class Application:
         self.documents = None
         self.data = None
-        # Workflow run id
-        self.runid = None
     def default(self, names):
         """
         Gets default workflow index.
         Args:
             names: list of workflow names
         Returns:
            default workflow index
         """
         # Get names as lowercase to match case-insensitive
         lnames = [name.lower() for name in names]
         # Get default workflow param
         params = st.experimental_get_query_params()
         index = params.get("default")
         index = index[0].lower() if index else 0
         # Lookup index of workflow name, add 1 to account for "--"
         if index and index in lnames:
             return lnames.index(index) + 1
         # Workflow not found, default to index 0
         return 0
     def load(self, components):
         """
-        Load an existing workflow file.
         Args:
             components: list of components to load
@@ -86,7 +263,7 @@ class Application:
             (names of components loaded, workflow config)
         """
-        with open(os.path.join(self.directory, "config.yml")) as f:
             config = yaml.safe_load(f)
         names = [row["name"] for row in config]
@@ -95,7 +272,7 @@ class Application:
         selected = st.selectbox("Load workflow", ["--"] + names, self.default(names))
         if selected != "--":
             index = [x for x, name in enumerate(names) if name == selected][0]
-            with open(os.path.join(self.directory, files[index])) as f:
                 workflow = yaml.safe_load(f)
             st.markdown("---")
@@ -165,12 +342,13 @@ class Application:
         return config.get(name, default) if config else default
-    def text(self, label, config, name, default=None):
         """
         Create a new text input field.
         Args:
             label: field label
             config: component configuration
             name: setting name
             default: default setting value
@@ -187,14 +365,15 @@ class Application:
         elif isinstance(default, dict):
             default = ",".join(default.keys())
-        return st.text_input(label, value=default)
-    def number(self, label, config, name, default=None):
         """
         Creates a new numeric input field.
         Args:
             label: field label
             config: component configuration
             name: setting name
             default: default setting value
@@ -203,15 +382,16 @@ class Application:
             numeric value
         """
-        value = self.text(label, config, name, default)
         return int(value) if value else None
-    def boolean(self, label, config, name, default=False):
         """
         Creates a new checkbox field.
         Args:
             label: field label
             config: component configuration
             name: setting name
             default: default setting value
@@ -221,14 +401,15 @@ class Application:
         """
         default = self.setting(config, name, default)
-        return st.checkbox(label, value=default)
-    def select(self, label, config, name, options, default=0):
         """
         Creates a new select box field.
         Args:
             label: field label
             config: component configuration
             name: setting name
             options: list of dropdown options
@@ -244,7 +425,7 @@ class Application:
         # Derive default index
         default = index[0] if index else default
-        return st.selectbox(label, options, index=default)
     def split(self, text):
         """
@@ -274,8 +455,6 @@ class Application:
         # pylint: disable=R0912, R0915
         options = {"type": component}
-        st.markdown("---")
         # Lookup component configuration
         #   - Runtime components have config defined within tasks
         #   - Pipeline components have config defined at workflow root
@@ -292,8 +471,9 @@ class Application:
         if component == "embeddings":
             st.markdown("**Embeddings Index**  \n*Index workflow output*")
-            options["path"] = self.text("Embeddings model path", config, "path", "sentence-transformers/nli-mpnet-base-v2")
-            options["upsert"] = self.boolean("Upsert", config, "upsert")
         elif component in ("segmentation", "textractor"):
             if component == "segmentation":
@@ -301,19 +481,19 @@ class Application:
             else:
                 st.markdown("**Textract**  \n*Extract text from documents*")
-            options["sentences"] = self.boolean("Split sentences", config, "sentences")
-            options["lines"] = self.boolean("Split lines", config, "lines")
-            options["paragraphs"] = self.boolean("Split paragraphs", config, "paragraphs")
-            options["join"] = self.boolean("Join tokenized", config, "join")
-            options["minlength"] = self.number("Min section length", config, "minlength")
         elif component == "service":
             st.markdown("**Service**  \n*Extract data from an API*")
-            options["url"] = self.text("URL", config, "url")
-            options["method"] = self.select("Method", config, "method", ["get", "post"], 0)
-            options["params"] = self.text("URL parameters", config, "params")
-            options["batch"] = self.boolean("Run as batch", config, "batch", True)
-            options["extract"] = self.text("Subsection(s) to extract", config, "extract")
             if options["params"]:
                 options["params"] = {key: None for key in self.split(options["params"])}
@@ -322,71 +502,30 @@ class Application:
         elif component == "summary":
             st.markdown("**Summary**  \n*Abstractive text summarization*")
-            options["path"] = self.text("Model", config, "path", "sshleifer/distilbart-cnn-12-6")
-            options["minlength"] = self.number("Min length", config, "minlength")
-            options["maxlength"] = self.number("Max length", config, "maxlength")
         elif component == "tabular":
             st.markdown("**Tabular**  \n*Split tabular data into rows and columns*")
-            options["idcolumn"] = self.text("Id columns", config, "idcolumn")
-            options["textcolumns"] = self.text("Text columns", config, "textcolumns")
             if options["textcolumns"]:
                 options["textcolumns"] = self.split(options["textcolumns"])
         elif component == "translation":
             st.markdown("**Translate**  \n*Machine translation*")
-            options["target"] = self.text("Target language code", config, "args", "en")
         return options
-    def build(self, components):
-        """
-        Builds a workflow using components.
-        Args:
-            components: list of components to add to workflow
-        """
-        # Clear application
-        self.__init__(self.directory)
-        # pylint: disable=W0108
-        tasks = []
-        for component in components:
-            component = dict(component)
-            wtype = component.pop("type")
-            self.components[wtype] = component
-            if wtype == "embeddings":
-                self.embeddings = Embeddings({**component})
-                self.documents = Documents()
-                tasks.append(Task(self.documents.add, unpack=False))
-            elif wtype == "segmentation":
-                self.pipelines[wtype] = Segmentation(**self.components[wtype])
-                tasks.append(Task(self.pipelines[wtype]))
-            elif wtype == "service":
-                tasks.append(ServiceTask(**self.components[wtype]))
-            elif wtype == "summary":
-                self.pipelines[wtype] = Summary(component.pop("path"))
-                tasks.append(Task(lambda x: self.pipelines["summary"](x, **self.components["summary"])))
-            elif wtype == "tabular":
-                self.pipelines[wtype] = Tabular(**self.components[wtype])
-                tasks.append(Task(self.pipelines[wtype]))
-            elif wtype == "textractor":
-                self.pipelines[wtype] = Textractor(**self.components[wtype])
-                tasks.append(UrlTask(self.pipelines[wtype]))
-            elif wtype == "translation":
-                self.pipelines[wtype] = Translation()
-                tasks.append(Task(lambda x: self.pipelines["translation"](x, **self.components["translation"])))
-        self.workflow = Workflow(tasks)
     def yaml(self, components):
         """
         Builds a yaml string for components.
@@ -398,7 +537,6 @@ class Application:
             (workflow name, YAML string)
         """
-        # pylint: disable=W0108
         data = {"app": {"data": self.state("data"), "query": self.state("query")}}
         tasks = []
         name = None
@@ -446,111 +584,75 @@ class Application:
         return (name, yaml.dump(data))
-    def find(self, key):
         """
-        Lookup record from cached data by uid key.
         Args:
-            key: uid to search for
         Returns:
-            text for matching uid
         """
-        text = [text for uid, text, _ in self.data if uid == key][0]
-        if key and key.lower().startswith("http"):
-            return "<a href='%s' rel='noopener noreferrer' target='blank'>%s</a>" % (key, text)
-        return text
-    def process(self, data, workflow):
         """
-        Processes the current application action.
         Args:
-            data: input data
             workflow: workflow configuration
-        """
-        if data and self.workflow:
-            # Build tuples for embedding index
-            if self.documents:
-                data = [(x, element, None) for x, element in enumerate(data)]
-            # Process workflow
-            for result in self.workflow(data):
-                if not self.documents:
-                    st.write(result)
-            # Build embeddings index
-            if self.documents:
-                # Cache data
-                self.data = list(self.documents)
-                with st.spinner("Building embedding index...."):
-                    self.embeddings.index(self.documents)
-                    self.documents.close()
-                # Clear workflow
-                self.documents, self.pipelines, self.workflow = None, None, None
-            # Generate workflow run id
-            self.runid = str(uuid.uuid1())
-            st.session_state["runid"] = self.runid
-        if self.runid != self.state("runid"):
-            st.error("Workflow data changed in another session. Please re-build and re-run workflow.")
-        elif self.embeddings and self.data:
-            default = self.appsetting(workflow, "query")
-            default = default if default else ""
-            # Set query and limit
-            query = st.text_input("Query", value=default)
-            limit = min(5, len(self.data))
-            # Save query state
-            st.session_state["query"] = query
-            st.markdown(
-                """
-            <style>
-            table td:nth-child(1) {
-                display: none
-            }
-            table th:nth-child(1) {
-                display: none
-            }
-            table {text-align: left !important}
-            </style>
-            """,
-                unsafe_allow_html=True,
-            )
-            if query:
-                df = pd.DataFrame([{"content": self.find(uid), "score": "%.2f" % score} for uid, score in self.embeddings.search(query, limit)])
-                st.write(df.to_html(escape=False), unsafe_allow_html=True)
-    def parse(self, data):
         """
-        Parse input data, splits on new lines depending on type of tasks and format of input.
         Args:
-            data: input data
-        Returns:
-            parsed data
         """
-        if re.match(r"^(http|https|file):\/\/", data) or (self.workflow and isinstance(self.workflow.tasks[0], ServiceTask)):
-            return [x for x in data.split("\n") if x]
-        return [data]
     def run(self):
         """
         Runs Streamlit application.
         """
         with st.sidebar:
             st.image("https://github.com/neuml/txtai/raw/master/logo.png", width=256)
             st.markdown("# Workflow builder  \n*Build and apply workflows to data*  ")
@@ -558,68 +660,32 @@ class Application:
             st.markdown("---")
             # Component configuration
-            labels = {"segmentation": "segment", "textractor": "textract", "translation": "translate"}
             components = ["embeddings", "segmentation", "service", "summary", "tabular", "textractor", "translation"]
             selected, workflow = self.load(components)
-            selected = st.multiselect("Select components", components, default=selected, format_func=lambda text: labels.get(text, text))
-            # Get selected options
-            components = [self.options(component, workflow) for component in selected]
-            st.markdown("---")
-            # Export buttons
-            col1, col2 = st.columns(2)
-            # Build or re-build workflow when build button clicked or new workflow loaded
-            build = col1.button("Build", help="Build the workflow and run within this application")
-            if build or (workflow and workflow != self.state("workflow")):
-                with st.spinner("Building workflow...."):
-                    self.build(components)
-            # Generate API configuration
-            _, config = self.yaml(components)
-            col2.download_button("Export", config, file_name="workflow.yml", help="Export the API workflow as YAML")
-        with st.expander("Data", expanded=not self.data):
-            default = self.appsetting(workflow, "data")
-            default = default if default else ""
-            data = st.text_area("Input", height=10, value=default)
-            # Save data and workflow state
-            st.session_state["data"] = data
-            st.session_state["workflow"] = workflow
         if selected:
-            # Parse text items
-            data = self.parse(data) if data else data
             # Process current action
-            self.process(data, workflow)
-@st.cache(allow_output_mutation=True)
-def create():
-    """
-    Creates and caches a Streamlit application.
-    Returns:
-        Application
-    """
-    return Application("workflows")
 if __name__ == "__main__":
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
     try:
         nltk.sent_tokenize("This is a test. Split")
     except:
         nltk.download("punkt")
     # Create and run application
-    app = create()
     app.run()

 """
 import os
 import nltk
 import yaml
 from txtai.workflow import ServiceTask, Task, UrlTask, Workflow
+class Process:
     """
+    Container for an active Workflow process instance.
     """
+    @staticmethod
+    @st.cache(ttl=30 * 60, max_entries=3, allow_output_mutation=True, show_spinner=False)
+    def get(components):
         """
+        Looks up or creates a workflow process instance.
+        The function is wrapped with st.cache (ttl of 30 * 60 seconds, at most 3 entries, output
+        mutation allowed), so a call with previously seen components is expected to return the
+        cached Process instead of building a new one.
+        Args:
+            components: list of component option dicts, passed as-is to Process.build
+        Returns:
+            Process with a workflow built from components
         """
+        process = Process()
+        # Build workflow
+        with st.spinner("Building workflow...."):
+            process.build(components)
+        return process
+    def __init__(self):
+        """
+        Creates a new Process.
+        """
         # Component options
         self.components = {}
         self.documents = None
         self.data = None
+    def build(self, components):
+        """
+        Builds a workflow using components. Tasks are added in the order components are listed
+        and the resulting Workflow is stored in self.workflow.
+        Args:
+            components: list of components to add to workflow, each a dict with a "type" key,
+                        the remaining keys are that component's options
+        """
+        # pylint: disable=W0108
+        tasks = []
+        for component in components:
+            # Copy so that popping keys below doesn't modify the caller's dict
+            component = dict(component)
+            wtype = component.pop("type")
+            self.components[wtype] = component
+            if wtype == "embeddings":
+                self.embeddings = Embeddings({**component})
+                self.documents = Documents()
+                # Workflow output is collected in self.documents, the index itself is built in run
+                tasks.append(Task(self.documents.add, unpack=False))
+            elif wtype == "segmentation":
+                self.pipelines[wtype] = Segmentation(**self.components[wtype])
+                tasks.append(Task(self.pipelines[wtype]))
+            elif wtype == "service":
+                tasks.append(ServiceTask(**self.components[wtype]))
+            elif wtype == "summary":
+                # component is the same dict as self.components[wtype], after the pop only the
+                # remaining options are passed as keyword arguments when the task runs
+                self.pipelines[wtype] = Summary(component.pop("path"))
+                tasks.append(Task(lambda x: self.pipelines["summary"](x, **self.components["summary"])))
+            elif wtype == "tabular":
+                self.pipelines[wtype] = Tabular(**self.components[wtype])
+                tasks.append(Task(self.pipelines[wtype]))
+            elif wtype == "textractor":
+                self.pipelines[wtype] = Textractor(**self.components[wtype])
+                tasks.append(UrlTask(self.pipelines[wtype]))
+            elif wtype == "translation":
+                # Pipeline is created without arguments, component options are passed when the task runs
+                self.pipelines[wtype] = Translation()
+                tasks.append(Task(lambda x: self.pipelines["translation"](x, **self.components["translation"])))
+        self.workflow = Workflow(tasks)
+    def run(self, data):
+        """
+        Runs a workflow using data as input.
+        Without an embeddings component, each workflow result is written to the page. With one,
+        workflow output is indexed and documents, pipelines and workflow are cleared afterwards,
+        which makes later calls on this instance a no-op.
+        Args:
+            data: input data, a list of elements to process
+        """
+        if data and self.workflow:
+            # Build tuples for embedding index
+            if self.documents:
+                # Position in the input list is used as the id
+                data = [(x, element, None) for x, element in enumerate(data)]
+            # Process workflow
+            for result in self.workflow(data):
+                if not self.documents:
+                    st.write(result)
+            # Build embeddings index
+            if self.documents:
+                # Cache data
+                self.data = list(self.documents)
+                with st.spinner("Building embedding index...."):
+                    self.embeddings.index(self.documents)
+                    self.documents.close()
+                # Clear workflow
+                self.documents, self.pipelines, self.workflow = None, None, None
+    def search(self, query):
+        """
+        Runs a search.
+        Args:
+            query: input query
+        """
+        if self.embeddings and query:
+            st.markdown(
+                """
+            <style>
+            table td:nth-child(1) {
+                display: none
+            }
+            table th:nth-child(1) {
+                display: none
+            }
+            table {text-align: left !important}
+            </style>
+            """,
+                unsafe_allow_html=True,
+            )
+            limit = min(5, len(self.data))
+            results = []
+            for result in self.embeddings.search(query, limit):
+                # Tuples are returned when an index doesn't have stored content
+                if isinstance(result, tuple):
+                    uid, score = result
+                    results.append({"text": self.find(uid), "score": f"{score:.2}"})
+                else:
+                    if "id" in result and "text" in result:
+                        result["text"] = self.content(result.pop("id"), result["text"])
+                    if "score" in result and result["score"]:
+                        result["score"] = f'{result["score"]:.2}'
+                    results.append(result)
+            df = pd.DataFrame(results)
+            st.write(df.to_html(escape=False), unsafe_allow_html=True)
+    def find(self, key):
+        """
+        Lookup record from cached data by uid key.
+        Args:
+            key: id to search for
+        Returns:
+            text for matching id
+        """
+        # Lookup text by id
+        text = [text for uid, text, _ in self.data if uid == key][0]
+        return self.content(key, text)
+    def content(self, uid, text):
+        """
+        Builds a content reference for uid and text.
+        Args:
+            uid: record id
+            text: record text
+        Returns:
+            content
+        """
+        if uid and uid.lower().startswith("http"):
+            return f"<a href='{uid}' rel='noopener noreferrer' target='blank'>{text}</a>"
+        return text
+class Application:
+    """
+    Main application.
+    """
+    def __init__(self, directory):
+        """
+        Creates a new application.
+        Args:
+            directory: workflow configuration directory, config.yml and workflow files are read from it
+        """
+        # Workflow configuration directory
+        self.directory = directory
     def default(self, names):
         """
+        Gets default workflow index from the "default" URL query parameter.
+        The parameter value is matched against workflow names ignoring case.
         Args:
             names: list of workflow names
         Returns:
+            position of the matching workflow in the select box, 0 (the "--" entry) when there is no match
         """
         # Get names as lowercase to match case-insensitive
         lnames = [name.lower() for name in names]
         # Get default workflow param
         params = st.experimental_get_query_params()
         index = params.get("default")
+        # Only the first value of the query parameter is used
         index = index[0].lower() if index else 0
         # Lookup index of workflow name, add 1 to account for "--"
         if index and index in lnames:
             return lnames.index(index) + 1
         # Workflow not found, default to index 0
         return 0
     def load(self, components):
         """
+        Load an existing workflow file.
         Args:
             components: list of components to load
             (names of components loaded, workflow config)
         """
+        with open(os.path.join(self.directory, "config.yml"), encoding="utf-8") as f:
             config = yaml.safe_load(f)
         names = [row["name"] for row in config]
         selected = st.selectbox("Load workflow", ["--"] + names, self.default(names))
         if selected != "--":
             index = [x for x, name in enumerate(names) if name == selected][0]
+            with open(os.path.join(self.directory, files[index]), encoding="utf-8") as f:
                 workflow = yaml.safe_load(f)
             st.markdown("---")
         return config.get(name, default) if config else default
+    def text(self, label, component, config, name, default=None):
         """
         Create a new text input field.
         Args:
             label: field label
+            component: component name
             config: component configuration
             name: setting name
             default: default setting value
         elif isinstance(default, dict):
             default = ",".join(default.keys())
+        return st.text_input(label, value=default, key=component + name, disabled=True)
+    def number(self, label, component, config, name, default=None):
         """
         Creates a new numeric input field.
         Args:
             label: field label
+            component: component name
             config: component configuration
             name: setting name
             default: default setting value
             numeric value
         """
+        value = self.text(label, component, config, name, default)
         return int(value) if value else None
+    def boolean(self, label, component, config, name, default=False):
         """
         Creates a new checkbox field.
         Args:
             label: field label
+            component: component name
             config: component configuration
             name: setting name
             default: default setting value
         """
         default = self.setting(config, name, default)
+        return st.checkbox(label, value=default, key=component + name, disabled=True)
+    def select(self, label, component, config, name, options, default=0):
         """
         Creates a new select box field.
         Args:
             label: field label
+            component: component name
             config: component configuration
             name: setting name
             options: list of dropdown options
         # Derive default index
         default = index[0] if index else default
+        return st.selectbox(label, options, index=default, key=component + name, disabled=True)
     def split(self, text):
         """
         # pylint: disable=R0912, R0915
         options = {"type": component}
         # Lookup component configuration
         #   - Runtime components have config defined within tasks
         #   - Pipeline components have config defined at workflow root
         if component == "embeddings":
             st.markdown("**Embeddings Index**  \n*Index workflow output*")
+            options["path"] = self.text("Embeddings model path", component, config, "path", "sentence-transformers/nli-mpnet-base-v2")
+            options["upsert"] = self.boolean("Upsert", component, config, "upsert")
+            options["content"] = self.boolean("Content", component, config, "content")
         elif component in ("segmentation", "textractor"):
             if component == "segmentation":
             else:
                 st.markdown("**Textract**  \n*Extract text from documents*")
+            options["sentences"] = self.boolean("Split sentences", component, config, "sentences")
+            options["lines"] = self.boolean("Split lines", component, config, "lines")
+            options["paragraphs"] = self.boolean("Split paragraphs", component, config, "paragraphs")
+            options["join"] = self.boolean("Join tokenized", component, config, "join")
+            options["minlength"] = self.number("Min section length", component, config, "minlength")
         elif component == "service":
             st.markdown("**Service**  \n*Extract data from an API*")
+            options["url"] = self.text("URL", component, config, "url")
+            options["method"] = self.select("Method", component, config, "method", ["get", "post"], 0)
+            options["params"] = self.text("URL parameters", component, config, "params")
+            options["batch"] = self.boolean("Run as batch", component, config, "batch", True)
+            options["extract"] = self.text("Subsection(s) to extract", component, config, "extract")
             if options["params"]:
                 options["params"] = {key: None for key in self.split(options["params"])}
         elif component == "summary":
             st.markdown("**Summary**  \n*Abstractive text summarization*")
+            options["path"] = self.text("Model", component, config, "path", "sshleifer/distilbart-cnn-12-6")
+            options["minlength"] = self.number("Min length", component, config, "minlength")
+            options["maxlength"] = self.number("Max length", component, config, "maxlength")
         elif component == "tabular":
             st.markdown("**Tabular**  \n*Split tabular data into rows and columns*")
+            options["idcolumn"] = self.text("Id columns", component, config, "idcolumn")
+            options["textcolumns"] = self.text("Text columns", component, config, "textcolumns")
+            options["content"] = self.text("Content", component, config, "content")
             if options["textcolumns"]:
                 options["textcolumns"] = self.split(options["textcolumns"])
+            if options["content"]:
+                options["content"] = self.split(options["content"])
+                if len(options["content"]) == 1 and options["content"][0] == "1":
+                    options["content"] = options["content"][0]
         elif component == "translation":
             st.markdown("**Translate**  \n*Machine translation*")
+            options["target"] = self.text("Target language code", component, config, "args", "en")
         return options
     def yaml(self, components):
         """
         Builds a yaml string for components.
             (workflow name, YAML string)
         """
         data = {"app": {"data": self.state("data"), "query": self.state("query")}}
         tasks = []
         name = None
         return (name, yaml.dump(data))
+    def data(self, workflow):
         """
+        Gets input data. Starts with the workflow's "data" app setting and, only when the workflow
+        has no "query" app setting, shows an "Input" text field pre-filled with that value.
         Args:
+            workflow: workflow configuration
         Returns:
+            input data wrapped as a single element list
         """
+        # Get default data setting
+        data = self.appsetting(workflow, "data")
+        # NOTE(review): appsetting is defined outside this view, presumably it returns None when unset
+        if not self.appsetting(workflow, "query"):
+            data = st.text_input("Input", value=data)
+        # Save data state
+        st.session_state["data"] = data
+        # Wrap data as list for workflow processing
+        return [data]
+    def query(self, workflow):
         """
+        Gets input query. Shows a "Query" text field pre-filled with the workflow's "query"
+        app setting (empty string when not set) and saves the entered value in session state.
         Args:
             workflow: workflow configuration
+        Returns:
+            input query
+        """
+        default = self.appsetting(workflow, "query")
+        default = default if default else ""
+        # Set query and limit
+        query = st.text_input("Query", value=default)
+        # Save query state
+        st.session_state["query"] = query
+        return query
+    def process(self, workflow, components):
         """
+        Processes the current application action. Gets the workflow process for components,
+        runs it with the input data and, when the process has an embeddings index, runs a search.
         Args:
+            workflow: workflow configuration
+            components: workflow components, list of component option dicts
         """
+        # Get workflow process
+        process = Process.get(components)
+        # Run workflow process
+        process.run(self.data(workflow))
+        # Run search
+        if process.embeddings:
+            # Query field is only shown for workflows with an embeddings index
+            process.search(self.query(workflow))
     def run(self):
         """
+        Runs Streamlit application. Renders the sidebar with the workflow selection and component
+        options, processes the selected workflow and then adds an export button to the sidebar.
         """
+        # Load configuration
         with st.sidebar:
             st.image("https://github.com/neuml/txtai/raw/master/logo.png", width=256)
             st.markdown("# Workflow builder  \n*Build and apply workflows to data*  ")
             st.markdown("---")
             # Component configuration
             components = ["embeddings", "segmentation", "service", "summary", "tabular", "textractor", "translation"]
             selected, workflow = self.load(components)
+            if selected:
+                # Get selected options
+                # components changes from a list of names to a list of option dicts here
+                components = [self.options(component, workflow) for component in selected]
         if selected:
             # Process current action
+            self.process(workflow, components)
+            with st.sidebar:
+                # Generate export button after workflow is complete
+                _, config = self.yaml(components)
+                st.download_button("Export", config, file_name="workflow.yml", help="Export the API workflow as YAML")
 if __name__ == "__main__":
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    # pylint: disable=W0702
+    # Probe sentence tokenization, any failure triggers a download of the punkt tokenizer data
     try:
         nltk.sent_tokenize("This is a test. Split")
     except:
         nltk.download("punkt")
     # Create and run application
+    # Workflow configuration is read from the "workflows" directory
+    app = Application("workflows")
     app.run()