Spaces:

SEA-AI
/

SEA-dog

Configuration error

App Files Files Community

kevinconka committed on Feb 20, 2024

Commit

273f5f9

1 Parent(s): a5d5a84

Save query + result in dataset

Browse files

Files changed (4) hide show

.gitignore +163 -1
app.py +35 -39
chatbot.py +30 -0
flagging.py +79 -0

.gitignore CHANGED Viewed

	@@ -1 +1,163 @@
1	- *.html

+*.html
+flagged/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

app.py CHANGED Viewed

@@ -1,53 +1,41 @@
 import urllib.request
-from langchain.chains import RetrievalQA
-from langchain_community.document_loaders import UnstructuredHTMLLoader
-from langchain_openai import OpenAIEmbeddings
-from langchain_openai import ChatOpenAI
-from langchain.text_splitter import CharacterTextSplitter
-from langchain_community.vectorstores import Chroma
-import gradio as gr
 # get the html data and save it to a file
 url = "https://sea.ai/faq"
-html = urllib.request.urlopen(url).read()
-with open("FAQ_SEA.AI.html", "wb") as f:
-    f.write(html)
-# load documents
-loader = UnstructuredHTMLLoader("FAQ_SEA.AI.html")
-documents = loader.load()
-# split the documents into chunks
-text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-texts = text_splitter.split_documents(documents)
-# select which embeddings we want to use
-embeddings = OpenAIEmbeddings()
-# create the vectorestore to use as the index
-db = Chroma.from_documents(texts, embeddings)
-# expose this index in a retriever interface
-retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})
-# create a chain to answer questions
-qa = RetrievalQA.from_chain_type(
-    llm=ChatOpenAI(),
-    chain_type="stuff",
-    retriever=retriever,
-    return_source_documents=True,
-    verbose=True,
-)
 def answer_question(message, history, system):
-    # unwind the history of last 2 messages
-    history = " ".join(f"{user} {bot}" for user, bot in history[-2:])
     # concatenate the history, message and system
-    query = " ".join([history, message, system])
     retrieval_qa = qa.invoke(query)
     result = retrieval_qa["result"]
     result = result.replace('"', "").strip()  # clean up the result
     # query = retrieval_qa["query"]
     # source_documents = retrieval_qa["source_documents"]
     return result
@@ -56,8 +44,11 @@ description = """
 <p align="center">
 I have memorized the entire SEA.AI FAQ page. Ask me anything about it! 🧠
 <br>
-You can modify my response by using the <code>SYSTEM</code> input under
-<code>Additional Inputs</code>.
 </p>
 """
@@ -70,7 +61,7 @@ h1 {
 theme = gr.themes.Default(primary_hue=gr.themes.colors.indigo)
-demo = gr.ChatInterface(
     answer_question,
     title=title,
     description=description,
@@ -81,7 +72,12 @@ demo = gr.ChatInterface(
     ],
     css=css,
     theme=theme,
-)
 if __name__ == "__main__":
     demo.launch()

 import urllib.request
+import gradio as gr
+from huggingface_hub import get_token
+from chatbot import get_retrieval_qa
+from flagging import myHuggingFaceDatasetSaver
 # get the html data and save it to a file
+def download_html(_url: str, _filename: str):
+    html = urllib.request.urlopen(_url).read()
+    with open(_filename, "wb") as f:
+        f.write(html)
 url = "https://sea.ai/faq"
+filename = "FAQ_SEA.AI.html"
+download_html(url, filename)
+# load the retrieval QA model
+qa = get_retrieval_qa(filename)
+# dataset callback
+dataset_name = "SEA-AI/seadog-chat-history"
+hf_writer = myHuggingFaceDatasetSaver(get_token(), dataset_name)
 def answer_question(message, history, system):
     # concatenate the history, message and system
+    query = " ".join([message, system])
     retrieval_qa = qa.invoke(query)
     result = retrieval_qa["result"]
     result = result.replace('"', "").strip()  # clean up the result
     # query = retrieval_qa["query"]
     # source_documents = retrieval_qa["source_documents"]
+    # save the query and result to the dataset
+    hf_writer.flag([query, result])
     return result
 <p align="center">
 I have memorized the entire SEA.AI FAQ page. Ask me anything about it! 🧠
 <br>
+I can't remember conversations yet, be patient with me.
+<br>
+DISCLAIMER: Your queries will be saved to
+<a href='https://huggingface.co/datasets/SEA-AI/seadog-chat-history'>this dataset</a>
+for analytics purposes.
 </p>
 """
 theme = gr.themes.Default(primary_hue=gr.themes.colors.indigo)
+with gr.ChatInterface(
     answer_question,
     title=title,
     description=description,
     ],
     css=css,
     theme=theme,
+) as demo:
+    # on page load, download the html and save it to a file
+    demo.load(lambda: download_html(url, filename))
+    # This needs to be called prior to the first call to callback.flag()
+    hf_writer.setup([demo.textbox, demo.chatbot], "flagged")
 if __name__ == "__main__":
     demo.launch()

chatbot.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from langchain.chains import RetrievalQA
+from langchain_community.document_loaders import UnstructuredHTMLLoader
+from langchain_openai import OpenAIEmbeddings
+from langchain_openai import ChatOpenAI
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+def get_retrieval_qa(filename):
+    # load documents
+    loader = UnstructuredHTMLLoader(filename)
+    documents = loader.load()
+    # split the documents into chunks
+    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+    texts = text_splitter.split_documents(documents)
+    # select which embeddings we want to use
+    embeddings = OpenAIEmbeddings()
+    # create the vectorestore to use as the index
+    db = Chroma.from_documents(texts, embeddings)
+    # expose this index in a retriever interface
+    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})
+    # create a chain to answer questions
+    return RetrievalQA.from_chain_type(
+        llm=ChatOpenAI(),
+        chain_type="stuff",
+        retriever=retriever,
+        return_source_documents=True,
+        verbose=True,
+    )

flagging.py ADDED Viewed

	@@ -0,0 +1,79 @@

+from collections import OrderedDict
+from pathlib import Path
+from typing import Any
+import gradio as gr
+from gradio.flagging import HuggingFaceDatasetSaver, client_utils
+import huggingface_hub
+class myHuggingFaceDatasetSaver(HuggingFaceDatasetSaver):
+    """
+    Custom HuggingFaceDatasetSaver to save images/audio to disk.
+    Gradio's implementation seems to have a bug.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+    def _deserialize_components(
+        self,
+        data_dir: Path,
+        flag_data: list[Any],
+        flag_option: str = "",
+        username: str = "",
+    ) -> tuple[dict[Any, Any], list[Any]]:
+        """Deserialize components and return the corresponding row for the flagged sample.
+        Images/audio are saved to disk as individual files.
+        """
+        # Components that can have a preview on dataset repos
+        file_preview_types = {gr.Audio: "Audio", gr.Image: "Image"}
+        # Generate the row corresponding to the flagged sample
+        features = OrderedDict()
+        row = []
+        for component, sample in zip(self.components, flag_data):
+            # Get deserialized object (will save sample to disk if applicable -file, audio, image,...-)
+            label = component.label or ""
+            save_dir = data_dir / client_utils.strip_invalid_filename_characters(label)
+            save_dir.mkdir(exist_ok=True, parents=True)
+            if isinstance(component, gr.Chatbot):
+                deserialized = sample  # dirty fix
+            else:
+                deserialized = component.flag(sample, save_dir)
+            # Add deserialized object to row
+            features[label] = {"dtype": "string", "_type": "Value"}
+            try:
+                assert Path(deserialized).exists()
+                row.append(str(Path(deserialized).relative_to(self.dataset_dir)))
+            except (AssertionError, TypeError, ValueError, OSError):
+                deserialized = "" if deserialized is None else str(deserialized)
+                row.append(deserialized)
+            # If component is eligible for a preview, add the URL of the file
+            # Be mindful that images and audio can be None
+            if isinstance(component, tuple(file_preview_types)):  # type: ignore
+                for _component, _type in file_preview_types.items():
+                    if isinstance(component, _component):
+                        features[label + " file"] = {"_type": _type}
+                        break
+                if deserialized:
+                    path_in_repo = str(
+                        # returned filepath is absolute, we want it relative to compute URL
+                        Path(deserialized).relative_to(self.dataset_dir)
+                    ).replace("\\", "/")
+                    row.append(
+                        huggingface_hub.hf_hub_url(
+                            repo_id=self.dataset_id,
+                            filename=path_in_repo,
+                            repo_type="dataset",
+                        )
+                    )
+                else:
+                    row.append("")
+        features["flag"] = {"dtype": "string", "_type": "Value"}
+        features["username"] = {"dtype": "string", "_type": "Value"}
+        row.append(flag_option)
+        row.append(username)
+        return features, row