Spaces:

douwekiela
/

dadc

Runtime error

App Files Files Community

Tristan Thrush commited on Jul 16, 2022

Commit

bce177f

1 Parent(s): e91bd7c

added hit-to-huggingface dataset code. cleaned everything up

Browse files

Files changed (4) hide show

README.md +30 -0
app.py +52 -20
collect.py +20 -9
requirements.txt +2 -1

README.md CHANGED Viewed

@@ -11,3 +11,33 @@ license: bigscience-bloom-rail-1.0
 ---
 A basic example of dynamic adversarial data collection with a Gradio app.

 ---
 A basic example of dynamic adversarial data collection with a Gradio app.
+*Instructions for someone to use for their own project:*
+**Setting up the Space**
+1. Clone this repo and deploy it on your own Hugging Face space.
+2. Add one of your Hugging Face tokens (it needs write access) to the secrets
+   for your space, with the name `HF_TOKEN`. Then create an empty Hugging Face
+   dataset on the hub and put the URL of this dataset in the secrets for your
+   space, with the name `DATASET_REPO_URL`. It can be a private or public
+   dataset. When you run this space on mturk as described in the next section,
+   the app will use your token to automatically store new HITs in your dataset.
+**Running Data Collection**
+1. In your local clone of this repo, copy `config.py.example` to a new file
+   called `config.py`. Then put the keys from your AWS account in `config.py`.
+   These keys should be for an AWS account that has the
+   AmazonMechanicalTurkFullAccess permission. You also need to
+   create an mturk requester account associated with your AWS account.
+2. Run `python collect.py` locally. If you run it with the `--live_mode` flag,
+   it launches HITs on mturk, using the app you deployed on the space as the
+   data collection UI and backend. NOTE: this means that you will need to pay
+   real workers. If you don't use the `--live_mode` flag, then it will run the
+   HITs on mturk sandbox, which is identical to the normal mturk, but just for
+   testing. You can create a worker account and go to the sandbox version to
+   test your HIT.
+**Profit**
+You should now see HITs arriving in your Hugging Face dataset
+automatically!

app.py CHANGED Viewed

@@ -1,13 +1,24 @@
 # Basic example for doing model-in-the-loop dynamic adversarial data collection
 # using Gradio Blocks.
 import random
 from urllib.parse import parse_qs
 import gradio as gr
 import requests
 from transformers import pipeline
 pipe = pipeline("sentiment-analysis")
 demo = gr.Blocks()
@@ -16,7 +27,7 @@ with demo:
     total_cnt = 2 # How many examples per HIT
     dummy = gr.Textbox(visible=False)  # dummy for passing assignmentId
-    # We keep track of state as a Variable
     state_dict = {"assignmentId": "", "cnt": 0, "fooled": 0, "data": [], "metadata": {}}
     state = gr.JSON(state_dict, visible=False)
@@ -47,6 +58,9 @@ with demo:
         toggle_example_submit = gr.update(visible=not done)
         new_state_md = f"State: {state['cnt']}/{total_cnt} ({state['fooled']} fooled)"
         query = parse_qs(dummy[1:])
         state["assignmentId"] = query["assignmentId"][0]
@@ -64,33 +78,51 @@ with demo:
     with gr.Column(visible=False) as final_submit:
         submit_hit_button = gr.Button("Submit HIT")
-    # Submit state to MTurk backend for ExternalQuestion
-    # Update the URL below to switch from Sandbox to real data collection
-    def _submit(state, dummy):
-        query = parse_qs(dummy[1:])
-        assert "assignmentId" in query, "No assignment ID provided, unable to submit"
-        state["assignmentId"] = query["assignmentId"][0]
-        url = f"https://workersandbox.mturk.com/mturk/externalSubmit?assignmentId={state['assignmentId']}&colorChoice=blue"
-        x = requests.post(url)
-        return str(x) + " With assignmentId " + state["assignmentId"] + "\n" + x.text, state, dummy
     # Button event handlers
     submit_ex_button.click(
         _predict,
         inputs=[text_input, label_input, state, dummy],
         outputs=[label_output, text_output, state, example_submit, final_submit, state_display, dummy],
-        _js="function(text_input, label_input, state, dummy) { console.log(text_input); console.log(label_input); console.log(state); console.log(dummy); return [text_input, label_input, state, window.location.search]; }",
     )
-    def _something(state):
-        print(state)
-        return state
     submit_hit_button.click(
-        _something,
         inputs=[state],
-        outputs=[state],
-        _js="function(state) { console.log(state); const form = document.createElement('form'); form.action='https://workersandbox.mturk.com/mturk/externalSubmit'; form.method='post'; for (const key in state) {const hiddenField = document.createElement('input'); hiddenField.type = 'hidden'; hiddenField.name = key; hiddenField.value = state[key]; form.appendChild(hiddenField)}; document.body.appendChild(form); console.log(state); console.log(form); form.submit(); return [state];}",
     )
-demo.launch(share=True)

 # Basic example for doing model-in-the-loop dynamic adversarial data collection
 # using Gradio Blocks.
+import os
 import random
 from urllib.parse import parse_qs
 import gradio as gr
 import requests
 from transformers import pipeline
+from huggingface_hub import Repository
+# These variables are for storing the mturk HITs in a Hugging Face dataset.
+DATA_FILENAME = "data.jsonl"
+DATA_FILE = os.path.join("data", DATA_FILENAME)
+DATASET_REPO_URL = os.environ.get(DATASET_REPO_URL)
+HF_TOKEN = os.environ.get("HF_TOKEN")
+repo = Repository(
+    local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=HF_TOKEN
+)
+# Now let's run the app!
 pipe = pipeline("sentiment-analysis")
 demo = gr.Blocks()
     total_cnt = 2 # How many examples per HIT
     dummy = gr.Textbox(visible=False)  # dummy for passing assignmentId
+    # We keep track of state as a JSON
     state_dict = {"assignmentId": "", "cnt": 0, "fooled": 0, "data": [], "metadata": {}}
     state = gr.JSON(state_dict, visible=False)
         toggle_example_submit = gr.update(visible=not done)
         new_state_md = f"State: {state['cnt']}/{total_cnt} ({state['fooled']} fooled)"
+        # We need to store the assignmentId in the state before submit_hit_button
+        # is clicked. We can do this here in _predict, which is called before
+        # submit_hit_button is clicked
         query = parse_qs(dummy[1:])
         state["assignmentId"] = query["assignmentId"][0]
     with gr.Column(visible=False) as final_submit:
         submit_hit_button = gr.Button("Submit HIT")
+    # Store the HIT data into a Hugging Face dataset.
+    # The HIT is also stored and logged on mturk when post_hit_js is run below.
+    # This _store_in_huggingface_dataset function just demonstrates how easy it is
+    # to automatically create a Hugging Face dataset from mturk.
+    def _store_in_huggingface_dataset(state, dummy):
+        with open(DATA_FILE, "a") as jsonlfile:
+            jsonlfile.write(json.dumps(state))
+        repo.push_to_hub()
     # Button event handlers
+    get_window_location_search_js = """
+        function(text_input, label_input, state, dummy) {
+            return [text_input, label_input, state, window.location.search];
+        }
+        """
     submit_ex_button.click(
         _predict,
         inputs=[text_input, label_input, state, dummy],
         outputs=[label_output, text_output, state, example_submit, final_submit, state_display, dummy],
+        _js=get_window_location_search_js,
     )
+    post_hit_js = """
+        function(state) {
+            const form = document.createElement('form');
+            form.action = 'https://workersandbox.mturk.com/mturk/externalSubmit';
+            form.method = 'post';
+            for (const key in state) {
+                const hiddenField = document.createElement('input');
+                hiddenField.type = 'hidden';
+                hiddenField.name = key;
+                hiddenField.value = state[key];
+                form.appendChild(hiddenField)
+            };
+            document.body.appendChild(form);
+            form.submit();
+        }
+        """
     submit_hit_button.click(
+        _store_in_huggingface_dataset,
         inputs=[state],
+        outputs=None,
+        _js=post_hit_js,
     )
+demo.launch()

collect.py CHANGED Viewed

@@ -5,20 +5,33 @@ import boto3
 from boto.mturk.question import ExternalQuestion
 from config import MTURK_KEY, MTURK_SECRET
-MTURK_REGION = "us-east-1"
-MTURK_SANDBOX = "https://mturk-requester-sandbox.us-east-1.amazonaws.com"
 mturk = boto3.client(
     "mturk",
     aws_access_key_id=MTURK_KEY,
     aws_secret_access_key=MTURK_SECRET,
     region_name=MTURK_REGION,
-    endpoint_url=MTURK_SANDBOX,
 )
-# The + in the URL makes the Space easily embeddable in an iframe
-question = ExternalQuestion("https://hf.space/embed/Tristan/dadc/+?__theme=light",
     frame_height=600
 )
@@ -27,7 +40,7 @@ new_hit = mturk.create_hit(
     Description="Hello",
     Keywords="fool the model",
     Reward="0.15",
-    MaxAssignments=1,
     LifetimeInSeconds=172800,
     AssignmentDurationInSeconds=600,
     AutoApprovalDelayInSeconds=14400,
@@ -35,8 +48,6 @@ new_hit = mturk.create_hit(
 )
 print(
-    "Sandbox link: https://workersandbox.mturk.com/mturk/preview?groupId="
     + new_hit["HIT"]["HITGroupId"]
 )
-print("Hit Id:", new_hit["HIT"]["HITId"])

 from boto.mturk.question import ExternalQuestion
 from config import MTURK_KEY, MTURK_SECRET
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("--mturk_region", default="us-east-1", help="The region for mturk (default: us-east-1)")
+parser.add_argument("--space_name", default="Tristan/dadc", help="Name of the accompanying Hugging Face space (default: Tristan/dadc)")
+parser.add_argument("--num_assignments", type=int, default=5, help="The number of times that the HIT can be accepted and completed.")
+parser.add_argument("--live_mode", action="store_true", help="""
+    Whether to run in live mode with real turkers. This will charge your account money.
+    If you don't use this flag, the HITs will be deployed on the sandbox version of mturk,
+    which will not charge your account money.
+    """
+)
+args = parser.parse_args()
+MTURK_URL = f"https://mturk-requester{"" if args.live_mode else "-sandbox"}.{args.mturk_region}.amazonaws.com"
 mturk = boto3.client(
     "mturk",
     aws_access_key_id=MTURK_KEY,
     aws_secret_access_key=MTURK_SECRET,
     region_name=MTURK_REGION,
+    endpoint_url=MTURK_URL,
 )
+# This is the URL that makes the space embeddable in an mturk iframe
+question = ExternalQuestion(f"https://hf.space/embed/{args.space_name}/+?__theme=light",
     frame_height=600
 )
     Description="Hello",
     Keywords="fool the model",
     Reward="0.15",
+    MaxAssignments=args.num_assignments,
     LifetimeInSeconds=172800,
     AssignmentDurationInSeconds=600,
     AutoApprovalDelayInSeconds=14400,
 )
 print(
+    f"Link: https://worker{"" if args.live_mode else "sandbox"}.mturk.com/mturk/preview?groupId="
     + new_hit["HIT"]["HITGroupId"]
 )

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ requests
 torch
 transformers
 gradio
-boto3

 torch
 transformers
 gradio
+boto3
+huggingface_hub