Spaces:

douwekiela
/

dadc

Runtime error

App Files Files Community

Tristan Thrush committed on Jul 18, 2022

Commit

829775d

1 Parent(s): e3e024d

bugfix

Browse files

Files changed (4) hide show

README.md +16 -9
app.py +13 -6
collect.py +21 -17
requirements.txt +6 -7

README.md CHANGED Viewed

@@ -20,8 +20,10 @@ A basic example of dynamic adversarial data collection with a Gradio app.
    name `HF_TOKEN`. Now, create an empty Hugging Face dataset on the hub. Put
    the url of this dataset in the secrets for your space, with the name
    `DATASET_REPO_URL`. It can be a private or public dataset. When you run this
-   space on mturk in the following lines, the app will use your token to
-   automatically store new hits to your dataset.
 **Running Data Collection**
 1. On your local repo that you pulled, create a copy of `config.py.example`,
@@ -29,15 +31,20 @@ A basic example of dynamic adversarial data collection with a Gradio app.
    These keys should be for an AWS account that has the
    AmazonMechanicalTurkFullAccess permission. You also need to
    create an mturk Requester account associated with your AWS account.
-2. Run `python collect.py` locally. If you run it with the `--live_mode` flag,
-   it launches HITs on mturk, using the app you deployed on the space as the
-   data collection UI and backend. NOTE: this means that you will need to pay
-   real workers. If you don't use the `--live_mode` flag, then it will run the
-   HITs on mturk sandbox, which is identical to the normal mturk, but just for
-   testing. You can create a worker account and go to the sandbox version to
-   test your HIT.
 **Profit**
 Now, you should be watching HITs come into your Hugging Face dataset
 automatically!

    name `HF_TOKEN`. Now, create an empty Hugging Face dataset on the hub. Put
    the url of this dataset in the secrets for your space, with the name
    `DATASET_REPO_URL`. It can be a private or public dataset. When you run this
+   space on mturk and when people visit your space on huggingface.co, the app
+   will use your token to automatically store new HITs in your dataset. NOTE:
+   if you push something to your dataset manually, you need to restart your space
+   or it could get merge conflicts when trying to push HIT data.
 **Running Data Collection**
 1. On your local repo that you pulled, create a copy of `config.py.example`,
    These keys should be for an AWS account that has the
    AmazonMechanicalTurkFullAccess permission. You also need to
    create an mturk Requester account associated with your AWS account.
+2. Run `python collect.py` locally.
 **Profit**
 Now, you should be watching HITs come into your Hugging Face dataset
 automatically!
+**Tips and Tricks**
+- If you are developing and running this space locally to test it out, try
+deleting the data directory that the app clones before running the app again.
+Otherwise, the app could get merge conflicts when storing new HITs on the hub.
+When you redeploy your app on Hugging Face spaces, the data directory is deleted
+automatically.
+- Hugging Face Spaces have limited computational resources and memory. If you
+run too many HITs and/or assignments at once, then you could encounter issues.
+You could also encounter issues if you are trying to create a dataset that is
+very large. Check the log of your space for any errors that could be happening.

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from huggingface_hub import Repository
 from dotenv import load_dotenv
 from pathlib import Path
 import json
 # These variables are for storing the mturk HITs in a Hugging Face dataset.
 if Path(".env").is_file():
@@ -92,11 +93,16 @@ with demo:
     # This _store_in_huggingface_dataset function just demonstrates how easy it is
     # to automatically create a Hugging Face dataset from mturk.
     def _store_in_huggingface_dataset(state):
-        with open(DATA_FILE, "a") as jsonlfile:
-            json_data_with_assignment_id =\
-                [json.dumps(dict({"assignmentId": state["assignmentId"]}, **datum)) for datum in state["data"]]
-            jsonlfile.write("\n".join(json_data_with_assignment_id) + "\n")
-        repo.push_to_hub()
         return state
     # Button event handlers
@@ -130,7 +136,7 @@ with demo:
                 };
                 document.body.appendChild(form);
                 form.submit();
-                return [state];
             } else {
                 // If there is no assignmentId, then we assume that the submitter is
                 // on huggingface.co and we can't submit a HIT to mturk. But
@@ -138,6 +144,7 @@ with demo:
                 // our dataset without an assignmentId. The following line here
                 // loads the app again so the user can enter in another "fake" HIT.
                 window.location.href = window.location.href;
             }
         }
         """

 from dotenv import load_dotenv
 from pathlib import Path
 import json
+from filelock import FileLock
 # These variables are for storing the mturk HITs in a Hugging Face dataset.
 if Path(".env").is_file():
     # This _store_in_huggingface_dataset function just demonstrates how easy it is
     # to automatically create a Hugging Face dataset from mturk.
     def _store_in_huggingface_dataset(state):
+        lock = FileLock(DATA_FILE + ".lock")
+        lock.acquire()
+        try:
+            with open(DATA_FILE, "a") as jsonlfile:
+                json_data_with_assignment_id =\
+                    [json.dumps(dict({"assignmentId": state["assignmentId"]}, **datum)) for datum in state["data"]]
+                jsonlfile.write("\n".join(json_data_with_assignment_id) + "\n")
+            repo.push_to_hub()
+        finally:
+            lock.release()
         return state
     # Button event handlers
                 };
                 document.body.appendChild(form);
                 form.submit();
+                return state;
             } else {
                 // If there is no assignmentId, then we assume that the submitter is
                 // on huggingface.co and we can't submit a HIT to mturk. But
                 // our dataset without an assignmentId. The following line here
                 // loads the app again so the user can enter in another "fake" HIT.
                 window.location.href = window.location.href;
+                return state;
             }
         }
         """

collect.py CHANGED Viewed

@@ -10,7 +10,8 @@ import argparse
 parser = argparse.ArgumentParser()
 parser.add_argument("--mturk_region", default="us-east-1", help="The region for mturk (default: us-east-1)")
 parser.add_argument("--space_name", default="Tristan/dadc", help="Name of the accompanying Hugging Face space (default: Tristan/dadc)")
-parser.add_argument("--num_assignments", type=int, default=5, help="The number of times that the HIT can be accepted and completed.")
 parser.add_argument("--live_mode", action="store_true", help="""
     Whether to run in live mode with real turkers. This will charge your account money.
     If you don't use this flag, the HITs will be deployed on the sandbox version of mturk,
@@ -35,19 +36,22 @@ question = ExternalQuestion(f"https://hf.space/embed/{args.space_name}/+?__theme
     frame_height=600
 )
-new_hit = mturk.create_hit(
-    Title="DADC with Gradio",
-    Description="Hello",
-    Keywords="fool the model",
-    Reward="0.15",
-    MaxAssignments=args.num_assignments,
-    LifetimeInSeconds=172800,
-    AssignmentDurationInSeconds=600,
-    AutoApprovalDelayInSeconds=14400,
-    Question=question.get_as_xml(),
-)
-print(
-    f"Link: https://worker{'' if args.live_mode else 'sandbox'}.mturk.com/mturk/preview?groupId="
-    + new_hit["HIT"]["HITGroupId"]
-)

 parser = argparse.ArgumentParser()
 parser.add_argument("--mturk_region", default="us-east-1", help="The region for mturk (default: us-east-1)")
 parser.add_argument("--space_name", default="Tristan/dadc", help="Name of the accompanying Hugging Face space (default: Tristan/dadc)")
+parser.add_argument("--num_hits", type=int, default=5, help="The number of HITs.")
+parser.add_argument("--num_assignments", type=int, default=1, help="The number of times that the HIT can be accepted and completed.")
 parser.add_argument("--live_mode", action="store_true", help="""
     Whether to run in live mode with real turkers. This will charge your account money.
     If you don't use this flag, the HITs will be deployed on the sandbox version of mturk,
     frame_height=600
 )
+for i in range(args.num_hits):
+    new_hit = mturk.create_hit(
+        Title="Beat the AI",
+        Description="Try to fool an AI by creating examples that it gets wrong",
+        Keywords="fool the model",
+        Reward="0.15",
+        MaxAssignments=args.num_assignments,
+        LifetimeInSeconds=172800,
+        AssignmentDurationInSeconds=600,
+        AutoApprovalDelayInSeconds=14400,
+        Question=question.get_as_xml(),
+    )
+    print(
+        f"HIT #{i} Link: https://worker{'' if args.live_mode else 'sandbox'}.mturk.com/mturk/preview?groupId="
+        + new_hit["HIT"]["HITGroupId"]
+    )
+    new_hits.append(new_hit)

requirements.txt CHANGED Viewed

@@ -1,7 +1,6 @@
-requests
-torch
-transformers
-gradio
-boto3
-huggingface_hub
-python-dotenv

+torch==1.12.0
+transformers==4.20.1
+gradio==3.0.26
+boto3==1.24.32
+huggingface_hub==0.8.1
+python-dotenv==0.20.0