Spaces:

activebus
/

MetaCLIP

Runtime error

Hu Xu committed on Dec 26, 2023

Commit

b94cb82

1 Parent(s): 51d9840

Add application file

Files changed (7) hide show

app.py ADDED Viewed

+import gradio as gr
# Per-entry counts as a NumPy uint64 array, index-aligned with `metadata`.
# Populated by init_demo().
entry_count = None
# Entries loaded from metadata.json. Populated by init_demo().
metadata = None


def init_demo():
    """Load the metadata entries and their counts into module globals.

    Reads ``metadata.json`` and ``metaclip/entry_counts_400m.json`` (both
    paths are relative to the current working directory) and fills in the
    globals ``metadata`` and ``entry_count``.  ``entry_count[i]`` is the
    value stored under the key ``metadata[i]`` in the counts file.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If either file is not valid JSON.
        KeyError: If an entry of ``metadata`` has no count in the counts
            file (assuming that file holds a JSON object).
    """
    import json

    import numpy as np

    global metadata, entry_count

    # JSON is UTF-8 by specification; pass the encoding explicitly so that
    # decoding does not depend on the platform's default locale encoding.
    with open("metadata.json", encoding="utf-8") as f:
        metadata = json.load(f)

    # entry counts for our 1.6B(pool) -> 400M(curated); please check
    # balance_sampling:main and substr match and count on your own data.
    with open("metaclip/entry_counts_400m.json", encoding="utf-8") as f:
        entry_count_json = json.load(f)

    # uint64 to be safe for scaling.
    entry_count = np.array(
        [entry_count_json[entry] for entry in metadata],
        dtype=np.uint64,
    )
def curation(text, t=20000):
    """Report how likely `text` is to be curated and one sampled decision.

    Args:
        text: The input text to match against the metadata entries.
        t: Count threshold.  Entries whose count is below `t` are treated
            as having count `t`, so their sampling probability is 1.0;
            every other entry gets probability ``t / count``.

    Returns:
        A string of the form ``"curation_prob=<p>, curated=<bool>"``.

    Relies on the module globals ``metadata`` and ``entry_count`` having
    been populated (see ``init_demo``).
    """
    import sys

    # Make the local `metaclip` package importable, but do not grow
    # sys.path by one entry on every call.
    if "./" not in sys.path:
        sys.path.append("./")
    from metaclip.substr_matching import substr_matching
    from metaclip.balancing import balance_sampling

    # Clamp into a fresh array instead of overwriting the shared global:
    # an in-place clamp would permanently raise the stored counts, which
    # gives wrong probabilities as soon as a smaller `t` is used later.
    clamped_count = entry_count.clip(min=t)
    entry_prob = t / clamped_count

    matched_entry_ids = substr_matching(text, metadata)

    # Sum of the matched entries' probabilities, capped at 1.0.
    # NOTE(review): if balance_sampling draws independently per entry, this
    # is an upper bound on the keep probability rather than the exact
    # 1 - prod(1 - p); confirm which one the UI is meant to show.
    curation_prob = min(entry_prob[matched_entry_ids].sum(), 1.0)
    curated = balance_sampling(matched_entry_ids, entry_prob)
    return f"curation_prob={curation_prob:.3f}, curated={curated}"
# Load metadata and entry counts once at import time so that every request
# handled by the interface reuses the same data.
init_demo()

# Minimal text-in / text-out interface around `curation`.
demo = gr.Interface(
    fn=curation,
    inputs="text",
    outputs="text",
)

if __name__ == "__main__":
    demo.launch(show_api=False)

metaclip/__pycache__/balancing.cpython-310.pyc ADDED Viewed

Binary file (445 Bytes). View file

metaclip/__pycache__/substr_matching.cpython-310.pyc ADDED Viewed

Binary file (798 Bytes). View file

metaclip/balancing.py ADDED Viewed

+# Copyright (c) Meta Platforms, Inc. and affiliates
+import json
+import numpy as np
+import os
+import random
+from tqdm import tqdm
def balance_sampling(matched_entry_ids, entry_prob):
    """Decide whether a text is kept, given the entries it matched.

    One uniform random number is drawn per matched entry, in order; the
    text is kept as soon as a draw falls below that entry's probability.

    Args:
        matched_entry_ids: Iterable of indices into `entry_prob`.
        entry_prob: Indexable collection of per-entry probabilities.

    Returns:
        True if any draw succeeds, False otherwise (including when
        `matched_entry_ids` is empty).
    """
    # this can be placed in a pipeline or on-the-fly in a data loader.
    # see a numpy impl. at metaclip.indexing.balance_sampling.balance_sampling
    # any() stops at the first success, so no further draws are made.
    return any(
        random.random() < entry_prob[idx] for idx in matched_entry_ids
    )

metaclip/entry_counts_400m.json ADDED Viewed

The diff for this file is too large to render. See raw diff

metaclip/substr_matching.py ADDED Viewed

+# Copyright (c) Meta Platforms, Inc. and affiliates
# Cache used by substr_matching(): the space-padded entries, plus the
# metadata object they were built from so a different metadata argument
# triggers a rebuild instead of silently reusing stale entries.
spaced_metadata = None
_spaced_metadata_source = None

# Single-pass translation table for spacing(): wrap each punctuation mark in
# spaces and turn tab / newline / carriage return into a plain space.
_SPACING_TABLE = str.maketrans(
    {
        **{punct: f" {punct} " for punct in ",.;:?!`"},
        **{char: " " for char in "\t\n\r"},
    }
)


def spacing(text):
    """Pad `text` so that whole-token matching can use substring search.

    The text is wrapped in one leading and one trailing space, each of
    ``, . ; : ? ! ``` is surrounded by spaces, and tab, newline and
    carriage return are replaced by a single space.

    Args:
        text: The raw input text.

    Returns:
        The padded text.
    """
    return f" {text} ".translate(_SPACING_TABLE)


def substr_matching(text, metadata):
    """Return the indices of the metadata entries that occur in `text`.

    An entry matches when it appears in the spaced text (see ``spacing``)
    delimited by a space on both sides.  Matching is case-sensitive.

    The padded entries are cached at module level and rebuilt whenever a
    different `metadata` object is passed.  The check is by identity, so
    mutating the same list in place does not refresh the cache.

    Args:
        text: The raw input text.
        metadata: Iterable of entry strings.

    Returns:
        A list of matching entry indices in ascending order.
    """
    global spaced_metadata, _spaced_metadata_source
    if spaced_metadata is None or _spaced_metadata_source is not metadata:
        spaced_metadata = [f" {entry} " for entry in metadata]
        _spaced_metadata_source = metadata

    text = spacing(text)
    return [
        entry_id
        for entry_id, entry in enumerate(spaced_metadata)
        if entry in text
    ]

metadata.json ADDED Viewed

The diff for this file is too large to render. See raw diff