Hu Xu
committed on
Commit
·
b94cb82
1
Parent(s):
51d9840
Add application file
Browse files- app.py +44 -0
- metaclip/__pycache__/balancing.cpython-310.pyc +0 -0
- metaclip/__pycache__/substr_matching.cpython-310.pyc +0 -0
- metaclip/balancing.py +17 -0
- metaclip/entry_counts_400m.json +0 -0
- metaclip/substr_matching.py +31 -0
- metadata.json +0 -0
app.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# Per-entry counts as a NumPy uint64 array, aligned index-for-index with
# `metadata`; stays None until init_demo() has run.
entry_count = None
# Metadata entries loaded from metadata.json; stays None until init_demo() has run.
metadata = None


def init_demo():
    """Load the metadata entries and their counts into the module globals.

    Reads ``metadata.json`` and ``metaclip/entry_counts_400m.json`` relative
    to the current working directory, then sets:

    * ``metadata`` to the parsed content of ``metadata.json`` (iterated as a
      sequence of entries; presumably a list of strings -- confirm against
      the data file);
    * ``entry_count`` to a ``uint64`` array holding, for each metadata entry
      in order, the count stored under that entry in the counts file.

    Raises:
        OSError: if either file cannot be opened.
        ValueError: if either file is not valid JSON.
        KeyError: if a metadata entry has no count in the counts file.
    """
    import json
    import numpy as np

    global metadata, entry_count

    # Pin the encoding: JSON files are UTF-8, and relying on the locale
    # default breaks non-ASCII entries on platforms where it is not UTF-8.
    with open("metadata.json", encoding="utf-8") as f:
        metadata = json.load(f)

    # entry counts for our 1.6B(pool) -> 400M(curated); please check
    # balance_sampling:main and substr match and count on your own data.
    with open("metaclip/entry_counts_400m.json", encoding="utf-8") as f:
        entry_count_json = json.load(f)

    # uint64 to be safe for scaling.
    entry_count = np.array(
        [entry_count_json[entry] for entry in metadata], dtype=np.uint64
    )
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def curation(text):
    """Report how the balancing step would treat ``text``.

    Matches ``text`` against the loaded metadata entries, derives a per-entry
    sampling probability ``t / max(count, t)`` from the module-level
    ``entry_count`` array, and draws one random keep/drop decision.
    ``init_demo()`` must have been called first so that ``metadata`` and
    ``entry_count`` are populated.

    Args:
        text: the input text to match against the metadata entries.

    Returns:
        A string of the form ``curation_prob=<p>, curated=<bool>``, where
        ``<p>`` is the sum of the matched entries' probabilities capped at
        1.0 and ``<bool>`` is the outcome of one ``balance_sampling`` draw.
    """
    import sys

    # Register the working directory only once; an unconditional append on
    # every request makes sys.path grow without bound.
    if "./" not in sys.path:
        sys.path.append("./")
    from metaclip.substr_matching import substr_matching
    from metaclip.balancing import balance_sampling

    t = 20000  # TODO: make this part of the UI

    # Raise counts below t up to t in a local copy. Clipping the shared
    # global in place would leave it permanently altered, so a later call
    # with a smaller t could never see the original counts.
    clipped_count = entry_count.clip(min=t)
    entry_prob = t / clipped_count

    matched_entry_ids = substr_matching(text, metadata)
    # NOTE(review): this is the sum of per-entry probabilities capped at 1.0,
    # not the exact chance that balance_sampling returns True -- confirm this
    # is the quantity intended for display.
    curation_prob = min(entry_prob[matched_entry_ids].sum(), 1.0)
    curated = balance_sampling(matched_entry_ids, entry_prob)

    return f"curation_prob={curation_prob:.3f}, curated={curated}"
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Populate the module-level `metadata` and `entry_count` at import time so
# that curation() can read them.
init_demo()

# Gradio interface with one text input and one text output, backed by curation().
demo = gr.Interface(fn=curation, inputs="text", outputs="text")

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(show_api=False)
|
metaclip/__pycache__/balancing.cpython-310.pyc
ADDED
|
Binary file (445 Bytes). View file
|
|
|
metaclip/__pycache__/substr_matching.cpython-310.pyc
ADDED
|
Binary file (798 Bytes). View file
|
|
|
metaclip/balancing.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import numpy as np
|
| 5 |
+
import os
|
| 6 |
+
import random
|
| 7 |
+
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def balance_sampling(matched_entry_ids, entry_prob):
    """Randomly decide whether a text is kept, given the entries it matched.

    One ``random.random()`` draw is made per matched entry, in order, and the
    text is kept as soon as a draw falls below that entry's probability; no
    further draws are made after the first success.

    Args:
        matched_entry_ids: iterable of indices into ``entry_prob``.
        entry_prob: indexable collection of per-entry probabilities.

    Returns:
        True if any draw succeeds, otherwise False (always False when
        ``matched_entry_ids`` is empty).
    """
    # this can be placed in a pipeline or on-the-fly in a data loader.
    # see a numpy impl. at metaclip.indexing.balance_sampling.balance_sampling
    return any(
        random.random() < entry_prob[entry_id] for entry_id in matched_entry_ids
    )
|
metaclip/entry_counts_400m.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
metaclip/substr_matching.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
spaced_metadata = None
|
| 5 |
+
|
| 6 |
+
# Translation table applied by spacing(): each listed punctuation mark is
# wrapped in spaces and each tab / newline / carriage return becomes a single
# space. Built once at import so every call is one pass over the text.
_SPACING_TABLE = str.maketrans(
    {
        **{punct: f" {punct} " for punct in (",", ".", ";", ":", "?", "!", "`")},
        **{char: " " for char in ("\t", "\n", "\r")},
    }
)


def spacing(text):
    """Pad ``text`` and isolate its punctuation with spaces.

    The text is wrapped in one leading and one trailing space, every
    occurrence of ``, . ; : ? ! `` and the backtick is surrounded by spaces,
    and tabs, newlines and carriage returns are replaced by a space.

    Args:
        text: the text to normalise.

    Returns:
        The spaced text as a new string.
    """
    return f" {text} ".translate(_SPACING_TABLE)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# The metadata object that the cached `spaced_metadata` list was built from.
_spaced_metadata_source = None


def substr_matching(text, metadata):
    """Return the ids of the metadata entries that occur in ``text``.

    Each entry is padded with one space on each side and searched for as a
    substring of ``spacing(text)``. The padded entries are cached in the
    module-level ``spaced_metadata`` and rebuilt only when a different
    ``metadata`` object is passed, so a later call with other metadata no
    longer reuses a stale cache.

    Args:
        text: the text to search in.
        metadata: iterable of entries; an entry's position is its id.

    Returns:
        A list of matching entry ids in ascending order.
    """
    global spaced_metadata, _spaced_metadata_source
    if spaced_metadata is None or _spaced_metadata_source is not metadata:
        spaced_metadata = [f" {entry} " for entry in metadata]
        # Keep a reference to the source so the identity check stays valid.
        _spaced_metadata_source = metadata
    text = spacing(text)
    return [
        entry_id
        for entry_id, entry in enumerate(spaced_metadata)
        if entry in text
    ]
|
metadata.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|