---
license: apache-2.0
---
|
|
This repository contains basic classifiers and a BM25 index of Wikipedia used for data tooling research.
|
|
It uses the classifier from kenhktsui/llm-data-textbook-quality-fasttext-classifer-v1 (MIT) and TurkuNLP's register classifiers.
|
|
|
|
|
```
import os
import fasttext

# Download the four fastText models if they are not already present.
if not os.path.exists("expert_classify.ftz"):
    os.system("wget http://dl.turkunlp.org/register-labeling-model/fasttext_model.bin")
    os.system("wget https://huggingface.co/ontocord/riverbed/resolve/main/rj_model.bin")
    os.system("wget https://huggingface.co/kenhktsui/llm-data-textbook-quality-fasttext-classifer-v1/resolve/main/model_textbook_quality.bin")
    os.system("wget https://huggingface.co/ontocord/riverbed/resolve/main/expert_classify.ftz")

# fastText expects a single line of text, so strip newlines before predicting.
text = "Example document to classify.".replace("\n", " ")

### Red pajama filter. pred_label "__label__wiki" is data we do not wish to keep.
red_pajama_model = fasttext.load_model("rj_model.bin")
(pred_label, pred_prob) = red_pajama_model.predict(text)
# predict() returns a tuple of labels and an array of probabilities; take the top one.
pred_label, pred_prob = pred_label[0], pred_prob[0]
if pred_label == "__label__cc":
    pred_prob = 1 - pred_prob

### TurkuNLP register labeler: https://github.com/TurkuNLP/register-labeling
domain_model = fasttext.load_model("fasttext_model.bin")
(pred_label, pred_prob) = domain_model.predict(text)

### Pile domain such as github, arxiv, etc.
pile_model = fasttext.load_model("expert_classify.ftz")
(pred_label, pred_prob) = pile_model.predict(text)

### Textbook quality - e.g., "Textbooks Are All You Need"
textbook_model = fasttext.load_model("model_textbook_quality.bin")
(pred_label, pred_prob) = textbook_model.predict(text)
```
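As a rough illustration of how these pieces fit together, the sketch below wraps the four classifiers into a single scoring helper. The function name `score_document` and the dictionary keys are made up for this example and are not part of the repository.

```
def score_document(text):
    """Run all four classifiers over one document (illustrative helper, not part of the repo)."""
    # fastText expects newline-free input.
    text = text.replace("\n", " ")
    scores = {}

    # Probability that the document looks wiki-like under the red pajama filter.
    labels, probs = red_pajama_model.predict(text)
    scores["wiki_prob"] = 1 - probs[0] if labels[0] == "__label__cc" else probs[0]

    # TurkuNLP register label.
    labels, probs = domain_model.predict(text)
    scores["register"] = (labels[0], probs[0])

    # Pile domain label (github, arxiv, etc.).
    labels, probs = pile_model.predict(text)
    scores["pile_domain"] = (labels[0], probs[0])

    # Textbook-quality label.
    labels, probs = textbook_model.predict(text)
    scores["textbook"] = (labels[0], probs[0])

    return scores

print(score_document("Photosynthesis converts light energy into chemical energy in plants."))
```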
|
|
|
|
|
See the files here: https://huggingface.co/ontocord/riverbed/tree/main |
|
|
|
|
|
|
|
|
The repository also includes a small Whoosh search index of wikidata, useful as background knowledge for LLMs.
|
|
|
|
|
Installation:
|
|
```
import os

# Clone the repository containing the Whoosh index files and install whoosh if needed.
if not os.path.exists("./riverbed"):
    os.system("git clone https://huggingface.co/ontocord/riverbed")
os.system("pip install -q whoosh")

import whoosh.index as whoosh_index
from whoosh.qparser import QueryParser
from whoosh.analysis import StemmingAnalyzer, Filter

# Custom token filter used by the index's analyzer: lowercase each token and,
# for tokens longer than 5 characters, also emit a 5-character prefix as a crude stem.
class MyFilter(Filter):
    def __call__(self, tokens):
        for t in tokens:
            t.text = t.text.lower()
            if len(t.text) > 5:
                yield t
                t.text = t.text[:5]
            yield t

# Open the index and build the query parser only once (e.g., across notebook re-runs).
try:
    qp
except NameError:
    bm25_dir = "./riverbed"
    index = whoosh_index.open_dir(bm25_dir)
    searcher = index.searcher()
    qp = QueryParser("content", schema=index.schema)
```
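Once the index is open, querying follows the standard Whoosh pattern. A minimal sketch, assuming the indexed documents store their text under the "content" field used by the query parser above (the query string is just an example):

```
# Parse a free-text query against the "content" field and print the stored fields of the top hits.
query = qp.parse("history of the solar system")
results = searcher.search(query, limit=5)
for hit in results:
    print(hit.fields())
```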