ptrdvn commited on
Commit
04b7749
·
verified ·
1 Parent(s): 829b861

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - am
5
+ - ar
6
+ - bg
7
+ - bn
8
+ - cs
9
+ - da
10
+ - de
11
+ - el
12
+ - en
13
+ - es
14
+ - fa
15
+ - fi
16
+ - fr
17
+ - gu
18
+ - ha
19
+ - hi
20
+ - hu
21
+ - id
22
+ - it
23
+ - ja
24
+ - jv
25
+ - kn
26
+ - ko
27
+ - lt
28
+ - mr
29
+ - nl
30
+ - 'no'
31
+ - yo
32
+ - zh
33
+ - pl
34
+ - pt
35
+ - ro
36
+ - ru
37
+ - sk
38
+ - sv
39
+ - sw
40
+ - ta
41
+ - te
42
+ - th
43
+ - tr
44
+ - uk
45
+ - ur
46
+ - vi
47
+ - tl
48
+ ---
49
+
50
+ # Shitsu
51
+
52
+ <p align="center">
53
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/64b63f8ad57e02621dc93c8b/Lkw-M8a-AAfJiC81DobNl.jpeg" alt="A logo of a Shih Tzu reading a book" width="400"/>
54
+ </p>
55
+
56
+ A text scorer which scores text based on the amount of useful, textbook-like information in it.
57
+ It outputs a score generally between 0 and 1, although the score can fall outside these bounds as the model is a regressor.
58
+
59
+ Our model is based on fasttext embeddings, meaning that it can be used on large amounts of data with limited compute quickly.
60
+
61
+ This scorer can be used to filter useful information from large text corpora in many languages.
62
+
63
+ This model can also be found on [Github](https://github.com/lightblue-tech/shitsu).
64
+
65
+ # How to use
66
+
67
+ ### With our scorer package
68
+
69
+ ```bash
70
+ pip install git+https://github.com/lightblue-tech/shitsu.git
71
+ ```
72
+
73
+ ```python
74
+ from shitsu import ShitsuScorer
75
+
76
+ text_list = [
77
+ "Photosynthesis is a system of biological processes by which photosynthetic organisms, such as most plants, algae, and cyanobacteria, convert light energy, typically from sunlight, into the chemical energy necessary to fuel their metabolism.",
78
+ "Congratulations! You have all been selected to receive a free gift card worth $1000. Click on this link [Link] to claim your reward now. Limited time offer, so act fast! Don't miss out on this amazing opportunity."]
79
+
80
+ # Choose a language from one of: ['am', 'ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'gu', 'ha', 'hi', 'hu', 'id', 'it', 'ja', 'jv', 'kn', 'ko', 'lt', 'mr', 'nl', 'no', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'sw', 'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'yo', 'zh']
81
+ language_code = "en"
82
+ scorer = ShitsuScorer(language_code)
83
+ scores = scorer.score(text_list)
84
+ scores
85
+ # array([ 0.9897383 , -0.08109612], dtype=float32)
86
+ ```
87
+
88
+ ### Without our scorer package (i.e. without pip install)
89
+
90
+ <details>
91
+ <summary>Show full code</summary>
92
+
93
+ ```python
94
+
95
+ from safetensors.torch import load_model
96
+ import fasttext
97
+ from huggingface_hub import hf_hub_download
98
+ from tqdm.auto import tqdm
99
+ import torch
100
+ import numpy as np
102
+ import torch.nn as nn
103
+
104
+ class FasttextEmbedRegressor(nn.Module):
105
+ def __init__(self, input_size=300):
106
+ super(FasttextEmbedRegressor, self).__init__()
107
+ layer_1_size = 64
108
+ layer_2_size = 32
109
+ self.fc1 = nn.Linear(input_size, layer_1_size)
110
+ self.fc2 = nn.Linear(layer_1_size, layer_2_size)
111
+ self.fc3 = nn.Linear(layer_2_size, 1)
112
+
113
+ def forward(self, x):
114
+ x = torch.relu(self.fc1(x))
115
+ x = torch.relu(self.fc2(x))
116
+ x = self.fc3(x)
117
+ return x
118
+
119
+ class ShitsuScorer:
120
+ def __init__(self, lang_code):
121
+ fasttext_model_path = hf_hub_download(repo_id=f"facebook/fasttext-{lang_code}-vectors", filename="model.bin")
122
+ self.fasttext_model = fasttext.load_model(fasttext_model_path)
123
+ self.regressor_model = FasttextEmbedRegressor().eval()
124
+ regressor_model_path = hf_hub_download(repo_id=f"lightblue/shitsu_text_scorer", filename=f"{lang_code}.safetensors")
125
+ load_model(self.regressor_model, regressor_model_path)
126
+
127
+ def score(self, text_list):
128
+ embeddings = np.stack([self.fasttext_model.get_sentence_vector(x.replace("\n", " ")) for x in tqdm(text_list)])
129
+ return self.regressor_model(torch.Tensor(embeddings)).detach().numpy().flatten()
130
+
131
+ text_list = [
132
+ "Photosynthesis is a system of biological processes by which photosynthetic organisms, such as most plants, algae, and cyanobacteria, convert light energy, typically from sunlight, into the chemical energy necessary to fuel their metabolism.",
133
+ "Congratulations! You have all been selected to receive a free gift card worth $1000. Click on this link [Link] to claim your reward now. Limited time offer, so act fast! Don't miss out on this amazing opportunity."]
134
+
135
+ scorer = ShitsuScorer("en")
136
+ scores = scorer.score(text_list)
137
+ scores
138
+ # array([ 0.9897383 , -0.08109612], dtype=float32)
139
+ ```
140
+
141
+ </details>
142
+ <br/>
143
+
144
+
145
+
146
+ # How we made the training data
147
+
148
+ We provided a sample of tens of thousands of texts from [MADLAD-400](https://huggingface.co/datasets/allenai/MADLAD-400) in various languages to a popular state-of-the-art LLM with the following system prompt:
149
+
150
+ ```python
151
+ system_message = """You are a text filtering AI model.
152
+ Your input is a piece of text.
153
+ Your output is a score of how likely the text is to appear in a useful {language} textbook, encyclopedia, or any other important document.
154
+
155
+ Output your score on a scale of 0-100, with 0 meaning that the text contains no useful {language} information and 100 meaning that the text is very useful and is exceedingly likely to appear in a {language} textbook, encyclopedia, or any other important document. If the text is not mostly fluent, natural {language}, output 0.
156
+
157
+ Your output should be only an integer from 0-100."""
158
+ ```
159
+
160
+ This resulted in the dataset found at [lightblue/text_ratings](https://huggingface.co/datasets/lightblue/text_ratings).
161
+
162
+ We then trained a small neural network on top of fasttext's embeddings to predict these scores.
163
+
164
+ We chose the 44 languages in this dataset by making a union set of the 30 most popular languages on earth according to [Ethnologue 2024](https://www.ethnologue.com/insights/ethnologue200/) and the 30 most popular languages within MADLAD-400.
am.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:489522571130c58c649d6d6dd375ecba28a1ee42dd3357d6f77724424b04defe
3
+ size 85940
ar.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac733353ec859d1810b841f1d7915dbaa843de7b3dd8058033146b3d2410ef3d
3
+ size 85940
bg.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4452b85a7f0525833deaa4114e0dbc1790683a6980e8c10be649fcc65c5c9622
3
+ size 85940
bn.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:691f5e6ca3bf96398281089057618bfb9950a012bd6364c5a317ee872b7db2af
3
+ size 85940
cs.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:948c1c7fd49da5c3e20eb9a966cd6d22b507c5985934f264acbc1824ee7e70f6
3
+ size 85940
da.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d94153a85e7e02f8818f908cff6f38d648e48a457d2e889ea63b70d6b06edb3
3
+ size 85940
de.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:273e3dd597651aba2508a0c9c67e5cfd27cfa9e3f4479c7b3c8a61cb08d242e8
3
+ size 85940
el.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5764ea4dccac430bed7ab127a22093b6c1ec11b3c79bea247ac98e0d4fca0f1e
3
+ size 85940
en.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de4a3e3d282988d2ec3ef041838f7493d617b377bff89506c932ef91a4167aa5
3
+ size 85940
es.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ed420ffb402ce792fd082a6e463bf19af7d117f1d654508e0f870dae960520b
3
+ size 85940
fa.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97acc1c2337f24ee768496c5c1356c25164d5885fda2737ddf7c67751a6224ac
3
+ size 85940
fi.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eebd991819a5a0e4b6db4dcc3795a1daa83f8045b568550fd7a31b4ffdd04922
3
+ size 85940
fr.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0289d27fcd65a80b35268e20d99ca6ab531c05d2f1e814608aa3e2bca7a8cc84
3
+ size 85940
gu.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2256815704e4624c608f8718bf685887ca1e4644110c20f7f35c1857c72f9e94
3
+ size 85940
ha.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c615366f3399709ebec863060a4120cf3d330d0eb52c4547e1bb96865c56fb73
3
+ size 85940
hi.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76372b8c14e3160a34ec32346c9858ecd8ba0776e617bd4ca47c5553d1dd2dee
3
+ size 85940
hu.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d342ed20e03ad1d69db0e65e192411282299caf4930d2d46efa55531194efb8
3
+ size 85940
id.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a195de27c63f51661a7963e289ecb69c557ba4f9d605b93001347e2aaf3da8c8
3
+ size 85940
it.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32e562589fa29ee6b82bd3264d869479c8f01e19579ebb3acb01703c41a94873
3
+ size 85940
ja.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:645535860bb96b5533fc619e1705109db882ae4942bc4e6fc9341c1a61bd2f79
3
+ size 85940
jv.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77f4bcdba518f4fcad654600a6e8e80d9da6cc014693509e51fa94efda138478
3
+ size 85940
kn.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40733b980553f437b44cd8abd465549dcb398955dfae0afe0540492b35e28cb9
3
+ size 85940
ko.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6c875bfc3b17aae81007288d89731f44820e355e6920e7b5ba6dfc7ec2f7a21
3
+ size 85940
lt.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46aa6488501fdf098a78e71716de32f88e77b5e05d879e8a05c3796d50625d57
3
+ size 85940
mr.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73949c67f05cc10b4cb2a2a3d43f587fec28228044e2b010e351acefc49f4abe
3
+ size 85940
nl.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:253014d78d3513cc1fd8fa98a4eade5a3c1ab10f6a734fa6e04d5eb71224d056
3
+ size 85940
no.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3efc89d7ae4161faacbca7c772bc27a485d4738c597aa35d805106f9af04859
3
+ size 85940
pl.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ce49f5a81160f945139d10a4891cfb4584f24e8235666d5fc1ce026a98671ad
3
+ size 85940
pt.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33be636d1f94cc7611b327179aca34b615a6962b9967bac21c619fe0fa022a2b
3
+ size 85940
ro.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bf60d087b14f1f0c7c9add7d63b4f5feb01fa854a20c8b6be892de89b2e3156
3
+ size 85940
ru.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39742ac8cd7c34f500b68042eae26e5399f8b3b76a3b94d811ca9d42ec55a547
3
+ size 85940
sk.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c3e2b121f7ad1786f0c4ebed86f6fbcf711ff60c33112dba88819f1f14a90cd
3
+ size 85940
sv.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2e0f8c075d0ea131c70f3a09edcdc7f40a462b31925744eff72c903ac3b86b8
3
+ size 85940
sw.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b72bd1110ea970e0e8b086b99bb53eb308178524545c6d1aaca59e7e9d0916b
3
+ size 85940
ta.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2218afceb03907685fb9b5faeff6a6bff1316a47f004deeeb85640ba791876f1
3
+ size 85940
te.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:885877a1795b41b3548c246a65fc779f1e6f3bc25059f634d72e1eadcf99b356
3
+ size 85940
th.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa81c5582a38c0d5455a1396e58c8949c89554bc97c926d8c39e88fc2dc9fce4
3
+ size 85940
tl.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8553139fc548428902a8992f0dc5713ab091c212f263da7bb33b8007810a76c5
3
+ size 85940
tr.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d52fb2a6d7aa68c80bda9d2d07acd12da684a97e3cbbcbe985cb3e856c0c0040
3
+ size 85940
training.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import pandas as pd
3
+ import fasttext
4
+ import numpy as np
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm.auto import tqdm, trange
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.optim as optim
10
+ from sklearn.metrics import mean_squared_error
11
+ import requests
12
+ import gzip
13
+ import shutil
14
+ import os
15
+ from safetensors.torch import save_model
16
+ import matplotlib.pyplot as plt
17
+
18
class FasttextEmbedRegressor(nn.Module):
    """A small feed-forward regressor over fasttext sentence embeddings.

    Maps an ``input_size``-dimensional embedding (300 for the standard
    ``cc.*`` fasttext vectors) through two hidden ReLU layers of 64 and
    32 units down to a single scalar score.
    """

    def __init__(self, input_size=300):
        super().__init__()
        hidden_one, hidden_two = 64, 32
        # Attribute names fc1/fc2/fc3 must not change: they define the
        # state_dict keys stored in the published *.safetensors files.
        self.fc1 = nn.Linear(input_size, hidden_one)
        self.fc2 = nn.Linear(hidden_one, hidden_two)
        self.fc3 = nn.Linear(hidden_two, 1)

    def forward(self, x):
        """Return a (batch, 1) tensor of raw regression scores."""
        hidden = torch.relu(self.fc2(torch.relu(self.fc1(x))))
        return self.fc3(hidden)
32
+
33
def train_regressor(X_train, X_test, y_train, y_test, train_epochs):
    """Train a fresh FasttextEmbedRegressor on embedding/score pairs.

    Args:
        X_train, X_test: 2-D arrays of fasttext sentence embeddings.
        y_train, y_test: 1-D sequences of target scores (roughly 0-1).
        train_epochs: number of full passes over the training data.

    Returns:
        Tuple of ``(metrics_df, model)`` where ``metrics_df`` records the
        held-out MSE and the running-average train loss every 10 steps.
    """
    # Initialize the model, loss function, and optimizer.
    input_size = X_train.shape[1]
    model = FasttextEmbedRegressor(input_size)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    batch_size = 32

    training_metrics = []

    for epoch in trange(train_epochs):
        model.train()
        train_losses = []
        for step_num, i in enumerate(trange(0, X_train.shape[0], batch_size)):
            vectors = torch.Tensor(X_train[i:i + batch_size])
            targets = torch.Tensor(y_train[i:i + batch_size])
            optimizer.zero_grad()
            # squeeze(-1), not squeeze(): a trailing batch of size 1 must
            # keep shape (1,) so the MSE loss does not silently broadcast a
            # 0-dim output against a 1-element target vector.
            outputs = model(vectors).squeeze(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_losses.append(float(loss))
            if step_num % 10 == 0:
                # Periodic held-out evaluation; flatten (N, 1) preds to (N,).
                model.eval()
                test_preds = model(torch.Tensor(X_test)).detach().numpy().flatten()
                test_mse = mean_squared_error(y_test, test_preds)
                training_metrics.append({
                    "epoch": epoch,
                    "step_num": step_num,
                    "i": i,
                    "test_mse": test_mse,
                    "train_loss": sum(train_losses) / len(train_losses),
                })
                # Reset the window so train_loss averages only recent steps.
                train_losses = []
                model.train()

    return pd.DataFrame(training_metrics), model
71
+
72
def download_file(url, filename):
    """Stream a (possibly large) file from `url` to `filename`.

    Shows a byte-level progress bar while downloading. Returns `filename`.

    Raises:
        requests.HTTPError: on a non-2xx response (instead of silently
            writing an error page to disk, as the previous version did).
    """
    chunk_size = 1024
    # Context manager ensures the HTTP connection is released even on error.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # Content-Length may be absent (chunked encoding); tqdm accepts None.
        total = r.headers.get("Content-Length")
        with open(filename, "wb") as f, tqdm(unit="B", total=int(total) if total else None) as pbar:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)
    return filename
85
+
86
def get_filename(lang_code):
    """Return the local filename of a language's fasttext binary model."""
    # PEP 8 (E731): a named function, not a lambda bound to a name.
    return f"cc.{lang_code}.300.bin"
87
+
88
def download_fasttext_vectors(lang_code):
    """Ensure the fasttext vectors for `lang_code` exist locally.

    Downloads ``cc.<lang>.300.bin.gz`` from the official fasttext CDN and
    un-gzips it next to the script, unless the unpacked file already exists.

    Returns:
        None if the file was already cached, True after a fresh download.
    """
    filename = get_filename(lang_code)

    if os.path.isfile(filename):
        return None

    # The gzip archive name mirrors the unpacked model name.
    gz_filename = f"{filename}.gz"

    print(f"Downloading {lang_code} vectors")
    download_file(f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{gz_filename}", gz_filename)

    print(f"Unzipping {lang_code} vectors")
    with gzip.open(gz_filename, "rb") as f_in:
        with open(filename, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    print(f"Removing zipped {lang_code} vectors")
    os.remove(gz_filename)

    return True
106
+
107
def create_quality_eval_model(lang_code, train_epochs=10):
    """Train and save a text-quality scorer for one language.

    Downloads the fasttext vectors for `lang_code`, embeds the rated texts
    from lightblue/text_ratings, trains a FasttextEmbedRegressor on an
    80/20 split, saves it to ``<lang_code>.safetensors``, and removes the
    (large) fasttext binary afterwards.

    Returns:
        Tuple of (metrics_df, test_df): training metrics over time, and a
        per-example frame of test texts with gold and predicted scores.
    """
    download_fasttext_vectors(lang_code)

    dataset = load_dataset("lightblue/text_ratings", lang_code, split="train")
    text_list = dataset["selected_chunk"]
    # Ratings are 0-100; rescale to roughly 0-1 for regression.
    label_float = [x / 100 for x in dataset["rating_float"]]

    # Use the shared filename helper (same path os.remove cleans up below).
    fasttext_model = fasttext.load_model(get_filename(lang_code))

    # Fasttext sentence vectors require single-line input, so strip newlines.
    embeddings = np.stack([fasttext_model.get_sentence_vector(
        x.replace("\n", " ")
    ) for x in tqdm(text_list)])

    X_train, X_test, y_train, y_test, text_train, text_test = train_test_split(
        embeddings,
        label_float,
        text_list,
        test_size=0.2,
        random_state=42
    )

    metrics_df, model = train_regressor(X_train, X_test, y_train, y_test, train_epochs)

    test_df = pd.DataFrame({
        "text": text_test,
        "gold_score": y_test,
        "pred_score": model(torch.Tensor(X_test)).detach().numpy().flatten()
    })

    save_model(model, f"{lang_code}.safetensors")

    # Free disk space: the fasttext binary is ~7 GB per language.
    os.remove(get_filename(lang_code))

    return metrics_df, test_df
142
+
143
if __name__ == '__main__':

    # NOTE(review): this list has 29 languages, while the README and the
    # published *.safetensors cover 44 (pl, pt, ro, ru, sk, sv, sw, ta, te,
    # th, tl, tr, uk, ur, vi are absent here) — presumably those were
    # trained in a separate run; confirm before re-running.
    langs = ['am', 'ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'gu', 'ha', 'hi', 'hu', 'id', 'it', 'ja', 'jv', 'kn', 'ko', 'lt', 'mr', 'nl', 'no', 'yo', 'zh']

    # Train and save one scorer per language, then plot smoothed (rolling
    # 50-step mean) train/test loss curves for a quick visual sanity check.
    for l in langs:
        print(l)
        metrics_df, test_df = create_quality_eval_model(l, train_epochs=5)
        print(l)
        metrics_df[["test_mse", "train_loss"]].rolling(50).mean().plot()
        plt.show()
uk.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f159f5d8f8c5b0119924aee84550f333380ec184667063be0b4476ad236b557
3
+ size 85940
ur.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e8df6db2d462135560360e394d831b0718c37989d149e2831ad42554389da0f
3
+ size 85940
vi.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6ce79064155b5468dab67fc1567cbfeae48e60812617c7ff19132bd9acedf2f
3
+ size 85940
yo.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8db79885d6f7ce795956c0dae5c0c23176799432734423246338ed6de2396460
3
+ size 85940
zh.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cef569e9b680e7a45421f4fbc82070a35e9e9317a4d5cbb399ba27b9fc6587ba
3
+ size 85940