import kenlm
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
|
|
|
|
# Number of sentences to sample before writing the output TSV.
TOTAL_SENTENCES = 20000


def pp(log_score, length):
    """Turn a kenlm total log10 score into a perplexity.

    Args:
        log_score: Sum of log10 token probabilities as returned by
            ``kenlm.Model.score``.
        length: Token count used to average the score (must be > 0).

    Returns:
        The perplexity, ``10 ** (-log_score / length)``.
    """
    avg_neg_log10 = -log_score / length
    return 10.0 ** avg_neg_log10
|
|
|
|
# Sentence embedder used alongside the perplexity score.
embedder = "distiluse-base-multilingual-cased-v1"
embedder_model = SentenceTransformer(embedder)
# Probe the encoder once to learn the embedding dimensionality.
embedding_shape = embedder_model.encode(["foo"])[0].shape[0]

# KenLM language model for Spanish; scores are log10 probabilities.
model = kenlm.Model("es.arpa.bin")
# Stream mC4-es rather than downloading the full corpus.
mc4 = load_dataset("mc4", "es", streaming=True)

count = 0
embeddings = []
lengths = []  # fixed typo: was "lenghts"
perplexities = []
sentences = []

done = False
# total=416057992 is the published size of the mc4-es train split (tqdm only).
for sample in tqdm(mc4["train"].shuffle(buffer_size=100_000), total=416057992):
    for line in sample["text"].split("\n"):
        count += 1
        log_score = model.score(line)
        # +1 accounts for the end-of-sentence token kenlm scores implicitly.
        length = len(line.split()) + 1
        embeddings.append(embedder_model.encode([line])[0].tolist())
        perplexities.append(pp(log_score, length))
        lengths.append(length)
        sentences.append(line)
        # ">=" is safer than "==" should the counter ever skip the target.
        if count >= TOTAL_SENTENCES:
            done = True
            break
    if done:
        emb_matrix = np.array(embeddings)
        df = pd.DataFrame(
            {"sentence": sentences, "length": lengths, "perplexity": perplexities}
        )
        # One column per embedding dimension, named dim_0 ... dim_{D-1}.
        for dim in range(embedding_shape):
            df[f"dim_{dim}"] = emb_matrix[:, dim]
        # index=False (was index=None) is the idiomatic way to drop the index.
        df.to_csv("mc4-es-perplexity-sentences.tsv", index=False, sep="\t")
        print("DONE!")
        break
|
|