Spaces:

riyadhrazzaq
/

text-compression-using-lm

Configuration error

App Files Files Community

riyadhrazzaq committed on Mar 12, 2024

Commit

d541e5a

0 Parent(s):

added inference scripts, model and vocab

Browse files

Files changed (12) hide show

.gitattributes +1 -0
.gitignore +161 -0
.idea/.gitignore +8 -0
main.py +20 -0
model_lr0.0001_bs256_epoch50.pt +3 -0
src/__init__.py +0 -0
src/evaluator.py +15 -0
src/model.py +81 -0
src/tokenizer.py +72 -0
src/util.py +88 -0
test.py +25 -0
vocab.pt +3 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.pt filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,161 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml

main.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import argparse
+from src.evaluator import evaluate
+def main():
+    # parser
+    parser = argparse.ArgumentParser(description='inference with model.')
+    parser.add_argument('--checkpoint', type=str, help='Path to the checkpoint file')
+    parser.add_argument("--decompress", action="store_true", help="decompress the input text")
+    parser.add_argument('--vocab', type=str, help='Path to the vocab file')
+    parser.add_argument('--text', type=str, help='Text to be tokenized')
+    args = parser.parse_args()
+    # load model and vocab
+    evaluate(args)
+if __name__ == "__main__":
+    main()

model_lr0.0001_bs256_epoch50.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31923ca96e3c2471ad6252dfb615b15cde784be5a7792c7379d1c9a9b27a7f4e
+size 551468733

src/__init__.py ADDED Viewed

File without changes

src/evaluator.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from src.model import Model2
+from src.tokenizer import Tokenizer
+from src.util import *
+def evaluate(args):
+    vocab = torch.load(args.vocab, map_location=torch.device('cpu'))
+    model = Model2(len(vocab), 300, 256, vocab['<PAD>'])
+    load_from_checkpoint(model, args.checkpoint)
+    print()
+    if args.decompress:
+        print(decompress(args.text, Tokenizer(vocab), model))
+    else:
+        print(compress(args.text, Tokenizer(vocab), model))

src/model.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import torch
+from torch import nn
+from src.util import device
+class Transpose(nn.Module):
+    def __init__(self, dim0=None, dim1=None):
+        super().__init__()
+        self.dim0 = dim0
+        self.dim1 = dim1
+    def forward(self, tensor):
+        if self.dim0 is None:
+            self.dim0 = tensor.dim() - 2
+            self.dim1 = tensor.dim() - 1
+        return torch.transpose(tensor, self.dim0, self.dim1)
+class Model2(nn.Module):
+    def __init__(
+        self,
+        vocab_size,
+        embedding_dim,
+        state_size,
+        pad_index,
+    ):
+        super().__init__()
+        self.state_size = state_size
+        self.pad_index = pad_index
+        self.embedding_layer = nn.Embedding(
+            num_embeddings=vocab_size,
+            embedding_dim=embedding_dim,
+            padding_idx=pad_index,
+        )
+        self.rnn_layer = nn.LSTMCell(input_size=embedding_dim, hidden_size=state_size)
+        self.lin1 = nn.Sequential(
+            nn.Linear(state_size, state_size * 4),
+            nn.ReLU(),
+            nn.Dropout(p=0.5),
+        )
+        self.lin2 = nn.Sequential(
+            nn.Linear(state_size * 4, state_size * 8),
+            Transpose(),
+            nn.BatchNorm1d(state_size * 8),
+            Transpose(),
+            nn.ReLU(),
+            nn.Dropout(p=0.5),
+        )
+        self.lin3 = nn.Sequential(
+            nn.Linear(state_size * 8, state_size * 16),
+            nn.ReLU(),
+            nn.Dropout(p=0.5),
+        )
+        self.lin4 = nn.Sequential(nn.Linear(state_size * 16, vocab_size))
+    def forward(self, X):
+        N, T = X.shape
+        non_pad_mask = X != self.pad_index
+        X = self.embedding_layer(X)
+        state = torch.zeros((N, self.state_size), device=device)
+        c = torch.zeros((N, self.state_size), device=device)
+        states = []
+        for t in range(T):
+            next_state, next_c = self.rnn_layer(X[:, t, :], (state, c))
+            # print(non_pad_mask[:, t].reshape(-1, 1).shape, next_state.shape, state.shape)
+            state = torch.where(non_pad_mask[:, t].reshape(-1, 1), next_state, state)
+            c = torch.where(non_pad_mask[:, t].reshape(-1, 1), next_c, c)
+            states.append(state)
+        # (N, T, states)
+        states = torch.stack(states, dim=1)
+        output = self.lin1(states)
+        output = self.lin2(output)
+        output = self.lin3(output)
+        output = self.lin4(output)
+        return output

src/tokenizer.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import os
+from typing import List, Tuple
+import numpy as np
+import torch
+from torchtext.vocab import Vocab
+from torch import nn, Tensor
+from src.util import device
+class Tokenizer(nn.Module):
+    def __init__(self, vocab: str | Vocab):
+        super().__init__()
+        # check vocab file exists
+        if isinstance(vocab, str):
+            assert os.path.exists(vocab)
+            self.vocab = torch.load(vocab, map_location=device)
+        else:
+            self.vocab = vocab
+        self.edge_index = vocab['<EDGE>']
+        self.pad_index = vocab['<PAD>']
+        self.unk_index = vocab['<UNK>']
+    def get_tensors(self, data):
+        """
+        Builds torch.Tensor from a variable length 2D python list. The return value is a tuple of two tensors, one for input and the other for output.
+        Parameters
+        ----------
+        data: Nested list of token indices
+            [[1,2,3],
+             [4,2,3,4,2],
+             [223,4,2]]
+            This example has three sentences.
+        """
+        max_len = max([len(datum) for datum in data]) + 1
+        N = len(data)
+        X = np.full((N, max_len), self.pad_index, np.int64)
+        Y = np.full((N, max_len), self.pad_index, np.int64)
+        for i in range(N):
+            # prepend the inputs with edge token
+            X[i, 0] = self.edge_index
+            for j in range(len(data[i])):
+                X[i, j + 1] = data[i][j]
+                Y[i, j] = data[i][j]
+            # finish the outputs with edge token
+            Y[i, j] = self.edge_index
+        return torch.tensor(X, device=device), torch.tensor(Y, device=device)
+    def forward(self, text: List[str]) -> Tuple[Tensor, Tensor]:
+        """
+        Tokenizes a list of natural text. The return value is a tensor of token ids.
+        Parameters
+        ----------
+        text: List[str]. A list of natural language strings.
+        Returns
+        -------
+        torch.Tensor. A tensor of token ids.
+        """
+        text = [sentence.split() for sentence in text]
+        tokenized = [self.vocab(sentence) for sentence in text]
+        return self.get_tensors(tokenized)

src/util.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import os
+import torch
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+def stringify(array):
+    return  '\n'.join([' '.join(inner_list) for inner_list in array])
+def compress(text, tokenizer, model):
+    """
+    tokenizer: Tokenizer.
+    text: str.
+        Each line represents a single document.
+    """
+    tokens = [sentence.split() for sentence in text.split("\n")]
+    indices, _ = tokenizer(text.split("\n"))
+    logits = model(indices)
+    next_token_predicted = logits.argmax(dim=2)
+    # slices are for skipping edge tokens
+    prediction_mask = indices[:, 1:] == next_token_predicted[:, :-1]
+    # replace correctly predicted tokens with "X"
+    for i, sentence_mask in enumerate(prediction_mask):
+        sentence_len = len(tokens[i])
+        for j, predicted_successfully in enumerate(sentence_mask):
+            # length check is to ignore pad tokens
+            if predicted_successfully and j < sentence_len and tokenizer.vocab[tokens[i][j]] != tokenizer.unk_index:
+                tokens[i][j] = "X"
+    sentences = [" ".join(sentence) for sentence in tokens]
+    document = "\n".join(sentences)
+    return document
+def decompress(text, tokenizer, model):
+    """
+    text: str.
+        Each line represents a single document.
+    """
+    sentence_tokens = [document.split() for document in text.split("\n")]
+    indices, _ = tokenizer(text.split("\n"))
+    uncompressed = []
+    for i, sentence in enumerate(sentence_tokens):
+        prefix = ['<EDGE>']
+        for j, token in enumerate(sentence):
+            if token != "X":
+                prefix.append(token)
+            else:
+                # only infer when X is found
+                indices = torch.tensor([tokenizer.vocab(prefix)],
+                                       dtype=torch.int,
+                                       device=device)
+                logits = model(indices)
+                # prediction logit for X
+                logit = logits[:, -1, :]
+                index = logit.argmax(dim=1)
+                prefix.append(tokenizer.vocab.lookup_token(index))
+        # reset prefix for new sentence
+        uncompressed.append(prefix[1:])
+    return stringify(uncompressed)
+def load_from_checkpoint(model, checkpoint_path):
+    """
+    Loads a model from a checkpoint.
+    Parameters:
+    ----------
+    checkpoint_path: The path to the checkpoint.
+    Raises:
+    ------
+    Exception: If no checkpoint is found in the provided path.
+    """
+    if os.path.exists(checkpoint_path):
+        checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
+        model.load_state_dict(checkpoint['model_state_dict'])
+        model.eval()
+        print(f"loaded existing model.")
+    else:
+        raise Exception("No checkpoint found in the provided path")

test.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from src.evaluator import evaluate
+import argparse
+# Smoke test: runs evaluate() once on the input text, then once more in decompress
+# mode on a hand-written compressed sample. Every option has a default, so the script
+# can be started without arguments.
+parser = argparse.ArgumentParser(description='inference test with model.')
+parser.add_argument('--checkpoint', type=str, help='Path to the checkpoint file', default='model_lr0.0001_bs256_epoch50.pt')
+parser.add_argument("--decompress", action="store_true", help="decompress the input text", default=False)
+parser.add_argument('--vocab', type=str, help='Path to the vocab file', default='vocab.pt')
+parser.add_argument('--text', type=str, help='Text to be tokenized', default="""dr. tonie mcdonald is a life long levittown resident who taught and rose through the ranks of the district she now leads .
+he received his ba in chemistry , magna cum laude , from amherst college in 1 9 8 1 .""")
+args = parser.parse_args()
+print("--- input ---")
+print(args.text)
+# compress
+# (evaluate compresses here unless --decompress was passed on the command line)
+print("--- compress ---")
+evaluate(args)
+# decompress
+print("--- decompress ---")
+# reuse the same namespace: switch the mode and swap in the compressed sample,
+# where each "X" stands for a token the model is expected to restore
+args.decompress = True
+args.text = """dr. tonie mcdonald is X life long levittown resident who taught and rose through X ranks of the district she now leads .
+he received his ba X chemistry X magna cum laude X from amherst college in X X 8 1 ."""
+evaluate(args)

vocab.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38847aa134accb833b3afc3204db2ce8650400907885a7efd3a1c541f58d3f0d
+size 133355