Initial commit

Browse files

Files changed (9) hide show

.gitignore +1 -0
.gitmodules +3 -0
__init__.py +0 -0
data +1 -0
extract_ass.py +16 -0
model.py +284 -0
model_consts.py +9 -0
train.py +31 -0
utils.py +254 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/

.gitmodules ADDED Viewed

	@@ -0,0 +1,3 @@

+[submodule "data"]
+	path = data
+	url = git@hf.co:datasets/metricv/metricsubs-segmenter

__init__.py ADDED Viewed

File without changes

data ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit f8f1b533b09e44d6b885dd9931a9a56f8f8ce319

extract_ass.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import ass
+import os
+import sys
def main(argv):
    """Print the stripped text of every dialogue event styled "英" in an ASS file.

    Args:
        argv: Command-line argument vector; ``argv[1]`` is the subtitle path.

    Returns:
        Process exit status: 0 on success, 2 when the path argument is missing.
    """
    if len(argv) < 2:
        program = argv[0] if argv else "extract_ass.py"
        print(f"usage: {program} <subtitle.ass>", file=sys.stderr)
        return 2
    # The utf-8-sig codec drops a leading byte-order mark when one is present.
    with open(argv[1], "r", encoding='utf-8-sig') as fin:
        doc = ass.parse(fin)
    for event in doc.events:
        # Only Dialogue events whose style name is exactly "英" are emitted.
        if isinstance(event, ass.Dialogue) and event.style == "英":
            print(event.text.strip())
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

model.py ADDED Viewed

	@@ -0,0 +1,284 @@

+from typing import Any
+import torch
+from torch import nn
+from torch.utils.data import Dataset, DataLoader
+import numpy as np
+from os import listdir
+from os.path import isfile, join
+if __package__ == None or __package__ == "":
+    from utils import tag_training_data, get_upenn_tags_dict, parse_tags
+else:
+    from .utils import tag_training_data, get_upenn_tags_dict, parse_tags
+# Model Type 1: LSTM with 1-logit lookahead.
class SegmentorDataset(Dataset):
    """In-memory dataset of ``(sequence, label)`` pairs stored as float tensors.

    Each element of ``datapoints`` is a pair ``(k, t)``: ``k`` is a NumPy array
    that becomes a float tensor, ``t`` is a scalar that becomes a one-element
    float tensor.
    """

    def __init__(self, datapoints):
        converted = []
        for features, label in datapoints:
            feature_tensor = torch.from_numpy(features).float()
            label_tensor = torch.tensor([label]).float()
            converted.append((feature_tensor, label_tensor))
        self.datapoints = converted

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.datapoints)

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` tensors stored at ``idx``."""
        entry = self.datapoints[idx]
        return entry[0], entry[1]
class RNN(nn.Module):
    """Stacked LSTM followed by a linear layer that emits one value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        device: Device the parameters are created on. When ``None``, ``"cuda"``
            is used if available, otherwise ``"cpu"``.
    """

    def __init__(self, input_size, hidden_size, num_layers, device=None):
        super().__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # Create the parameters directly on the selected device, as the
        # bidirectional models in this file do, so the module is usable without
        # a separate ``.to(device)`` call.
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, device=self.device)
        self.fc = nn.Linear(hidden_size, 1, device=self.device)

    def forward(self, x):
        """Map a ``(batch, seq, input_size)`` tensor to a ``(batch, 1)`` tensor."""
        # With no initial state given, nn.LSTM starts from zeros, which is what
        # the explicit h0/c0 tensors used to provide. Leaving them out avoids a
        # device mismatch when the module has been moved after construction.
        out, _ = self.rnn(x)
        # Only the output of the last time step feeds the linear layer.
        return self.fc(out[:, -1, :])
+# Model 2: Bidirectional LSTM with entire sequence context (hopefully)
class SegmentorDatasetDirectTag(Dataset):
    """One datapoint per ``.txt`` file in a directory, with one-hot encoded inputs.

    Every file is run through ``tag_training_data`` and ``parse_tags``; the
    resulting token-index and break-label sequences are kept as NumPy arrays.
    """

    def __init__(self, document_root: str):
        self.tags_dict = get_upenn_tags_dict()
        self.datapoints = []
        # Identity matrix used as a lookup table: row i is the one-hot vector
        # of tag index i.
        self.eye = np.eye(len(self.tags_dict))
        for entry in listdir(document_root):
            if not entry.endswith(".txt"):
                continue
            fname = join(document_root, entry)
            print(f"Loaded datafile: {fname}")
            token_ids, break_labels = parse_tags(tag_training_data(fname))
            self.datapoints.append((np.array(token_ids), np.array(break_labels)))

    def __len__(self):
        """Return the number of loaded files."""
        return len(self.datapoints)

    def __getitem__(self, idx):
        """Return ``(one_hot_inputs, labels)`` as float tensors for file ``idx``."""
        item = self.datapoints[idx]
        one_hot = self.eye[item[0]]
        return torch.from_numpy(one_hot).float(), torch.from_numpy(item[1]).float()
+# The same dataset without one-hot embedding of the input.
class SegmentorDatasetNonEmbed(Dataset):
    """One datapoint per ``.txt`` file in a directory, with inputs as tag indices.

    Every file is run through ``tag_training_data`` and ``parse_tags``; the
    inputs are returned as integer indices rather than one-hot vectors.
    """

    def __init__(self, document_root: str):
        self.datapoints = []
        for entry in listdir(document_root):
            if not entry.endswith(".txt"):
                continue
            fname = join(document_root, entry)
            print(f"Loaded datafile: {fname}")
            token_ids, break_labels = parse_tags(tag_training_data(fname))
            self.datapoints.append((np.array(token_ids), np.array(break_labels)))

    def __len__(self):
        """Return the number of loaded files."""
        return len(self.datapoints)

    def __getitem__(self, idx):
        """Return ``(token_indices, labels)``: an int tensor and a float tensor."""
        item = self.datapoints[idx]
        indices = torch.from_numpy(item[0]).int()
        labels = torch.from_numpy(item[1]).float()
        return indices, labels
class BidirLSTMSegmenter(nn.Module):
    """Bidirectional LSTM that scores every position of a sequence in (0, 1).

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden width of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        device: Device the parameters are created on. When ``None``, ``"cuda"``
            is used if available, otherwise ``"cpu"``.
    """

    def __init__(self, input_size, hidden_size, num_layers, device = None):
        super().__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True, device=self.device)
        # Both directions are concatenated, hence 2 * hidden_size inputs.
        self.fc = nn.Linear(2 * hidden_size, 1, device=self.device)
        self.final = nn.Sigmoid()

    def forward(self, x):
        """Map ``(batch, seq, input_size)`` to per-position scores ``(batch, seq)``."""
        # With no initial state given, nn.LSTM starts from zeros, which is what
        # the explicit h0/c0 tensors used to provide. Leaving them out keeps the
        # forward pass working after the module is moved with ``.to(...)``.
        out, _ = self.rnn(x)
        # The linear layer acts on the last dimension, giving (batch, seq, 1);
        # indexing drops the singleton dimension.
        out_fced = self.fc(out)[:, :, 0]
        return self.final(out_fced)
class BidirLSTMSegmenterWithEmbedding(nn.Module):
    """Embedding layer plus bidirectional LSTM scoring every position in (0, 1).

    Args:
        input_size: Number of distinct input indices (embedding table rows).
        embedding_size: Width of each embedding vector.
        hidden_size: Hidden width of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        device: Device the parameters are created on. When ``None``, ``"cuda"``
            is used if available, otherwise ``"cpu"``.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, device = None):
        super().__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(input_size, embedding_dim=embedding_size, device=self.device)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True, bidirectional=True, device=self.device)
        # Both directions are concatenated, hence 2 * hidden_size inputs.
        self.fc = nn.Linear(2 * hidden_size, 1, device=self.device)
        self.final = nn.Sigmoid()

    def forward(self, x):
        """Map integer indices ``(batch, seq)`` to per-position scores ``(batch, seq)``."""
        embedded = self.embedding(x)
        # With no initial state given, nn.LSTM starts from zeros, which is what
        # the explicit h0/c0 tensors used to provide. Leaving them out keeps the
        # forward pass working after the module is moved with ``.to(...)``.
        out, _ = self.rnn(embedded)
        # The linear layer acts on the last dimension, giving (batch, seq, 1);
        # indexing drops the singleton dimension.
        out_fced = self.fc(out)[:, :, 0]
        return self.final(out_fced)
def collate_fn_padd(batch):
    """Collate variable-length ``(input, tags)`` pairs into padded batch tensors.

    Args:
        batch: Sequence of ``(input_tensor, tag_tensor)`` pairs whose first
            dimension may differ from sample to sample.

    Returns:
        ``(padded_inputs, padded_tags)``, each stacked batch-first and padded
        to the longest sample using ``pad_sequence``'s default padding value.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    sequences = []
    labels = []
    for sample in batch:
        sequences.append(sample[0])
        labels.append(sample[1])
    return (pad(sequences, batch_first=True), pad(labels, batch_first=True))
def get_dataloader(dataset: SegmentorDataset, batch_size):
    """Wrap ``dataset`` in a shuffling ``DataLoader`` that pads each batch.

    Batches are assembled with ``collate_fn_padd``, so samples of different
    lengths can share a batch.
    """
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn_padd,
    )
    return loader
def train_model(
    model: RNN,
    dataset,
    lr=1e-3,
    num_epochs=3,
    batch_size=100,
):
    """Train ``model`` in place with MSE loss and the AdamW optimizer.

    Args:
        model: Module to optimize; its ``device`` attribute decides where the
            batches are moved.
        dataset: Dataset handed to ``get_dataloader``.
        lr: Learning rate for AdamW.
        num_epochs: Number of passes over the data.
        batch_size: Number of samples per batch.

    Progress is printed for every 100th batch of each epoch, starting with the
    first.
    """
    device = model.device
    loader = get_dataloader(dataset, batch_size=batch_size)
    steps_per_epoch = len(loader)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    for epoch_number in range(1, num_epochs + 1):
        for step, batch in enumerate(loader):
            features = batch[0].to(device)
            targets = batch[1].to(device)

            predictions = model(features)
            loss = loss_fn(predictions, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % 100 == 0:
                print(f"Epoch [{epoch_number}/{num_epochs}], Step [{step+1}/{steps_per_epoch}], Loss [{loss.item():.4f}]")
def train_bidirlstm_model(
    model: BidirLSTMSegmenter,
    dataset: SegmentorDatasetDirectTag,
    lr=1e-3,
    num_epochs=3,
    batch_size=1,
):
    """Train the bidirectional segmenter in place with BCE loss and AdamW.

    Args:
        model: Module to optimize; its ``device`` attribute decides where the
            batches are moved.
        dataset: Dataset handed to ``get_dataloader``.
        lr: Learning rate for AdamW.
        num_epochs: Number of passes over the data.
        batch_size: Number of sequences per batch.

    Progress is printed for every 10th batch of each epoch, starting with the
    first.
    """
    # NOTE(review): the loss is taken over the whole padded batch, so with
    # batch_size > 1 the padding positions appear to contribute - confirm that
    # this is intended.
    device = model.device
    loader = get_dataloader(dataset, batch_size=batch_size)
    steps_per_epoch = len(loader)
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    for epoch_number in range(1, num_epochs + 1):
        for step, batch in enumerate(loader):
            features = batch[0].to(device)
            targets = batch[1].to(device)

            optimizer.zero_grad()
            predictions = model(features)
            loss = loss_fn(predictions, targets)
            loss.backward()
            optimizer.step()

            if step % 10 == 0:
                print(f"Epoch [{epoch_number}/{num_epochs}], Step [{step+1}/{steps_per_epoch}], Loss [{loss.item():.4f}]")
def train_bidirlstm_embedding_model(
    model: BidirLSTMSegmenterWithEmbedding,
    dataset: SegmentorDatasetNonEmbed,
    lr=1e-3,
    num_epochs=3,
    batch_size=1,
):
    """Train the embedding-based segmenter in place with BCE loss and AdamW.

    Args:
        model: Module to optimize; its ``device`` attribute decides where the
            batches are moved.
        dataset: Dataset handed to ``get_dataloader``.
        lr: Learning rate for AdamW.
        num_epochs: Number of passes over the data.
        batch_size: Number of sequences per batch.

    Progress is printed for every 10th batch of each epoch, starting with the
    first.
    """
    # NOTE(review): padded input positions carry index 0 and the loss is taken
    # over the whole padded batch, so with batch_size > 1 the padding appears
    # to contribute - confirm that this is intended.
    device = model.device
    loader = get_dataloader(dataset, batch_size=batch_size)
    steps_per_epoch = len(loader)
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    for epoch_number in range(1, num_epochs + 1):
        for step, batch in enumerate(loader):
            token_ids = batch[0].to(device)
            targets = batch[1].to(device)

            optimizer.zero_grad()
            predictions = model(token_ids)
            loss = loss_fn(predictions, targets)
            loss.backward()
            optimizer.step()

            if step % 10 == 0:
                print(f"Epoch [{epoch_number}/{num_epochs}], Step [{step+1}/{steps_per_epoch}], Loss [{loss.item():.4f}]")

model_consts.py ADDED Viewed

	@@ -0,0 +1,9 @@

+if __package__ == None or __package__ == "":
+    from utils import get_upenn_tags_dict
+else:
+    from .utils import get_upenn_tags_dict
# Model hyper-parameters.
# The input size is the number of entries in the tag dictionary.
input_size = len(get_upenn_tags_dict())
# The embedding vectors and the LSTM hidden state use the same width.
embedding_size = hidden_size = 128
num_layers = 2

train.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import torch
+import os
+if __package__ == None or __package__ == "":
+    from model import BidirLSTMSegmenter, SegmentorDatasetDirectTag, train_bidirlstm_model
+    from model import BidirLSTMSegmenterWithEmbedding, SegmentorDatasetNonEmbed, train_bidirlstm_embedding_model
+    from utils import get_upenn_tags_dict
+    from model_consts import input_size, embedding_size, hidden_size, num_layers
+    data_path = "data"
+else:
+    from .model import BidirLSTMSegmenter, SegmentorDatasetDirectTag, train_bidirlstm_model
+    from .model import BidirLSTMSegmenterWithEmbedding, SegmentorDatasetNonEmbed, train_bidirlstm_embedding_model
+    from .utils import get_upenn_tags_dict
+    from .model_consts import input_size, embedding_size, hidden_size, num_layers
+    data_path = "segmenter/data"
# Prefer the GPU, but fall back to the CPU so the script also runs on machines
# without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

if __name__ == "__main__":
    checkpoint_path = "segmenter.ckpt"
    dataset = SegmentorDatasetNonEmbed(data_path)
    model = BidirLSTMSegmenterWithEmbedding(input_size, embedding_size, hidden_size, num_layers, device)
    # Resume from an existing checkpoint; isfile() is already False for a
    # missing path, so no separate exists() check is needed.
    if os.path.isfile(checkpoint_path):
        print("Loading checkpoint. If you want to start from scratch, remove segmenter.ckpt.")
        # map_location lets a checkpoint written on one device be restored on
        # another, e.g. saved on a GPU and loaded on a CPU-only host.
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.to(device)
    train_bidirlstm_embedding_model(model, dataset, num_epochs=150, batch_size=2)
    torch.save(model.state_dict(), checkpoint_path)

utils.py ADDED Viewed

	@@ -0,0 +1,254 @@

+import nltk
+from nltk.tag import PerceptronTagger
+from stable_whisper.result import WordTiming
+import numpy as np
+import torch
def bind_wordtimings_to_tags(wt: list[WordTiming]):
    """Pair every word timing with the POS tags of the tokens it splits into.

    Each word is tokenized on its own so the number of tokens it contributes is
    known, but tagging runs once over the concatenated token stream.

    Args:
        wt: Word timings; only their ``word`` attribute is read.

    Returns:
        A list of ``(word_timing, tags)`` pairs in input order, where ``tags``
        is a tuple with one tag string per token of that word.
    """
    per_word_tokens = [nltk.word_tokenize(timing.word) for timing in wt]
    flat_tokens = [token for tokens in per_word_tokens for token in tokens]
    tagged = nltk.pos_tag(flat_tokens)

    bound = []
    cursor = 0
    for timing, tokens in zip(wt, per_word_tokens):
        span = tagged[cursor:cursor + len(tokens)]
        bound.append((timing, tuple(pair[1] for pair in span)))
        cursor += len(tokens)
    return bound
def embed_tag_list(tags: list[str]):
    """Return one one-hot row per tag, using the indices of the tag dictionary.

    Raises:
        KeyError: If a tag is not in the dictionary.
    """
    tag_to_index = get_upenn_tags_dict()
    indices = np.array([tag_to_index[name] for name in tags])
    # Rows of the identity matrix are the one-hot vectors.
    one_hot_table = np.eye(len(tag_to_index))
    return one_hot_table[indices]
def lookup_tag_list(tags: list[str]):
    """Return the tag-dictionary index of every tag as an integer array.

    Raises:
        KeyError: If a tag is not in the dictionary.
    """
    tag_to_index = get_upenn_tags_dict()
    indices = [tag_to_index[name] for name in tags]
    return np.array(indices, dtype=int)
def tag_training_data(filename: str):
    """POS-tag a pre-segmented text file and regroup the tags by source line.

    The non-blank lines are joined into one text so the tagger sees the full
    context. The tagged tokens are then handed back to the lines by consuming,
    for each line, the shortest run of tokens whose concatenation equals the
    line with its spaces removed.

    Args:
        filename: Path of a text file with one segment per line.

    Returns:
        A list with one entry per matched line, each a list of
        ``(token, tag)`` pairs. Lines that cannot be matched are reported on
        stdout and left out.
    """
    # NOTE(review): the file is read with the platform default encoding -
    # confirm that the data files match it, or pass an explicit encoding.
    with open(filename, "r") as f:
        segmented_lines = [line.strip() for line in f if line.strip() != ""]
    # Regain the full text for more accurate tagging.
    full_text = " ".join(segmented_lines)
    remaining = nltk.pos_tag(nltk.word_tokenize(full_text))
    reconstructed_tags = []
    for line in segmented_lines:
        target = line.replace(" ", "")
        match_len = None
        consumed_chars = 0
        for count, pair in enumerate(remaining, start=1):
            consumed_chars += len(pair[0])
            if consumed_chars < len(target):
                continue
            # The prefix only grows, so the first prefix that reaches the
            # target length is the only one that can match.
            if consumed_chars == len(target):
                if "".join(p[0] for p in remaining[:count]) == target:
                    match_len = count
            # Stop here whether or not it matched; the previous ``continue``
            # kept scanning the already-shortened token list.
            break
        if match_len is None:
            print("Panic. Cannot match further.")
            print(f"Was trying to match: {line}")
            print(remaining)
            continue
        reconstructed_tags.append(remaining[:match_len])
        remaining = remaining[match_len:]
    return reconstructed_tags
def get_upenn_tags_dict():
    """Build the mapping from tag string to integer index.

    The tag set is the union of the values in the perceptron tagger's
    ``tagdict`` and the tags listed below. Tags are indexed in sorted order,
    and the extra ``"BREAK"`` marker takes the last index.
    """
    # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    penn_treebank_tags = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"]
    known_tags = set(PerceptronTagger().tagdict.values())
    known_tags.update(penn_treebank_tags)
    ordered_tags = sorted(known_tags)
    ordered_tags.append("BREAK")
    return {tag: index for index, tag in enumerate(ordered_tags)}
def parse_tags(reconstructed_tags):
    """
        Parse reconstructed tags into an input/tag datapoint.
        In the original plan, this type of output is suitable for a bidirectional LSTM.
        Input:
            reconstructed_tags:
                Tagged segments, from tag_training_data()
                Example: [
                    [('You', 'PRP'), ("'re", 'VBP'), ('back', 'RB'), ('again', 'RB'), ('?', '.')],
                    [('You', 'PRP'),("'ve", 'VBP'), ('been', 'VBN'), ('consuming', 'VBG'), ('a', 'DT'), ('lot', 'NN'), ('of', 'IN'), ('tech', 'JJ'), ('news', 'NN'), ('lately', 'RB'), ('.', '.')]
                    ...
                ]
        Output:
            (input_tokens, output_tag)
            input_tokens:
                A sequence of tag indices, one per token.
                Example: [25, 38, 27, 27, 6, 25, 38, 37, 36, 10, 19, 13, 14, 19, 27, 6]
            output_tag:
                A sequence of 0 and 1, indicating whether a break should be inserted AFTER each location.
                Example: [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
        Empty segments contribute nothing; in particular an empty first
        segment no longer raises IndexError.
    """
    tags_dict = get_upenn_tags_dict()
    input_tokens = []
    output_tag = []
    for segment in reconstructed_tags:
        if not segment:
            # There is no token to attach a break to.
            continue
        input_tokens.extend(tags_dict[pair[1]] for pair in segment)
        # Only the last token of a segment is followed by a break.
        output_tag.extend([0] * (len(segment) - 1))
        output_tag.append(1)
    return input_tokens, output_tag
def embed_segments(tagged_segments):
    """One-hot encode tagged segments, with a BREAK row after every segment.

    Args:
        tagged_segments: Iterable of segments, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        ``(result_embedding, tags_dict)``: a 2-D array with one one-hot row per
        token plus one ``"BREAK"`` row after each segment, and the tag-to-index
        mapping used to build it. The array has zero rows when there are no
        segments.
    """
    # get_upenn_tags_dict() returns one dict that already maps tag -> index;
    # the former two-name unpacking of that dict always raised ValueError.
    tags_dict = get_upenn_tags_dict()
    classes = len(tags_dict)
    eye = np.eye(classes)
    break_row = np.array([eye[tags_dict["BREAK"]]])
    result_embedding = []
    for segment in tagged_segments:
        # dtype=int keeps an empty segment usable as an index array.
        targets = np.array([tags_dict[tag] for word, tag in segment], dtype=int)
        result_embedding.append(eye[targets])
        result_embedding.append(break_row)
    if not result_embedding:
        # np.concatenate rejects an empty list.
        return np.empty((0, classes)), tags_dict
    return np.concatenate(result_embedding), tags_dict
def window_embedded_segments_rnn(embeddings, tags_dict):
    """Build ``(prefix, label)`` datapoints from a one-hot sequence with BREAK rows.

    For every non-BREAK row at position ``i >= 1`` the datapoint is the prefix
    ``embeddings[:i]`` followed by row ``i``. If the row just before ``i`` is a
    BREAK row, that single row is dropped and the label is 1; otherwise the
    label is 0. Earlier BREAK rows stay in the prefix.

    Args:
        embeddings: 2-D array of one-hot rows.
        tags_dict: Tag-to-index mapping containing ``"BREAK"``.

    Returns:
        A list of ``(array, int)`` pairs.
    """
    break_vector = np.eye(len(tags_dict))[tags_dict["BREAK"]]

    def is_break(row):
        return (row == break_vector).all()

    datapoints = []
    for i in range(1, embeddings.shape[0]):
        current = embeddings[i]
        # A BREAK row is never a prediction target itself.
        if is_break(current):
            continue
        history = embeddings[:i]
        tag = 0
        if is_break(history[-1]):
            # A break sits right before this token: drop it and label 1.
            history = history[:-1]
            tag = 1
        window = np.concatenate((history, np.array([current])))
        datapoints.append((window, tag))
    return datapoints
def print_dataset(datapoints, tags_dict, tokenized_full_text):
    """Print every datapoint as its label followed by a prefix of the tokens.

    The prefix length is the number of non-BREAK rows in the datapoint's
    input, and the prefix is always taken from the start of
    ``tokenized_full_text``.

    Args:
        datapoints: Iterable of ``(input_rows, tag)`` pairs.
        tags_dict: Tag-to-index mapping containing ``"BREAK"``.
        tokenized_full_text: Token list the prefixes are sliced from.
    """
    break_vector = np.eye(len(tags_dict))[tags_dict["BREAK"]]
    for window, tag in datapoints:
        marker = "[1] " if tag == 1 else "[0] "
        print(marker, end='')
        word_count = sum(1 for row in window if not (row == break_vector).all())
        print(tokenized_full_text[:word_count])
+from stable_whisper.result import Segment # Just for typing
def get_indicies(segment: Segment, model, device, threshold):
    """Return the indices of the words in ``segment`` that the model marks as cut points.

    The words are POS-tagged, the tags are one-hot encoded and fed to ``model``
    as a single batch. A word is selected when the highest score among its
    tokens exceeds ``threshold``.

    Args:
        segment: Segment whose ``words`` are scored.
        model: Callable taking a ``(1, tokens, classes)`` float tensor and
            returning one score per token.
        device: Device the input tensor is moved to.
        threshold: Score above which a word is selected.

    Returns:
        A list of word indices, in ascending order.
    """
    tagged_wordtiming = bind_wordtimings_to_tags(segment.words)
    tag_list = [tag for twt in tagged_wordtiming for tag in twt[1]]
    tag_per_word = [len(twt[1]) for twt in tagged_wordtiming]
    if not tag_list:
        # Nothing to score; an empty tag list also cannot be one-hot encoded.
        return []
    embedded_tags = torch.from_numpy(embed_tag_list(tag_list)).float()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(embedded_tags[None, :].to(device))
    list_output = output.detach().cpu().numpy().tolist()[0]
    current_index = 0
    cut_indicies = []
    for index, tags_count in enumerate(tag_per_word):
        tags = list_output[current_index:current_index+tags_count]
        # A word that produced no tokens has no scores; max() would raise.
        if tags and max(tags) > threshold:
            cut_indicies.append(index)
        current_index += tags_count
    return cut_indicies
def get_indicies_autoembed(segment: Segment, model, device, threshold):
    """Return the indices of the words in ``segment`` that the model marks as cut points.

    Same procedure as the one-hot variant, except the model receives integer
    tag indices and is expected to embed them itself. A word is selected when
    the highest score among its tokens exceeds ``threshold``.

    Args:
        segment: Segment whose ``words`` are scored.
        model: Callable taking a ``(1, tokens)`` integer tensor and returning
            one score per token.
        device: Device the input tensor is moved to.
        threshold: Score above which a word is selected.

    Returns:
        A list of word indices, in ascending order.
    """
    tagged_wordtiming = bind_wordtimings_to_tags(segment.words)
    tag_list = [tag for twt in tagged_wordtiming for tag in twt[1]]
    tag_per_word = [len(twt[1]) for twt in tagged_wordtiming]
    if not tag_list:
        # Nothing to score for a segment without tokens.
        return []
    # One transfer to the device is enough.
    embedded_tags = torch.from_numpy(lookup_tag_list(tag_list)).int().to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(embedded_tags[None, :])
    list_output = output.detach().cpu().numpy().tolist()[0]
    current_index = 0
    cut_indicies = []
    for index, tags_count in enumerate(tag_per_word):
        tags = list_output[current_index:current_index+tags_count]
        # A word that produced no tokens has no scores; max() would raise.
        if tags and max(tags) > threshold:
            cut_indicies.append(index)
        current_index += tags_count
    return cut_indicies