File size: 6,459 Bytes
ed15737 2441c62 9f8e415 ed15737 2441c62 68e1a65 2441c62 376029b 2441c62 47450e4 2441c62 858fcb1 2441c62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
---
language:
- en
license: apache-2.0
---
# rrivera1849/LUAR-MUD
Author Style Representations using [LUAR](https://aclanthology.org/2021.emnlp-main.70.pdf).
The LUAR training and evaluation repository can be found [here](https://github.com/llnl/luar).
This model was trained on the Reddit Million User Dataset (MUD) found [here](https://aclanthology.org/2021.naacl-main.415.pdf).
## Usage
```python
from transformers import AutoModel, AutoTokenizer

# LUAR-MUD uses a custom model architecture, so loading it through the Auto*
# classes requires explicitly opting in with trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained("rrivera1849/LUAR-MUD", trust_remote_code=True)
model = AutoModel.from_pretrained("rrivera1849/LUAR-MUD", trust_remote_code=True)

# We embed `episodes`: each episode is a collection of documents presumed to
# come from a single author.
# NOTE: make sure that `episode_length` is consistent across all episodes,
# otherwise the reshape below cannot produce a rectangular tensor.
batch_size = 3
episode_length = 16
text = [
    ["Foo"] * episode_length,
    ["Bar"] * episode_length,
    ["Zoo"] * episode_length,
]
# Flatten (batch_size, episode_length) -> (batch_size * episode_length) so the
# tokenizer receives a flat list of documents.
text = [document for episode in text for document in episode]
tokenized_text = tokenizer(
    text,
    max_length=32,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

# inputs size: (batch_size, episode_length, max_token_length)
tokenized_text["input_ids"] = tokenized_text["input_ids"].reshape(batch_size, episode_length, -1)
tokenized_text["attention_mask"] = tokenized_text["attention_mask"].reshape(batch_size, episode_length, -1)
print(tokenized_text["input_ids"].size())       # torch.Size([3, 16, 32])
print(tokenized_text["attention_mask"].size())  # torch.Size([3, 16, 32])

# One embedding per episode (i.e. per author).
out = model(**tokenized_text)
print(out.size())  # torch.Size([3, 512])

# to get the Transformer attentions:
out, attentions = model(**tokenized_text, output_attentions=True)
print(attentions[0].size())  # torch.Size([48, 12, 32, 32])
```
## Usage (Batch)
Here's a more fleshed-out example showing how to run LUAR across many batches of data:
```python
import numpy as np
import torch
from termcolor import cprint
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
def generate_data(num_batches: int = 100, batch_size: int = 32, num_samples_per_author: int = 16):
    """
    Yield synthetic batches of text for exercising the pipeline.

    Args:
        num_batches (int): How many batches the generator produces in total.
        batch_size (int): How many authors each batch contains.
        num_samples_per_author (int): How many text samples each author has.

    Yields:
        list: One batch, as a list of `batch_size` lists that each hold
            `num_samples_per_author` copies of the same example string.
            Shape: (batch_size, num_samples_per_author)
    """
    sample = "This is an example string."
    # tqdm wraps the batch counter so iterating the generator shows progress.
    for _batch_index in tqdm(range(num_batches)):
        authors = []
        for _author_index in range(batch_size):
            # Every author gets their own list of identical sample strings.
            authors.append([sample] * num_samples_per_author)
        yield authors
def flatten(l):
    """
    Collapse one level of nesting: turn a list of lists into a single list.

    Args:
        l (list): The nested list whose sublists should be concatenated.

    Returns:
        list: A new list with the items of every sublist, in original order.
    """
    flat = []
    for sublist in l:
        # extend() appends each item of the sublist, preserving order.
        flat.extend(sublist)
    return flat
def main():
    """
    Embed dummy author batches with LUAR-MUD and report the result shape.

    Loads the model and tokenizer, moves the model to CUDA when it is
    available (CPU otherwise), runs every batch from `generate_data` through
    the model under `torch.inference_mode()`, and prints the shape of the
    concatenated outputs.
    """
    cprint("Starting LUAR-MUD example script...", 'magenta')

    # --- Model Loading ---
    cprint("Loading model 'rrivera1849/LUAR-MUD'...", 'blue')
    # trust_remote_code=True is required for custom model architectures like LUAR-MUD
    model = AutoModel.from_pretrained("rrivera1849/LUAR-MUD", trust_remote_code=True)
    model.eval()

    # Check for CUDA availability and move model to appropriate device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    cprint(f"Moving model to device: {device}", 'yellow')
    model.to(device)

    # --- Tokenizer Loading ---
    cprint("Loading tokenizer...", 'blue')
    tokenizer = AutoTokenizer.from_pretrained("rrivera1849/LUAR-MUD", trust_remote_code=True)

    # --- Configuration ---
    num_batches = 100
    batch_size = 32
    num_samples_per_author = 16
    max_length = 512

    cprint("\nConfiguration:", 'cyan')
    print(f" Batch Size: {batch_size}")
    print(f" Samples per Author: {num_samples_per_author}")
    print(f" Max Length: {max_length}")
    print(f" Device: {device}\n")

    all_outputs = []
    cprint("Starting inference loop...", 'green')

    batches = generate_data(
        num_batches=num_batches,
        batch_size=batch_size,
        num_samples_per_author=num_samples_per_author,
    )

    # context manager for disabling gradient calculation to save memory/compute
    with torch.inference_mode():
        for i, batch in enumerate(batches):
            if (i + 1) % 10 == 0:
                print(f" Processing batch {i + 1}...")

            # Take the author count from the batch itself rather than the
            # configured batch_size, so a smaller final batch (common with
            # real data) still reshapes correctly.
            num_authors = len(batch)

            # Flatten the batch structure for tokenization:
            # (num_authors, num_samples) -> (num_authors * num_samples)
            documents = flatten(batch)

            # Tokenize the flattened batch
            inputs = tokenizer(
                documents,
                return_tensors="pt",
                padding=True,
                max_length=max_length,
                truncation=True,
            )

            # Move inputs to the same device as the model
            inputs = inputs.to(device)

            # Reshape input_ids and attention_mask to match the model's expected 3D input:
            # (num_authors, num_samples_per_author, sequence_length)
            inputs["input_ids"] = inputs["input_ids"].reshape(num_authors, num_samples_per_author, -1)
            inputs["attention_mask"] = inputs["attention_mask"].reshape(num_authors, num_samples_per_author, -1)

            # Forward pass through the model
            outputs = model(**inputs)

            # Move outputs back to CPU and convert to numpy for storage
            all_outputs.append(outputs.cpu().numpy())

    # Concatenate all batch results into a single array
    # axis=0 corresponds to the batch dimension
    all_outputs = np.concatenate(all_outputs, axis=0)

    cprint("\nInference complete!", 'green')
    cprint(f"Final output shape: {all_outputs.shape}", attrs=['bold'])


if __name__ == "__main__":
    main()
```
## Citing & Authors
If you find this model helpful, feel free to cite our [publication](https://aclanthology.org/2021.emnlp-main.70.pdf).
```
@inproceedings{uar-emnlp2021,
author = {Rafael A. Rivera Soto and Olivia Miano and Juanita Ordonez and Barry Chen and Aleem Khan and Marcus Bishop and Nicholas Andrews},
title = {Learning Universal Authorship Representations},
booktitle = {EMNLP},
year = {2021},
}
```
## License
LUAR is distributed under the terms of the Apache License (Version 2.0).
All new contributions must be made under the Apache-2.0 license.