rrivera1849 committed on
Commit
858fcb1
·
verified ·
1 Parent(s): 558e05b

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +115 -0
README.md CHANGED
@@ -51,6 +51,121 @@ out, attentions = model(**tokenized_text, output_attentions=True)
51
  print(attentions[0].size()) # torch.Size([48, 12, 32, 32])
52
  ```
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  ## Citing & Authors
55
 
56
  If you find this model helpful, feel free to cite our [publication](https://aclanthology.org/2021.emnlp-main.70.pdf).
 
51
  print(attentions[0].size()) # torch.Size([48, 12, 32, 32])
52
  ```
53
 
54
+ ## Usage (Batch)
55
+
56
+ Here's a more fleshed-out example showing how to run LUAR across many batches of data:
57
+
58
+ ```python
59
+
60
+ import numpy as np
61
+ import torch
62
+ from termcolor import cprint
63
+ from transformers import AutoModel, AutoTokenizer
64
+ from tqdm import tqdm
65
+
66
def generate_data(num_batches: int = 100, batch_size: int = 32, num_samples_per_author: int = 16):
    """Yield batches of dummy author data for testing.

    Args:
        num_batches (int): Total number of batches to yield.
        batch_size (int): Number of authors per batch.
        num_samples_per_author (int): Number of text samples per author.

    Yields:
        list: One batch shaped (batch_size, num_samples_per_author) —
            a list of per-author lists of identical example strings.
    """
    s = "This is an example string."
    # Build the per-author document list once; hand out a fresh copy per author.
    author_docs = [s] * num_samples_per_author
    # tqdm wraps the range purely to display progress as batches are consumed.
    for _ in tqdm(range(num_batches)):
        yield [list(author_docs) for _ in range(batch_size)]
83
+
84
def flatten(l):
    """Flatten a 2D list into a single 1D list.

    Args:
        l (list): List of lists.

    Returns:
        list: All items of the sublists, concatenated in order.
    """
    flattened = []
    for sublist in l:
        flattened.extend(sublist)
    return flattened
95
+
96
def main():
    """Run LUAR-MUD over batches of dummy data and report the stacked embedding shape."""
    cprint("Starting LUAR-MUD example script...", 'magenta')

    # --- Model Loading ---
    cprint("Loading model 'rrivera1849/LUAR-MUD'...", 'blue')
    # LUAR ships a custom architecture, hence trust_remote_code=True.
    model = AutoModel.from_pretrained("rrivera1849/LUAR-MUD", trust_remote_code=True)

    model.eval()

    # Prefer the GPU when one is present.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    cprint(f"Moving model to device: {device}", 'yellow')
    model.to(device)

    # --- Tokenizer Loading ---
    cprint("Loading tokenizer...", 'blue')
    tokenizer = AutoTokenizer.from_pretrained("rrivera1849/LUAR-MUD", trust_remote_code=True)

    # --- Configuration ---
    num_batches = 100
    batch_size = 32
    num_samples_per_author = 16
    max_length = 512

    cprint("\nConfiguration:", 'cyan')
    print(f" Batch Size: {batch_size}")
    print(f" Samples per Author: {num_samples_per_author}")
    print(f" Max Length: {max_length}")
    print(f" Device: {device}\n")

    chunks = []

    cprint("Starting inference loop...", 'green')

    # inference_mode disables autograd bookkeeping for the whole loop.
    with torch.inference_mode():
        for step, author_batch in enumerate(generate_data(num_batches=num_batches, batch_size=batch_size, num_samples_per_author=num_samples_per_author)):
            # Light-weight heartbeat every 10 batches (tqdm already tracks overall progress).
            if not (step + 1) % 10:
                print(f" Processing batch {step + 1}...")

            # The tokenizer wants a flat list of strings:
            # (batch_size, num_samples) -> (batch_size * num_samples)
            texts = flatten(author_batch)

            encoded = tokenizer(texts, return_tensors="pt", padding=True, max_length=max_length, truncation=True)

            # Inputs must live on the same device as the model.
            encoded = encoded.to(device)

            # LUAR consumes 3D input:
            # (batch_size, num_samples_per_author, sequence_length)
            encoded["input_ids"] = encoded["input_ids"].reshape(batch_size, num_samples_per_author, -1)
            encoded["attention_mask"] = encoded["attention_mask"].reshape(batch_size, num_samples_per_author, -1)

            embeddings = model(**encoded)

            # Keep per-batch results on the CPU as numpy so device memory is freed.
            chunks.append(embeddings.cpu().numpy())

    # Stack every per-batch result along the batch (first) axis.
    all_outputs = np.concatenate(chunks, axis=0)

    cprint("\nInference complete!", 'green')
    cprint(f"Final output shape: {all_outputs.shape}", attrs=['bold'])

if __name__ == "__main__":
    main()
167
+ ```
168
+
169
  ## Citing & Authors
170
 
171
  If you find this model helpful, feel free to cite our [publication](https://aclanthology.org/2021.emnlp-main.70.pdf).