Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -235,35 +235,37 @@ def classify_emotion(text, classifier):
|
|
| 235 |
|
| 236 |
def get_embedding_for_text(text, tokenizer, model):
|
| 237 |
"""Get embedding for complete text."""
|
| 238 |
-
#
|
| 239 |
-
|
| 240 |
-
all_tokens = encoded['input_ids'][0]
|
| 241 |
|
| 242 |
-
#
|
| 243 |
chunk_size = 510
|
| 244 |
chunks = []
|
| 245 |
|
| 246 |
-
for i in range(0, len(all_tokens), chunk_size):
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
-
# Get embeddings
|
| 257 |
chunk_embeddings = []
|
| 258 |
for chunk in chunks:
|
| 259 |
-
|
| 260 |
-
inputs = {
|
| 261 |
-
'input_ids': chunk.unsqueeze(0).to(model.device),
|
| 262 |
-
'attention_mask': torch.ones_like(chunk.unsqueeze(0)).to(model.device)
|
| 263 |
-
}
|
| 264 |
-
|
| 265 |
with torch.no_grad():
|
| 266 |
-
outputs = model(**inputs)[0]
|
| 267 |
embedding = outputs[:, 0, :].cpu().numpy()
|
| 268 |
chunk_embeddings.append(embedding[0])
|
| 269 |
|
|
|
|
| 235 |
|
| 236 |
def get_embedding_for_text(text, tokenizer, model):
|
| 237 |
"""Get embedding for complete text."""
|
| 238 |
+
# Get the raw tokens first
|
| 239 |
+
tokens = tokenizer.tokenize(text)
|
|
|
|
| 240 |
|
| 241 |
+
# Process in chunks of exactly 510 tokens (512 - 2 special tokens)
|
| 242 |
chunk_size = 510
|
| 243 |
chunks = []
|
| 244 |
|
| 245 |
+
for i in range(0, len(tokens), chunk_size):
|
| 246 |
+
chunk = tokens[i:i + chunk_size]
|
| 247 |
+
token_ids = tokenizer.convert_tokens_to_ids(chunk)
|
| 248 |
+
# Add special tokens manually
|
| 249 |
+
token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]
|
| 250 |
+
# Create attention mask
|
| 251 |
+
attention_mask = [1] * len(token_ids)
|
| 252 |
+
# Pad if needed
|
| 253 |
+
padding_length = 512 - len(token_ids)
|
| 254 |
+
if padding_length > 0:
|
| 255 |
+
token_ids = token_ids + ([tokenizer.pad_token_id] * padding_length)
|
| 256 |
+
attention_mask = attention_mask + ([0] * padding_length)
|
| 257 |
+
|
| 258 |
+
chunks.append({
|
| 259 |
+
'input_ids': torch.tensor([token_ids]),
|
| 260 |
+
'attention_mask': torch.tensor([attention_mask])
|
| 261 |
+
})
|
| 262 |
|
| 263 |
+
# Get embeddings
|
| 264 |
chunk_embeddings = []
|
| 265 |
for chunk in chunks:
|
| 266 |
+
chunk = {k: v.to(model.device) for k, v in chunk.items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
with torch.no_grad():
|
| 268 |
+
outputs = model(**chunk)[0]
|
| 269 |
embedding = outputs[:, 0, :].cpu().numpy()
|
| 270 |
chunk_embeddings.append(embedding[0])
|
| 271 |
|