Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -234,45 +234,47 @@ def classify_emotion(text, classifier):
|
|
| 234 |
return final_emotion
|
| 235 |
|
| 236 |
def get_embedding_for_text(text, tokenizer, model):
|
| 237 |
-
"""Get embedding for complete text."""
|
| 238 |
-
#
|
| 239 |
tokens = tokenizer.tokenize(text)
|
| 240 |
|
| 241 |
-
# Process in chunks of
|
| 242 |
chunk_size = 510
|
| 243 |
-
|
| 244 |
|
| 245 |
for i in range(0, len(tokens), chunk_size):
|
| 246 |
-
chunk
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
# Create attention mask
|
| 251 |
-
attention_mask = [1] * len(
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
chunks.append({
|
| 259 |
-
'input_ids': torch.tensor([token_ids]),
|
| 260 |
-
'attention_mask': torch.tensor([attention_mask])
|
| 261 |
-
})
|
| 262 |
-
|
| 263 |
-
# Get embeddings
|
| 264 |
-
chunk_embeddings = []
|
| 265 |
-
for chunk in chunks:
|
| 266 |
-
chunk = {k: v.to(model.device) for k, v in chunk.items()}
|
| 267 |
with torch.no_grad():
|
| 268 |
-
outputs = model(
|
| 269 |
-
|
| 270 |
-
chunk_embeddings.append(
|
| 271 |
|
|
|
|
| 272 |
if chunk_embeddings:
|
| 273 |
return np.mean(chunk_embeddings, axis=0)
|
|
|
|
|
|
|
| 274 |
return np.zeros(model.config.hidden_size)
|
| 275 |
-
|
| 276 |
def format_topics(topic_model, topic_counts):
|
| 277 |
"""Format topics for display."""
|
| 278 |
formatted_topics = []
|
|
|
|
| 234 |
return final_emotion
|
| 235 |
|
| 236 |
def get_embedding_for_text(text, tokenizer, model):
    """Get an embedding for a complete text by processing it in chunks.

    Tokenizes *text*, splits the tokens into chunks of at most 510
    tokens (512 positions minus the [CLS]/[SEP] special tokens), runs
    each padded chunk through *model*, and averages the per-chunk [CLS]
    embeddings.

    Args:
        text: Raw input string to embed.
        tokenizer: Hugging Face tokenizer matching *model* (assumes a
            BERT-style vocab with '[CLS]'/'[SEP]' tokens and a
            pad_token_id — TODO confirm for the deployed checkpoint).
        model: Hugging Face encoder whose outputs[0] is the
            last hidden state of shape (batch, seq_len, hidden_size).

    Returns:
        np.ndarray of shape (hidden_size,): the mean of the chunk [CLS]
        embeddings, or a zero vector when the text yields no tokens.
    """
    # Tokenize the entire text once; chunks are slices of this list.
    tokens = tokenizer.tokenize(text)

    # 512 max positions minus the two special tokens added per chunk.
    chunk_size = 510
    max_len = 512
    chunk_embeddings = []

    # Build input tensors on the model's own device: feeding CPU
    # tensors to a GPU-resident model raises a device-mismatch error
    # (the previous revision moved chunks with .to(model.device); that
    # transfer must not be dropped).
    device = next(model.parameters()).device

    for i in range(0, len(tokens), chunk_size):
        # Take a chunk of tokens and wrap it with the special tokens.
        chunk_tokens = ['[CLS]'] + tokens[i:i + chunk_size] + ['[SEP]']

        # Convert to input IDs and pad to the fixed sequence length.
        input_ids = tokenizer.convert_tokens_to_ids(chunk_tokens)
        input_ids += [tokenizer.pad_token_id] * (max_len - len(input_ids))

        # Attend to real tokens only; padding positions get mask 0.
        attention_mask = [1] * len(chunk_tokens) + [0] * (max_len - len(chunk_tokens))

        # Batch of one, created directly on the model's device.
        input_ids = torch.tensor([input_ids], device=device)
        attention_mask = torch.tensor([attention_mask], device=device)

        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        # Use the [CLS] position (index 0) as the chunk representation.
        chunk_embedding = outputs[0][:, 0, :].cpu().numpy()
        chunk_embeddings.append(chunk_embedding[0])

    # Average the embeddings from all chunks.
    if chunk_embeddings:
        return np.mean(chunk_embeddings, axis=0)

    # Fallback for empty text: zero vector of the model's hidden size.
    return np.zeros(model.config.hidden_size)
|
| 277 |
+
|
| 278 |
def format_topics(topic_model, topic_counts):
|
| 279 |
"""Format topics for display."""
|
| 280 |
formatted_topics = []
|