Spaces:

vives
/

bert_auto_tagging

Runtime error

App Files Files Community

vives committed on May 24, 2022

Commit

2168bad

1 Parent(s): 6a4d8b5

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -7

app.py CHANGED Viewed

@@ -40,17 +40,17 @@ def get_transcript(file):
     transcript = data['results'].values[1][0]['transcript']
     transcript = transcript.lower()
     return transcript
-def concat_tokens(sentences):
-  tokens = {'input_ids': [], 'attention_mask': [], 'KPS': {}}
-  for sentence, values in sentences.items():
-      weight = values['weight']
       # encode each sentence and append to dictionary
       new_tokens = tokenizer.encode_plus(sentence, max_length=64,
                                          truncation=True, padding='max_length',
                                          return_tensors='pt')
       tokens['input_ids'].append(new_tokens['input_ids'][0])
       tokens['attention_mask'].append(new_tokens['attention_mask'][0])
-      tokens['KPS'][sentence] = weight
   # reformat list of tensors into single tensor
   tokens['input_ids'] = torch.stack(tokens['input_ids'])
   tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
@@ -59,7 +59,7 @@ def concat_tokens(sentences):
 """preprocess tags"""
 if tags:
   tags = [x.lower().strip() for x in tags.split(",")]
-  tags_tokens = concat_tokens(tags)
   tags_tokens.pop("KPS")
   with torch.no_grad():
     outputs_tags = model(**tags_tokens)
@@ -70,7 +70,22 @@ if tags:
 """Code related with processing text, extracting KPs, and doing distance to tag"""
 def calculate_weighted_embed_dist(out, tokens, weight, text,kp_dict, idx, exclude_text=False,exclude_words=False):
   sim_dict = {}
   pools = pool_embeddings_count(out, tokens, idx).detach().numpy()

     transcript = data['results'].values[1][0]['transcript']
     transcript = transcript.lower()
     return transcript
+def concat_tokens_tags(sentences):
+  tokens = {'input_ids': [], 'attention_mask': [], 'KPS': []}
+  for sentence in sentences:
       # encode each sentence and append to dictionary
       new_tokens = tokenizer.encode_plus(sentence, max_length=64,
                                          truncation=True, padding='max_length',
                                          return_tensors='pt')
       tokens['input_ids'].append(new_tokens['input_ids'][0])
       tokens['attention_mask'].append(new_tokens['attention_mask'][0])
+      tokens['KPS'].append(sentence)
   # reformat list of tensors into single tensor
   tokens['input_ids'] = torch.stack(tokens['input_ids'])
   tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
 """preprocess tags"""
 if tags:
   tags = [x.lower().strip() for x in tags.split(",")]
+  tags_tokens = concat_tokens_tags(tags)
   tags_tokens.pop("KPS")
   with torch.no_grad():
     outputs_tags = model(**tags_tokens)
 """Code related with processing text, extracting KPs, and doing distance to tag"""
+def concat_tokens(sentences):  # tokenize every key of `sentences` and batch the results; `sentences` must support .items() with values indexable by 'weight'
+  tokens = {'input_ids': [], 'attention_mask': [], 'KPS': {}}  # 'KPS' ends up mapping each sentence to its weight
+  for sentence, values in sentences.items():
+      weight = values['weight']
+      # encode each sentence (truncated/padded to max_length=64, returned as 'pt' tensors) and append to dictionary
+      new_tokens = tokenizer.encode_plus(sentence, max_length=64,
+                                         truncation=True, padding='max_length',
+                                         return_tensors='pt')
+      tokens['input_ids'].append(new_tokens['input_ids'][0])  # keep element 0 only; presumably drops a leading batch dimension - TODO confirm
+      tokens['attention_mask'].append(new_tokens['attention_mask'][0])
+      tokens['KPS'][sentence] = weight  # a repeated sentence key would overwrite its earlier weight
+  # reformat list of tensors into single tensor (torch.stack over the per-sentence entries)
+  tokens['input_ids'] = torch.stack(tokens['input_ids'])
+  tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
+  return tokens  # dict with stacked 'input_ids', stacked 'attention_mask', and the 'KPS' sentence->weight mapping
 def calculate_weighted_embed_dist(out, tokens, weight, text,kp_dict, idx, exclude_text=False,exclude_words=False):
   sim_dict = {}
   pools = pool_embeddings_count(out, tokens, idx).detach().numpy()