whisper

Runtime error

App Files Files Community

mskov committed on Dec 14, 2022

Commit

306f4a4

1 Parent(s): a564048

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -0

app.py CHANGED Viewed

@@ -9,6 +9,40 @@ from transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
 # from next_word_prediction import GPT2
 from share_btn import community_icon_html, loading_icon_html, share_js
 # get gpt2 model

 from transformers import AutoTokenizer
 # from next_word_prediction import GPT2
+### code
+gpt2 = AutoModelForCausalLM.from_pretrained("gpt2", return_dict_in_generate=True)
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+input_ids = tokenizer("Today is a nice day", return_tensors="pt").input_ids
+generated_outputs = gpt2.generate(input_ids, do_sample=True, num_return_sequences=3, output_scores=True)
+# only use id's that were generated
+# gen_sequences has shape [3, 15]
+gen_sequences = generated_outputs.sequences[:, input_ids.shape[-1]:]
+# let's stack the logits generated at each step to a tensor and transform
+# logits to probs
+probs = torch.stack(generated_outputs.scores, dim=1).softmax(-1)  # -> shape [3, 15, vocab_size]
+# now we need to collect the probability of the generated token
+# we need to add a dummy dim in the end to make gather work
+gen_probs = torch.gather(probs, 2, gen_sequences[:, :, None]).squeeze(-1)
+# now we can do all kinds of things with the probs
+# 1) the probs that exactly those sequences are generated again
+# those are normally going to be very small
+unique_prob_per_sequence = gen_probs.prod(-1)
+# 2) normalize the probs over the three sequences
+normed_gen_probs = gen_probs / gen_probs.sum(0)
+assert normed_gen_probs[:, 0].sum() == 1.0, "probs should be normalized"
+# 3) compare normalized probs to each other like in 1)
+unique_normed_prob_per_sequence = normed_gen_probs.prod(-1)
+### end code
 from share_btn import community_icon_html, loading_icon_html, share_js
 # get gpt2 model