Update app.py
app.py
CHANGED
@@ -12,6 +12,18 @@ import torch
 from transformers import GPT2Tokenizer, GPT2LMHeadModel
 from transformers import T5Tokenizer, AutoModelForCausalLM
 import torch
+from doctest import OutputChecker
+import sys
+import torch
+import re
+import os
+import gradio as gr
+import requests
+import torch
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+from torch.nn.functional import softmax
+import numpy as np
+
 
 from transformers import BertJapaneseTokenizer, BertModel
 import torch
@@ -88,6 +100,31 @@ def softmax(x):
 tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt-1b")
 model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt-1b")
 
+def sentence_prob_mean(text):
+    # Tokenize the input text and add special tokens
+    input_ids = tokenizer.encode(text, return_tensors='pt')
+
+    # Obtain model outputs
+    with torch.no_grad():
+        outputs = model(input_ids, labels=input_ids)
+        logits = outputs.logits  # logits are the model outputs before applying softmax
+
+    # Shift logits and labels so that tokens are aligned:
+    shift_logits = logits[..., :-1, :].contiguous()
+    shift_labels = input_ids[..., 1:].contiguous()
+
+    # Calculate the softmax probabilities
+    probs = softmax(shift_logits, dim=-1)
+
+    # Gather the probabilities of the actual token IDs
+    gathered_probs = torch.gather(probs, 2, shift_labels.unsqueeze(-1)).squeeze(-1)
+
+    # Compute the mean probability across the tokens
+    mean_prob = torch.mean(gathered_probs).item()
+
+    return mean_prob
+
+
 #model = gr.Interface.load('huggingface/distilgpt2', output_hidden_states = True, output_attentions = True)
 
 #model.eval()
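For reference, a minimal usage sketch of the new sentence_prob_mean scorer (a hypothetical example, not part of the commit; the sentences are made up). It assumes the rinna/japanese-gpt-1b tokenizer and model loaded above, and that softmax inside the function resolves to the torch.nn.functional.softmax imported at the top of the file; if the module's own def softmax(x) helper named in the hunk header shadows that import, the dim=-1 keyword call would need adjusting:

    # Hypothetical usage: the function returns the mean probability the LM
    # assigns to each actual next token, so a more fluent sentence scores higher.
    p_man = sentence_prob_mean("The man is cooking dinner.")
    p_woman = sentence_prob_mean("The woman is cooking dinner.")
    print(p_man, p_woman)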
@@ -98,45 +135,45 @@ model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt-1b")
 #tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
 
 
-def cloze_prob(text):
-
-    whole_text_encoding = tokenizer.encode(text)
-    # Parse out the stem of the whole sentence (i.e., the part leading up to but not including the critical word)
-    text_list = text.split()
-    stem = ' '.join(text_list[:-1])
-    stem_encoding = tokenizer.encode(stem)
-    # cw_encoding is just the difference between whole_text_encoding and stem_encoding
-    # note: this might not correspond exactly to the word itself
-    cw_encoding = whole_text_encoding[len(stem_encoding):]
-    # Run the entire sentence through the model. Then go "back in time" to look at what the model predicted for each token, starting at the stem.
-    # Put the whole text encoding into a tensor, and get the model's comprehensive output
-    tokens_tensor = torch.tensor([whole_text_encoding])
-
-    with torch.no_grad():
-        outputs = model(tokens_tensor)
-        predictions = outputs[0]
-
-    logprobs = []
-    # start at the stem and get downstream probabilities incrementally from the model(see above)
-    start = -1-len(cw_encoding)
-    for j in range(start,-1,1):
-        raw_output = []
-        for i in predictions[-1][j]:
-            raw_output.append(i.item())
-
-        logprobs.append(np.log(softmax(raw_output)))
-
-    # if the critical word is three tokens long, the raw_probabilities should look something like this:
-    # [ [0.412, 0.001, ... ] ,[0.213, 0.004, ...], [0.002,0.001, 0.93 ...]]
-    # Then for the i'th token we want to find its associated probability
-    # this is just: raw_probabilities[i][token_index]
-    conditional_probs = []
-    for cw,prob in zip(cw_encoding,logprobs):
-        conditional_probs.append(prob[cw])
-    # now that you have all the relevant probabilities, return their product.
-    # This is the probability of the critical word given the context before it.
-
-    return np.exp(np.sum(conditional_probs))
-
+# def cloze_prob(text):
+
+#     whole_text_encoding = tokenizer.encode(text)
+#     # Parse out the stem of the whole sentence (i.e., the part leading up to but not including the critical word)
+#     text_list = text.split()
+#     stem = ' '.join(text_list[:-1])
+#     stem_encoding = tokenizer.encode(stem)
+#     # cw_encoding is just the difference between whole_text_encoding and stem_encoding
+#     # note: this might not correspond exactly to the word itself
+#     cw_encoding = whole_text_encoding[len(stem_encoding):]
+#     # Run the entire sentence through the model. Then go "back in time" to look at what the model predicted for each token, starting at the stem.
+#     # Put the whole text encoding into a tensor, and get the model's comprehensive output
+#     tokens_tensor = torch.tensor([whole_text_encoding])
+
+#     with torch.no_grad():
+#         outputs = model(tokens_tensor)
+#         predictions = outputs[0]
+
+#     logprobs = []
+#     # start at the stem and get downstream probabilities incrementally from the model(see above)
+#     start = -1-len(cw_encoding)
+#     for j in range(start,-1,1):
+#         raw_output = []
+#         for i in predictions[-1][j]:
+#             raw_output.append(i.item())
+
+#         logprobs.append(np.log(softmax(raw_output)))
+
+#     # if the critical word is three tokens long, the raw_probabilities should look something like this:
+#     # [ [0.412, 0.001, ... ] ,[0.213, 0.004, ...], [0.002,0.001, 0.93 ...]]
+#     # Then for the i'th token we want to find its associated probability
+#     # this is just: raw_probabilities[i][token_index]
+#     conditional_probs = []
+#     for cw,prob in zip(cw_encoding,logprobs):
+#         conditional_probs.append(prob[cw])
+#     # now that you have all the relevant probabilities, return their product.
+#     # This is the probability of the critical word given the context before it.
+
+#     return np.exp(np.sum(conditional_probs))
+
 
 
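For comparison, the retired cloze_prob scored only the final ("critical") word: the product of the conditional probabilities of the sub-tokens that spell it, given the stem. A toy illustration with made-up numbers (hypothetical, not from the commit):

    import numpy as np
    # Suppose the critical word splits into two sub-tokens whose conditional
    # probabilities under the LM are 0.2 and 0.5; cloze_prob multiplies them
    # by summing log-probabilities and exponentiating, as in the commented-out
    # return line above.
    conditional_probs = [np.log(0.2), np.log(0.5)]
    print(np.exp(np.sum(conditional_probs)))  # 0.1 == 0.2 * 0.5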
@@ -172,8 +209,12 @@ def Visual_re_ranker(sentence_man, sentence_woman, context_label, context_prob):
     sim_w = get_sim(sim_w)
 
 
-    LM_man = cloze_prob(sentence_man)
-    LM_woman = cloze_prob(sentence_woman)
+    LM_man = sentence_prob_mean(sentence_man)
+    LM_woman = sentence_prob_mean(sentence_woman)
+    #LM_man = cloze_prob(sentence_man)
+    #LM_woman = cloze_prob(sentence_woman)
+
+
     score_man = pow(float(LM_man),pow((1-float(sim_m))/(1+ float(sim_m)),1-float(context_prob)))
     score_woman = pow(float(LM_woman),pow((1-float(sim_w))/(1+ float(sim_w)),1-float(context_prob)))
 
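The scoring lines combine fluency and visual context as score = LM ** (((1 - sim) / (1 + sim)) ** (1 - context_prob)). A standalone sketch with made-up numbers (hypothetical, for illustration only; visual_rerank_score is not a name from the commit):

    def visual_rerank_score(lm_prob, sim, context_prob):
        # (1 - sim) / (1 + sim) lies in (0, 1]; raising it to (1 - context_prob)
        # and using the result as an exponent on lm_prob < 1 means higher
        # visual similarity shrinks the exponent and boosts the final score,
        # while context_prob -> 1 leaves the raw LM probability unchanged.
        exponent = pow((1 - sim) / (1 + sim), 1 - context_prob)
        return pow(lm_prob, exponent)

    # e.g. LM prob 0.02, visual similarity 0.6, context confidence 0.9
    print(visual_rerank_score(0.02, 0.6, 0.9))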