Spaces: zhenyundeng
Commit 55ca411 (parent: 8c5fc49), message: "udpate"
app.py CHANGED
@@ -75,6 +75,80 @@ nlp = spacy.load("en_core_web_sm")
 # all_samples_dict = json.load(open('averitec/data/all_samples.json', 'r'))
 train_examples = json.load(open('averitec/data/train.json', 'r'))
 
+def claim2prompts(example):
+    claim = example["claim"]
+
+    # claim_str = "Claim: " + claim + "||Evidence: "
+    claim_str = "Evidence: "
+
+    for question in example["questions"]:
+        q_text = question["question"].strip()
+        if len(q_text) == 0:
+            continue
+
+        if not q_text[-1] == "?":
+            q_text += "?"
+
+        answer_strings = []
+
+        for a in question["answers"]:
+            if a["answer_type"] in ["Extractive", "Abstractive"]:
+                answer_strings.append(a["answer"])
+            if a["answer_type"] == "Boolean":
+                answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())
+
+        for a_text in answer_strings:
+            if not a_text[-1] in [".", "!", ":", "?"]:
+                a_text += "."
+
+            # prompt_lookup_str = claim + " " + a_text
+            prompt_lookup_str = a_text
+            this_q_claim_str = claim_str + " " + a_text.strip() + "||Question answered: " + q_text
+            yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n"))
+
+
+def generate_reference_corpus(reference_file):
+    all_data_corpus = []
+    tokenized_corpus = []
+
+    for train_example in train_examples:
+        train_claim = train_example["claim"]
+
+        speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
+            train_example["speaker"]) > 1 else "they"
+
+        questions = [q["question"] for q in train_example["questions"]]
+
+        claim_dict_builder = {}
+        claim_dict_builder["claim"] = train_claim
+        claim_dict_builder["speaker"] = speaker
+        claim_dict_builder["questions"] = questions
+
+        tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
+        all_data_corpus.append(claim_dict_builder)
+
+    return tokenized_corpus, all_data_corpus
+
+def generate_step2_reference_corpus(reference_file):
+    prompt_corpus = []
+    tokenized_corpus = []
+
+    for example in train_examples:
+        for lookup_str, prompt in claim2prompts(example):
+            entry = nltk.word_tokenize(lookup_str)
+            tokenized_corpus.append(entry)
+            prompt_corpus.append(prompt)
+
+    return tokenized_corpus, prompt_corpus
+
+reference_file = "averitec/data/train.json"
+tokenized_corpus0, all_data_corpus0 = generate_reference_corpus(reference_file)
+qg_bm25 = BM25Okapi(tokenized_corpus0)
+
+tokenized_corpus1, prompt_corpus1 = generate_step2_reference_corpus(reference_file)
+prompt_bm25 = BM25Okapi(tokenized_corpus1)
+
+
 # print(train_examples[0]['claim'])
 # ---------------------------------------------------------------------------
 # ---------- Load pretrained models ----------
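This hunk hoists corpus construction and BM25 indexing to module scope, so qg_bm25 and prompt_bm25 are built once at startup instead of once per request. The pattern underneath is standard rank_bm25 usage; a minimal sketch, with a toy corpus invented for illustration:

# Minimal rank_bm25 sketch; the toy claims are invented for illustration.
# (Assumes nltk's "punkt" tokenizer data is available.)
import nltk
from rank_bm25 import BM25Okapi

corpus = [
    "The moon landing happened in 1969.",
    "BLOOM is a multilingual language model.",
]
# BM25Okapi expects a pre-tokenized corpus: one token list per document.
tokenized_corpus = [nltk.word_tokenize(doc) for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)  # build the index once, reuse for every query

# get_scores returns one relevance score per corpus document.
scores = bm25.get_scores(nltk.word_tokenize("When did the moon landing happen?"))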
@@ -98,8 +172,8 @@ if torch.cuda.is_available():
 # device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 # question generation
-qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-
-qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-
+qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-1b1")
+qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-1b1", torch_dtype=torch.bfloat16).to('cuda')
 # qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
 # qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
 # qg_model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
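The question-generation model moves to the smaller bloom-1b1 checkpoint, and the new load pins it to CUDA. The hard-coded .to('cuda') will raise on a CPU-only machine; a device-guarded variant of the same load, as a sketch rather than the committed code:

# Sketch: same checkpoint, but guarded so it also runs on CPU-only hosts.
import torch
from transformers import BloomForCausalLM, BloomTokenizerFast

device = "cuda" if torch.cuda.is_available() else "cpu"
qg_tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-1b1")
qg_model = BloomForCausalLM.from_pretrained(
    "bigscience/bloom-1b1",
    # bfloat16 halves memory on GPU; plain float32 is the safer CPU default
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
).to(device)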
@@ -359,30 +433,30 @@ def QAprediction(claim, evidence, sources):
 
 
 # ----------GoogleAPIretriever---------
-def generate_reference_corpus(reference_file):
-    # with open(reference_file) as f:
-    #     train_examples = json.load(f)
-
-    all_data_corpus = []
-    tokenized_corpus = []
-
-    for train_example in train_examples:
-        train_claim = train_example["claim"]
-
-        speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
-            train_example["speaker"]) > 1 else "they"
-
-        questions = [q["question"] for q in train_example["questions"]]
-
-        claim_dict_builder = {}
-        claim_dict_builder["claim"] = train_claim
-        claim_dict_builder["speaker"] = speaker
-        claim_dict_builder["questions"] = questions
-
-        tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
-        all_data_corpus.append(claim_dict_builder)
-
-    return tokenized_corpus, all_data_corpus
+# def generate_reference_corpus(reference_file):
+#     # with open(reference_file) as f:
+#     #     train_examples = json.load(f)
+#
+#     all_data_corpus = []
+#     tokenized_corpus = []
+#
+#     for train_example in train_examples:
+#         train_claim = train_example["claim"]
+#
+#         speaker = train_example["speaker"].strip() if train_example["speaker"] is not None and len(
+#             train_example["speaker"]) > 1 else "they"
+#
+#         questions = [q["question"] for q in train_example["questions"]]
+#
+#         claim_dict_builder = {}
+#         claim_dict_builder["claim"] = train_claim
+#         claim_dict_builder["speaker"] = speaker
+#         claim_dict_builder["questions"] = questions
+#
+#         tokenized_corpus.append(nltk.word_tokenize(claim_dict_builder["claim"]))
+#         all_data_corpus.append(claim_dict_builder)
+#
+#     return tokenized_corpus, all_data_corpus
 
 
 def doc2prompt(doc):
@@ -399,22 +473,15 @@ def docs2prompt(top_docs):
 @spaces.GPU
 def prompt_question_generation(test_claim, speaker="they", topk=10):
     #
-    reference_file = "averitec/data/train.json"
-    tokenized_corpus, all_data_corpus = generate_reference_corpus(reference_file)
-    bm25 = BM25Okapi(tokenized_corpus)
-
-    # Define the bloom model:
-    accelerator = Accelerator()
-    # accel_device = accelerator.device
-    # device = "cuda:0" if torch.cuda.is_available() else "cpu"
-    # tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
-    # model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype=torch.bfloat16).to(device)
+    # reference_file = "averitec/data/train.json"
+    # tokenized_corpus, all_data_corpus = generate_reference_corpus(reference_file)
+    # bm25 = BM25Okapi(tokenized_corpus)
 
     # --------------------------------------------------
     # test claim
-    s =
+    s = qg_bm25.get_scores(nltk.word_tokenize(test_claim))
     top_n = np.argsort(s)[::-1][:topk]
-    docs = [
+    docs = [all_data_corpus0[i] for i in top_n]
     # --------------------------------------------------
 
     prompt = docs2prompt(docs) + "\n\n" + "Outrageously, " + speaker + " claimed that \"" + test_claim.strip() + \
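prompt_question_generation now reads from the module-level index: it scores the test claim against qg_bm25 and keeps the topk highest-scoring training claims as few-shot material. Isolated, the idiom is just argsort-and-slice; a sketch reusing qg_bm25 and all_data_corpus0 from the first hunk:

# Sketch of the BM25 top-k selection used above.
import nltk
import numpy as np

def retrieve_topk(claim, bm25, corpus, topk=10):
    """Return the topk corpus entries for a claim, best match first."""
    s = bm25.get_scores(nltk.word_tokenize(claim))
    top_n = np.argsort(s)[::-1][:topk]  # argsort is ascending, so reverse for best-first
    return [corpus[i] for i in top_n]

# e.g. docs = retrieve_topk(test_claim, qg_bm25, all_data_corpus0)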
@@ -640,71 +707,30 @@ def averitec_search(claim, generate_question, speaker="they", check_date="2024-0
     return retrieve_evidence
 
 
-def claim2prompts(example):
-    claim = example["claim"]
 
-    # claim_str = "Claim: " + claim + "||Evidence: "
-    claim_str = "Evidence: "
 
-    for question in example["questions"]:
-        q_text = question["question"].strip()
-        if len(q_text) == 0:
-            continue
-
-        if not q_text[-1] == "?":
-            q_text += "?"
 
-        answer_strings = []
-
-        for a in question["answers"]:
-            if a["answer_type"] in ["Extractive", "Abstractive"]:
-                answer_strings.append(a["answer"])
-            if a["answer_type"] == "Boolean":
-                answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())
-
-        for a_text in answer_strings:
-            if not a_text[-1] in [".", "!", ":", "?"]:
-                a_text += "."
-
-            # prompt_lookup_str = claim + " " + a_text
-            prompt_lookup_str = a_text
-            this_q_claim_str = claim_str + " " + a_text.strip() + "||Question answered: " + q_text
-            yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n"))
-
-
-def generate_step2_reference_corpus(reference_file):
-    # with open(reference_file) as f:
-    #     train_examples = json.load(f)
-
-    prompt_corpus = []
-    tokenized_corpus = []
-
-    for example in train_examples:
-        for lookup_str, prompt in claim2prompts(example):
-            entry = nltk.word_tokenize(lookup_str)
-            tokenized_corpus.append(entry)
-            prompt_corpus.append(prompt)
-
-    return tokenized_corpus, prompt_corpus
+# def generate_step2_reference_corpus(reference_file):
+#     # with open(reference_file) as f:
+#     #     train_examples = json.load(f)
+#
+#     prompt_corpus = []
+#     tokenized_corpus = []
+#
+#     for example in train_examples:
+#         for lookup_str, prompt in claim2prompts(example):
+#             entry = nltk.word_tokenize(lookup_str)
+#             tokenized_corpus.append(entry)
+#             prompt_corpus.append(prompt)
+#
+#     return tokenized_corpus, prompt_corpus
 
 @spaces.GPU
 def decorate_with_questions(claim, retrieve_evidence, top_k=3):  # top_k=5, 10, 100
     #
-    reference_file = "averitec/data/train.json"
-    tokenized_corpus, prompt_corpus = generate_step2_reference_corpus(reference_file)
-    prompt_bm25 = BM25Okapi(tokenized_corpus)
-
-    # Define the bloom model:
-    # accelerator = Accelerator()
-    # accel_device = accelerator.device
-    # device = "cuda:0" if torch.cuda.is_available() else "cpu"
-    # tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
-    # model = BloomForCausalLM.from_pretrained(
-    #     "bigscience/bloom-7b1",
-    #     device_map="auto",
-    #     torch_dtype=torch.bfloat16,
-    #     offload_folder="./offload"
-    # )
+    # reference_file = "averitec/data/train.json"
+    # tokenized_corpus, prompt_corpus = generate_step2_reference_corpus(reference_file)
+    # prompt_bm25 = BM25Okapi(tokenized_corpus)
 
     #
     tokenized_corpus = []
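decorate_with_questions likewise switches to the module-level prompt_bm25 and prompt_corpus1 instead of rebuilding them on every call. The retrieved prompts are then joined into a few-shot prompt that ends with the target evidence, leaving the final "Question answered:" slot for the model to complete. A sketch of that assembly, with the example strings invented for illustration:

# Sketch of the few-shot prompt assembly; sample strings are invented.
prompt_docs = [
    "Evidence: Apollo 11 landed two astronauts on the moon in 1969."
    "\nQuestion answered: When did the moon landing happen?",
]
evidence = "BLOOM was released by the BigScience workshop in 2022."
claim_prompt = "Evidence: " + evidence.replace("\n", " ") + "\nQuestion answered: "
prompt = "\n\n".join(prompt_docs + [claim_prompt])  # the model completes the last slot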
@@ -749,7 +775,7 @@ def decorate_with_questions(claim, retrieve_evidence, top_k=3): # top_k=5, 10,
     prompt_s = prompt_bm25.get_scores(nltk.word_tokenize(prompt_lookup_str))
     prompt_n = 10
     prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
-    prompt_docs = [
+    prompt_docs = [prompt_corpus1[i] for i in prompt_top_n]
 
     claim_prompt = "Evidence: " + doc[1].replace("\n", " ") + "\nQuestion answered: "
     prompt = "\n\n".join(prompt_docs + [claim_prompt])
@@ -757,8 +783,8 @@ def decorate_with_questions(claim, retrieve_evidence, top_k=3): # top_k=5, 10,
 
     inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(qg_model.device)
     # inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(device)
-    outputs = qg_model.generate(inputs["input_ids"], max_length=
-
+    outputs = qg_model.generate(inputs["input_ids"], max_length=2000, num_beams=2, no_repeat_ngram_size=2, early_stopping=True)
+    # max_length=5000
     tgt_text = qg_tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)[0]
     # We are not allowed to generate more than 250 characters:
     tgt_text = tgt_text[:250]
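The restored generate call caps max_length at 2000 tokens, which counts the prompt plus the continuation, and the decode step slices off the prompt so only newly generated text is kept. With long few-shot prompts, max_new_tokens bounds the continuation directly; a hedged variant of the same call, where the 100-token cap is an assumption rather than the committed value:

# Sketch of the generation step; sentences comes from the surrounding function.
inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(qg_model.device)
outputs = qg_model.generate(
    inputs["input_ids"],
    max_new_tokens=100,        # assumption: bound the continuation, not prompt + continuation
    num_beams=2,
    no_repeat_ngram_size=2,
    early_stopping=True,
)
# Keep only the continuation: drop the echoed prompt tokens before decoding.
new_tokens = outputs[:, inputs["input_ids"].shape[-1]:]
tgt_text = qg_tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0][:250]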