Spaces:
Runtime error
Runtime error
Jingxiang Mo committed on
Commit ·
77e7345
1
Parent(s): 4071dd4
Lint and code optimization
Browse files- .vscode/settings.json +5 -0
- __pycache__/app.cpython-39.pyc +0 -0
- app.py +74 -73
.vscode/settings.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"python.linting.pylintEnabled": true,
|
| 3 |
+
"python.linting.enabled": true,
|
| 4 |
+
"python.formatting.provider": "yapf"
|
| 5 |
+
}
|
__pycache__/app.cpython-39.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-39.pyc and b/__pycache__/app.cpython-39.pyc differ
|
|
|
app.py
CHANGED
|
@@ -3,16 +3,18 @@ import gradio as gr
|
|
| 3 |
import numpy as np
|
| 4 |
import wikipediaapi as wk
|
| 5 |
import wikipedia
|
|
|
|
| 6 |
from transformers import (
|
| 7 |
TokenClassificationPipeline,
|
| 8 |
AutoModelForTokenClassification,
|
| 9 |
AutoTokenizer,
|
| 10 |
BertForQuestionAnswering,
|
| 11 |
-
BertTokenizer
|
| 12 |
)
|
| 13 |
from transformers.pipelines import AggregationStrategy
|
| 14 |
import torch
|
| 15 |
|
|
|
|
| 16 |
# =====[ DEFINE PIPELINE ]===== #
|
| 17 |
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
|
| 18 |
def __init__(self, model, *args, **kwargs):
|
|
@@ -20,7 +22,7 @@ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
|
|
| 20 |
model=AutoModelForTokenClassification.from_pretrained(model),
|
| 21 |
tokenizer=AutoTokenizer.from_pretrained(model),
|
| 22 |
*args,
|
| 23 |
-
**kwargs
|
| 24 |
)
|
| 25 |
|
| 26 |
def postprocess(self, model_outputs):
|
|
@@ -30,89 +32,109 @@ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
|
|
| 30 |
)
|
| 31 |
return np.unique([result.get("word").strip() for result in results])
|
| 32 |
|
|
|
|
| 33 |
# =====[ LOAD PIPELINE ]===== #
|
| 34 |
keyPhraseExtractionModel = "ml6team/keyphrase-extraction-kbir-inspec"
|
| 35 |
extractor = KeyphraseExtractionPipeline(model=keyPhraseExtractionModel)
|
| 36 |
-
model = BertForQuestionAnswering.from_pretrained(
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
def keyphrases_extraction(text: str) -> str:
|
| 40 |
keyphrases = extractor(text)
|
| 41 |
return keyphrases
|
| 42 |
|
|
|
|
| 43 |
def wikipedia_search(input: str) -> str:
|
| 44 |
input = input.replace("\n", " ")
|
| 45 |
keyphrases = keyphrases_extraction(input)
|
| 46 |
|
| 47 |
-
wiki = wk.Wikipedia(
|
| 48 |
-
|
| 49 |
-
try
|
| 50 |
if len(keyphrases) == 0:
|
| 51 |
return "Can you add more details to your question?"
|
| 52 |
-
|
| 53 |
query_suggestion = wikipedia.suggest(keyphrases[0])
|
| 54 |
-
if
|
| 55 |
results = wikipedia.search(query_suggestion)
|
| 56 |
else:
|
| 57 |
results = wikipedia.search(keyphrases[0])
|
| 58 |
|
| 59 |
index = 0
|
| 60 |
page = wiki.page(results[index])
|
| 61 |
-
while not (
|
| 62 |
index += 1
|
| 63 |
if index == len(results):
|
| 64 |
raise Exception
|
| 65 |
page = wiki.page(results[index])
|
| 66 |
return page.summary
|
| 67 |
-
|
| 68 |
except:
|
| 69 |
return "I cannot answer this question"
|
| 70 |
-
|
| 71 |
-
def answer_question(question):
|
| 72 |
|
|
|
|
|
|
|
| 73 |
context = wikipedia_search(question)
|
| 74 |
-
if (context == "I cannot answer this question") or (
|
|
|
|
|
|
|
| 75 |
return context
|
| 76 |
|
| 77 |
-
#
|
| 78 |
# Apply the tokenizer to the input text, treating them as a text-pair.
|
| 79 |
-
|
| 80 |
input_ids = tokenizer.encode(question, context)
|
| 81 |
-
question_ids = input_ids[:input_ids.index(tokenizer.sep_token_id)+1]
|
| 82 |
|
| 83 |
# Report how long the input sequence is. If it is longer than 512 tokens, divide it into multiple sequences.
|
| 84 |
length_of_group = 512 - len(question_ids)
|
| 85 |
-
input_ids_without_question = input_ids[
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
input_ids_split = []
|
| 89 |
-
for group in range(len(input_ids_without_question)//length_of_group + 1):
|
| 90 |
-
input_ids_split.append(
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
scores = []
|
| 94 |
for input in input_ids_split:
|
| 95 |
-
|
| 96 |
-
|
| 97 |
sep_index = input.index(tokenizer.sep_token_id)
|
| 98 |
-
|
| 99 |
-
# The number of segment A tokens includes the [SEP] token itself.
|
| 100 |
num_seg_a = sep_index + 1
|
| 101 |
-
|
| 102 |
-
# The remainder are segment B.
|
| 103 |
-
num_seg_b = len(input) - num_seg_a
|
| 104 |
-
|
| 105 |
-
# Construct the list of 0s and 1s.
|
| 106 |
-
segment_ids = [0]*num_seg_a + [1]*num_seg_b
|
| 107 |
-
|
| 108 |
-
# There should be a segment_id for every input token.
|
| 109 |
assert len(segment_ids) == len(input)
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
start_scores = outputs.start_logits
|
| 118 |
end_scores = outputs.end_logits
|
|
@@ -123,53 +145,32 @@ def answer_question(question):
|
|
| 123 |
print(max_start_score)
|
| 124 |
print(max_end_score)
|
| 125 |
|
| 126 |
-
|
| 127 |
-
# Find the tokens with the highest `start` and `end` scores.
|
| 128 |
-
|
| 129 |
-
answer_start = torch.argmax(start_scores)
|
| 130 |
-
answer_end = torch.argmax(end_scores)
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
# Get the string versions of the input tokens.
|
| 134 |
tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
|
|
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
# Select the remaining answer tokens and join them with whitespace.
|
| 140 |
-
for i in range(answer_start + 1, answer_end + 1):
|
| 141 |
-
|
| 142 |
-
# If it's a subword token, then recombine it with the previous token.
|
| 143 |
-
if tokens[i][0:2] == '##':
|
| 144 |
answer += tokens[i][2:]
|
| 145 |
-
|
| 146 |
-
# Otherwise, add a space then the token.
|
| 147 |
else:
|
| 148 |
-
answer +=
|
| 149 |
-
|
| 150 |
scores.append((max_start_score, max_end_score, answer))
|
| 151 |
|
| 152 |
# Compare scores for answers found and each paragraph and pick the most relevant.
|
|
|
|
| 153 |
|
| 154 |
-
final_answer = max(scores, key=lambda x: x[0] + x[1])[2]
|
| 155 |
-
|
| 156 |
-
return final_answer
|
| 157 |
|
| 158 |
# =====[ DEFINE INTERFACE ]===== #'
|
| 159 |
title = "Azza Knowledge Agent"
|
| 160 |
-
examples = [
|
| 161 |
-
["Where is the Eiffel Tower?"],
|
| 162 |
-
["What is the population of France?"]
|
| 163 |
-
]
|
| 164 |
demo = gr.Interface(
|
| 165 |
-
title
|
| 166 |
-
|
| 167 |
fn=answer_question,
|
| 168 |
-
inputs
|
| 169 |
-
outputs
|
| 170 |
examples=examples,
|
| 171 |
allow_flagging="never",
|
| 172 |
-
|
| 173 |
|
| 174 |
if __name__ == "__main__":
|
| 175 |
-
demo.launch()
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import wikipediaapi as wk
|
| 5 |
import wikipedia
|
| 6 |
+
import openai
|
| 7 |
from transformers import (
|
| 8 |
TokenClassificationPipeline,
|
| 9 |
AutoModelForTokenClassification,
|
| 10 |
AutoTokenizer,
|
| 11 |
BertForQuestionAnswering,
|
| 12 |
+
BertTokenizer,
|
| 13 |
)
|
| 14 |
from transformers.pipelines import AggregationStrategy
|
| 15 |
import torch
|
| 16 |
|
| 17 |
+
|
| 18 |
# =====[ DEFINE PIPELINE ]===== #
|
| 19 |
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
|
| 20 |
def __init__(self, model, *args, **kwargs):
|
|
|
|
| 22 |
model=AutoModelForTokenClassification.from_pretrained(model),
|
| 23 |
tokenizer=AutoTokenizer.from_pretrained(model),
|
| 24 |
*args,
|
| 25 |
+
**kwargs,
|
| 26 |
)
|
| 27 |
|
| 28 |
def postprocess(self, model_outputs):
|
|
|
|
| 32 |
)
|
| 33 |
return np.unique([result.get("word").strip() for result in results])
|
| 34 |
|
| 35 |
+
|
| 36 |
# =====[ LOAD PIPELINE ]===== #
# Keyphrase extractor: turns a free-form question into Wikipedia search terms.
keyPhraseExtractionModel = "ml6team/keyphrase-extraction-kbir-inspec"
extractor = KeyphraseExtractionPipeline(model=keyPhraseExtractionModel)

# The question-answering model and its tokenizer are loaded from the same
# checkpoint; the name is defined once so the two cannot drift apart.
questionAnsweringModel = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = BertForQuestionAnswering.from_pretrained(questionAnsweringModel)
tokenizer = BertTokenizer.from_pretrained(questionAnsweringModel)
|
| 45 |
+
|
| 46 |
|
| 47 |
def keyphrases_extraction(text: str) -> np.ndarray:
    """Extract keyphrases from ``text`` with the module-level ``extractor``.

    Args:
        text: The text (here, the user's question) to analyse.

    Returns:
        Whatever ``extractor`` returns for ``text``. Its ``postprocess`` step
        ends in ``np.unique(...)``, so this is presumably an array of unique,
        stripped keyphrase strings (possibly empty) -- callers rely only on
        ``len()`` and indexing.
    """
    return extractor(text)
|
| 50 |
|
| 51 |
+
|
| 52 |
def wikipedia_search(input: str) -> str:
    """Return the summary of the first usable Wikipedia page for a question.

    The question is reduced to keyphrases, the first keyphrase (or Wikipedia's
    suggested spelling of it) is searched, and the search results are scanned
    in order for a page that exists and whose summary contains a full stop.

    Args:
        input: The user's question; newlines are treated as spaces.

    Returns:
        The page summary on success,
        ``"Can you add more details to your question?"`` when no keyphrase
        could be extracted, or ``"I cannot answer this question"`` when the
        lookup fails or no suitable page is found.
    """
    input = input.replace("\n", " ")
    keyphrases = keyphrases_extraction(input)

    # The extractor's result may be a NumPy array, whose truth value is
    # ambiguous, so its length is tested explicitly rather than `not keyphrases`.
    if len(keyphrases) == 0:
        return "Can you add more details to your question?"

    wiki = wk.Wikipedia("en")

    try:
        # Prefer Wikipedia's own spelling suggestion when it offers one.
        query = wikipedia.suggest(keyphrases[0])
        if query is None:
            query = keyphrases[0]

        for result_title in wikipedia.search(query):
            page = wiki.page(result_title)
            # Check existence first so a missing page's summary is never read;
            # a summary without "." is treated as not being a real answer.
            if page.exists() and "." in page.summary:
                return page.summary
    except Exception:
        # Best-effort lookup: any library or network failure becomes the
        # generic reply. Unlike a bare `except:`, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return "I cannot answer this question"

    # Search returned nothing, or none of the results was usable.
    return "I cannot answer this question"
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
|
| 81 |
+
def answer_question(question):
|
| 82 |
context = wikipedia_search(question)
|
| 83 |
+
if (context == "I cannot answer this question") or (
|
| 84 |
+
context == "Can you add more details to your question?"
|
| 85 |
+
):
|
| 86 |
return context
|
| 87 |
|
| 88 |
+
# Tokenize
|
| 89 |
# Apply the tokenizer to the input text, treating them as a text-pair.
|
|
|
|
| 90 |
input_ids = tokenizer.encode(question, context)
|
| 91 |
+
question_ids = input_ids[: input_ids.index(tokenizer.sep_token_id) + 1]
|
| 92 |
|
| 93 |
# Report how long the input sequence is. If it is longer than 512 tokens, divide it into multiple sequences.
|
| 94 |
length_of_group = 512 - len(question_ids)
|
| 95 |
+
input_ids_without_question = input_ids[
|
| 96 |
+
input_ids.index(tokenizer.sep_token_id) + 1 :
|
| 97 |
+
]
|
| 98 |
+
print(
|
| 99 |
+
f"Query has {len(input_ids)} tokens, divided in {len(input_ids_without_question)//length_of_group + 1}.\n"
|
| 100 |
+
)
|
| 101 |
|
| 102 |
input_ids_split = []
|
| 103 |
+
for group in range(len(input_ids_without_question) // length_of_group + 1):
|
| 104 |
+
input_ids_split.append(
|
| 105 |
+
question_ids
|
| 106 |
+
+ input_ids_without_question[
|
| 107 |
+
length_of_group * group : length_of_group * (group + 1) - 1
|
| 108 |
+
]
|
| 109 |
+
)
|
| 110 |
+
input_ids_split.append(
|
| 111 |
+
question_ids
|
| 112 |
+
+ input_ids_without_question[
|
| 113 |
+
length_of_group
|
| 114 |
+
* (len(input_ids_without_question) // length_of_group + 1) : len(
|
| 115 |
+
input_ids_without_question
|
| 116 |
+
)
|
| 117 |
+
- 1
|
| 118 |
+
]
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
scores = []
|
| 122 |
for input in input_ids_split:
|
| 123 |
+
# set Segment IDs
|
| 124 |
+
# Search the input_ids for the first instance of the `[SEP]` token.
|
| 125 |
sep_index = input.index(tokenizer.sep_token_id)
|
|
|
|
|
|
|
| 126 |
num_seg_a = sep_index + 1
|
| 127 |
+
segment_ids = [0] * num_seg_a + [1] * (len(input) - num_seg_a)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
assert len(segment_ids) == len(input)
|
| 129 |
|
| 130 |
+
# evaluate the model
|
| 131 |
+
outputs = model(
|
| 132 |
+
torch.tensor([input]), # The tokens representing our input text.
|
| 133 |
+
token_type_ids=torch.tensor(
|
| 134 |
+
[segment_ids]
|
| 135 |
+
), # The segment IDs to differentiate question from answer_text
|
| 136 |
+
return_dict=True,
|
| 137 |
+
)
|
| 138 |
|
| 139 |
start_scores = outputs.start_logits
|
| 140 |
end_scores = outputs.end_logits
|
|
|
|
| 145 |
print(max_start_score)
|
| 146 |
print(max_end_score)
|
| 147 |
|
| 148 |
+
# reconstruct answer from the tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
| 150 |
+
answer = tokens[torch.argmax(start_scores)]
|
| 151 |
|
| 152 |
+
for i in range(torch.argmax(start_scores) + 1, torch.argmax(end_scores) + 1):
|
| 153 |
+
if tokens[i][0:2] == "##":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
answer += tokens[i][2:]
|
|
|
|
|
|
|
| 155 |
else:
|
| 156 |
+
answer += " " + tokens[i]
|
|
|
|
| 157 |
scores.append((max_start_score, max_end_score, answer))
|
| 158 |
|
| 159 |
# Compare scores for answers found and each paragraph and pick the most relevant.
|
| 160 |
+
return max(scores, key=lambda x: x[0] + x[1])[2]
|
| 161 |
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
# =====[ DEFINE INTERFACE ]===== #
title = "Azza Knowledge Agent"

# Sample questions offered in the UI; each inner list is one example input.
examples = [
    ["Where is the Eiffel Tower?"],
    ["What is the population of France?"],
]

# Text-in / text-out interface around answer_question, with flagging disabled.
demo = gr.Interface(
    fn=answer_question,
    inputs="text",
    outputs="text",
    title=title,
    examples=examples,
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()
|