csvaldellon committed on
Commit
f0c8e2c
·
1 Parent(s): a794ddd

feat: initial commit

Browse files
Files changed (3) hide show
  1. Dockerfile +19 -0
  2. app/main.py +45 -0
  3. app/utils.py +196 -0
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for serving the QA search FastAPI app (see app/main.py).
FROM python:3.8.12-slim-buster

# Install wget (used by download_model.sh) and the Python runtime deps;
# clearing the apt lists keeps the layer small.
RUN apt-get update &&\
apt-get install -y --no-install-recommends wget &&\
rm -rf /var/lib/apt/lists/* &&\
pip install --no-cache-dir transformers[torch] uvicorn fastapi

# adds the script defining the QA model to docker
COPY download_model.sh .

# Downloads the required QA model at build time so the container
# does not need network access at startup.
RUN bash download_model.sh

# copies the app files to the docker image
COPY app/ app/

# runs our application at the start of the docker image
CMD ["python", "app/main.py"]
app/main.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import uvicorn
from fastapi import FastAPI, Request
from utils import QASearcher

# FastAPI application instance; served by uvicorn from the __main__ block below.
app = FastAPI()
6
+
7
+
8
@app.post("/set_context")
async def set_context(data: Request):
    """
    Fastapi POST method that sets the QA context for search.

    Args:
        data(`dict`): Two fields required 'questions' (`list` of `str`)
            and 'answers' (`list` of `str`)
    """
    payload = await data.json()
    qa_search.set_context_qa(payload["questions"], payload["answers"])
    return {"message": "Search context set"}
21
+
22
+
23
@app.post("/get_answer")
async def get_answer(data: Request):
    """
    Fastapi POST method that gets the best question and answer
    in the set context.

    Args:
        data(`dict`): One field required 'questions' (`list` of `str`)

    Returns:
        A `dict` containing the original question ('orig_q'), the most similar
        question in the context ('best_q') and the associated answer ('best_a').
    """
    payload = await data.json()
    # batch=1: requests are expected to carry only a handful of questions
    return qa_search.get_answers(payload["questions"], batch=1)
40
+
41
+
42
# initialises the QA model and starts the uvicorn app
if __name__ == "__main__":
    # NOTE(review): qa_search is only bound when this file is run directly;
    # the endpoints above reference it as a module-level global, so running
    # via `uvicorn main:app` would raise NameError — confirm intended usage.
    qa_search = QASearcher()
    uvicorn.run(app, host="0.0.0.0", port=8000)
app/utils.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModel, AutoTokenizer
3
+
4
+
5
class QAEmbedder:
    def __init__(self, model_name="paraphrase-MiniLM-L6-v2"):
        """
        Defines a QA embedding model. This is, given a set of questions,
        this class returns the corresponding embedding vectors.

        Args:
            model_name (`str`): Directory containing the necessary tokenizer
                and model files.
        """
        self.model = None
        self.tokenizer = None
        self.model_name = model_name
        self.set_model(model_name)

    def get_model(self, model_name):
        """
        Loads a general tokenizer and model using 'AutoTokenizer'
        and 'AutoModel'.

        Args:
            model_name (`str`): Directory containing the necessary tokenizer
                and model files.

        Returns:
            A `(model, tokenizer)` tuple.
        """
        model = AutoModel.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return model, tokenizer

    def set_model(self, model_name):
        """
        Loads and stores the tokenizer and model for 'model_name'.

        Args:
            model_name (`str`): Directory containing the necessary tokenizer
                and model files.
        """
        # Bug fix: the original ignored the 'model_name' argument and always
        # reloaded 'self.model_name', so this method could never switch to a
        # different model after construction.
        self.model, self.tokenizer = self.get_model(model_name)
        self.model_name = model_name  # keep the stored name in sync

    def _mean_pooling(self, model_output, attention_mask):
        """
        Internal method that takes a model output and an attention
        mask and outputs a mean pooling layer.

        Only non-padding tokens (mask == 1) contribute to the mean; the
        denominator is clamped to avoid division by zero on all-pad rows.

        Args:
            model_output (`torch.Tensor`): output from the QA model
            attention_mask (`torch.Tensor`): attention mask defined in the QA tokenizer

        Returns:
            The averaged tensor.
        """
        token_embeddings = model_output[0]

        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )

        pool_emb = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

        return pool_emb

    def get_embeddings(self, questions, batch=32):
        """
        Gets the corresponding embeddings for a set of input 'questions'.

        Args:
            questions (`list` of `str`): List of strings defining the questions to be embedded
            batch (`int`): Performs the embedding job 'batch' questions at a time

        Returns:
            The embedding vectors (one row per question).
        """
        question_embeddings = []
        for i in range(0, len(questions), batch):

            # Tokenize sentences
            encoded_input = self.tokenizer(
                questions[i : i + batch],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )

            # Compute token embeddings (no gradients needed for inference)
            with torch.no_grad():
                model_output = self.model(**encoded_input)

            # Perform mean pooling
            batch_embeddings = self._mean_pooling(
                model_output, encoded_input["attention_mask"]
            )
            question_embeddings.append(batch_embeddings)

        question_embeddings = torch.cat(question_embeddings, dim=0)
        return question_embeddings
102
+
103
+
104
class QASearcher:
    def __init__(self, model_name="paraphrase-MiniLM-L6-v2"):
        """
        Semantic QA search over a stored 'context' of question/answer pairs.

        Given a new question, finds the most similar context question and
        returns both that question and its associated answer.

        Args:
            model_name (`str`): Directory containing the necessary tokenizer
                and model files.
        """
        self.questions = None
        self.answers = None
        self.question_embeddings = None
        self.embedder = QAEmbedder(model_name=model_name)

    def set_context_qa(self, questions, answers):
        """
        Sets the QA context to be used during search.

        Args:
            questions (`list` of `str`): List of strings defining the questions to be embedded
            answers (`list` of `str`): Best answer for each question in 'questions'
        """
        self.questions = questions
        self.answers = answers
        # Pre-compute the context embeddings once so each search is a
        # single matrix product.
        self.question_embeddings = self.get_q_embeddings(questions)

    def get_q_embeddings(self, questions):
        """
        Gets the embeddings for the questions in 'context'.

        Args:
            questions (`list` of `str`): List of strings defining the questions to be embedded

        Returns:
            The L2-normalized embedding matrix, transposed so it is ready
            for the similarity matrix product.
        """
        normed = torch.nn.functional.normalize(
            self.embedder.get_embeddings(questions), p=2, dim=1
        )
        return normed.transpose(0, 1)

    def cosine_similarity(self, questions, batch=32):
        """
        Gets the cosine similarity between the new questions and the 'context' questions.

        Args:
            questions (`list` of `str`): List of strings defining the questions to be embedded
            batch (`int`): Performs the embedding job 'batch' questions at a time

        Returns:
            The cosine similarity
        """
        new_embeddings = self.embedder.get_embeddings(questions, batch=batch)
        new_embeddings = torch.nn.functional.normalize(new_embeddings, p=2, dim=1)
        # Both operands are unit vectors, so the matrix product is the cosine.
        return torch.mm(new_embeddings, self.question_embeddings)

    def get_answers(self, questions, batch=32):
        """
        Gets the best answers in the stored 'context' for the given new 'questions'.

        Args:
            questions (`list` of `str`): List of strings defining the questions to be embedded
            batch (`int`): Performs the embedding job 'batch' questions at a time

        Returns:
            A `list` of `dict`'s containing the original question ('orig_q'), the most similar
            question in the context ('best_q') and the associated answer ('best_a').
        """
        similarity = self.cosine_similarity(questions, batch=batch)

        response = []
        for orig_q, row in zip(questions, similarity):
            best = row.argmax()
            response.append(
                {
                    "orig_q": orig_q,
                    "best_q": self.questions[best],
                    "best_a": self.answers[best],
                }
            )
        return response