Spaces:
Runtime error
Runtime error
Fix error related to faiss
Browse files
- functions.py +19 -19
functions.py
CHANGED
|
@@ -29,8 +29,8 @@ def get_nearest_examples(question: str, k: int):
|
|
| 29 |
scores, samples = embeddings_dataset.get_nearest_examples(
|
| 30 |
"embeddings", question_embedding, k)
|
| 31 |
print(['get_nearest_examples', 'scores and samples'])
|
| 32 |
-
|
| 33 |
-
|
| 34 |
print(['get_nearest_examples', 'end'])
|
| 35 |
return samples
|
| 36 |
|
|
@@ -44,10 +44,6 @@ def get_embeddings(text):
|
|
| 44 |
encoded_input = {k: v.to('cuda') for k, v in encoded_input.items()}
|
| 45 |
model_output = emb_model(**encoded_input)
|
| 46 |
model_output = model_output.last_hidden_state[:, 0]
|
| 47 |
-
# print(model_output)
|
| 48 |
-
# Error: AttributeError: 'numpy.ndarray' object has no attribute 'cpu'
|
| 49 |
-
# emb_item = model_output.detach().cpu().numpy()[0]
|
| 50 |
-
# print(emb_item)
|
| 51 |
print(['get_embeddings', 'end'])
|
| 52 |
return model_output
|
| 53 |
|
|
@@ -56,9 +52,11 @@ def build_faiss_index(text):
|
|
| 56 |
print(['build_faiss_index', 'start'])
|
| 57 |
text_list = split_text(text)
|
| 58 |
emb_list = []
|
| 59 |
-
for item in text_list:
|
| 60 |
-
emb_list.append({
|
| 61 |
-
|
|
|
|
|
|
|
| 62 |
dataset = Dataset.from_list(emb_list)
|
| 63 |
dataset.add_faiss_index(column="embeddings")
|
| 64 |
shared['embeddings_dataset'] = dataset
|
|
@@ -125,13 +123,18 @@ def get_answer_context():
|
|
| 125 |
|
| 126 |
|
| 127 |
def answer_question(question: str):
|
| 128 |
-
# return ', '.join([len(shared['base_text']), len(question)])
|
| 129 |
print(['answer_question', 'start'])
|
|
|
|
|
|
|
| 130 |
if not shared['embeddings_dataset']:
|
| 131 |
-
build_faiss_index(
|
| 132 |
-
top_k_samples = get_nearest_examples(question, k=
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
-
context = '\n'.join(top_k_samples)
|
| 135 |
|
| 136 |
input_text = f"""<s>Instruction: Te voy a proporcionar un texto del cual deseo que me respondas una pregunta.
|
| 137 |
El texto es el siguiente: `{context}`\nInput: {question}\nOutput: """
|
|
@@ -162,9 +165,8 @@ def load_model(peft_model_id):
|
|
| 162 |
return model, tokenizer
|
| 163 |
|
| 164 |
|
| 165 |
-
def load_embeddings_model():
|
| 166 |
print(['load_embeddings_model', 'start'])
|
| 167 |
-
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
|
| 168 |
print(['load_embeddings_model', 'loading tokenizer'])
|
| 169 |
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
|
| 170 |
print(['load_embeddings_model', 'loading model'])
|
|
@@ -174,7 +176,5 @@ def load_embeddings_model():
|
|
| 174 |
return model, tokenizer
|
| 175 |
|
| 176 |
|
| 177 |
-
model, tokenizer = load_model(
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
emb_model, emb_tokenizer = load_embeddings_model()
|
|
|
|
| 29 |
scores, samples = embeddings_dataset.get_nearest_examples(
|
| 30 |
"embeddings", question_embedding, k)
|
| 31 |
print(['get_nearest_examples', 'scores and samples'])
|
| 32 |
+
print(scores)
|
| 33 |
+
print(samples['id'])
|
| 34 |
print(['get_nearest_examples', 'end'])
|
| 35 |
return samples
|
| 36 |
|
|
|
|
| 44 |
encoded_input = {k: v.to('cuda') for k, v in encoded_input.items()}
|
| 45 |
model_output = emb_model(**encoded_input)
|
| 46 |
model_output = model_output.last_hidden_state[:, 0]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
print(['get_embeddings', 'end'])
|
| 48 |
return model_output
|
| 49 |
|
|
|
|
| 52 |
print(['build_faiss_index', 'start'])
|
| 53 |
text_list = split_text(text)
|
| 54 |
emb_list = []
|
| 55 |
+
for i, item in enumerate(text_list):
|
| 56 |
+
emb_list.append({
|
| 57 |
+
"embeddings": get_embeddings(item).cpu().detach().numpy()[0],
|
| 58 |
+
'id': i
|
| 59 |
+
})
|
| 60 |
dataset = Dataset.from_list(emb_list)
|
| 61 |
dataset.add_faiss_index(column="embeddings")
|
| 62 |
shared['embeddings_dataset'] = dataset
|
|
|
|
| 123 |
|
| 124 |
|
| 125 |
def answer_question(question: str):
|
|
|
|
| 126 |
print(['answer_question', 'start'])
|
| 127 |
+
full_text = shared['full_text']
|
| 128 |
+
|
| 129 |
if not shared['embeddings_dataset']:
|
| 130 |
+
build_faiss_index(full_text)
|
| 131 |
+
top_k_samples = get_nearest_examples(question, k=3)
|
| 132 |
+
|
| 133 |
+
index_text = {}
|
| 134 |
+
for i, t in enumerate(split_text(full_text)):
|
| 135 |
+
index_text[i] = t
|
| 136 |
|
| 137 |
+
context = '\n'.join([index_text[id] for id in top_k_samples['id']])
|
| 138 |
|
| 139 |
input_text = f"""<s>Instruction: Te voy a proporcionar un texto del cual deseo que me respondas una pregunta.
|
| 140 |
El texto es el siguiente: `{context}`\nInput: {question}\nOutput: """
|
|
|
|
| 165 |
return model, tokenizer
|
| 166 |
|
| 167 |
|
| 168 |
+
def load_embeddings_model(model_ckpt:str):
|
| 169 |
print(['load_embeddings_model', 'start'])
|
|
|
|
| 170 |
print(['load_embeddings_model', 'loading tokenizer'])
|
| 171 |
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
|
| 172 |
print(['load_embeddings_model', 'loading model'])
|
|
|
|
| 176 |
return model, tokenizer
|
| 177 |
|
| 178 |
|
| 179 |
+
model, tokenizer = load_model("hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
|
| 180 |
+
emb_model, emb_tokenizer = load_embeddings_model("sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
|
|
|
|
|