Spaces:
Runtime error
Runtime error
Store the extracted text so it can be used when creating the FAISS index
Browse files
- functions.py +7 -10
functions.py
CHANGED
|
@@ -18,13 +18,9 @@ device = 'cuda'
|
|
| 18 |
shared = {
|
| 19 |
'answer_context': None,
|
| 20 |
'embeddings_dataset': None,
|
| 21 |
-
'
|
| 22 |
}
|
| 23 |
|
| 24 |
-
def store_text_changes(text):
|
| 25 |
-
shared['base_text'] = text
|
| 26 |
-
|
| 27 |
-
|
| 28 |
def get_nearest_examples(question: str, k: int):
|
| 29 |
print(['get_nearest_examples', 'start'])
|
| 30 |
question_embedding = get_embeddings([question]).cpu().detach().numpy()
|
|
@@ -69,6 +65,7 @@ def extract_text(url: str):
|
|
| 69 |
response = requests.get(url)
|
| 70 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 71 |
text = '\n\n'.join(map(lambda p: p.text, soup.find_all('p')))
|
|
|
|
| 72 |
print(['extract_text', 'end'])
|
| 73 |
return text
|
| 74 |
|
|
@@ -121,10 +118,10 @@ def get_answer_context():
|
|
| 121 |
|
| 122 |
|
| 123 |
def answer_question(question: str):
|
| 124 |
-
return ', '.join([len(shared['base_text']), len(question)])
|
| 125 |
print(['answer_question', 'start'])
|
| 126 |
if not shared['embeddings_dataset']:
|
| 127 |
-
build_faiss_index(full_text)
|
| 128 |
top_k_samples = get_nearest_examples(question, k=5)
|
| 129 |
|
| 130 |
context = '\n'.join(top_k_samples)
|
|
@@ -170,7 +167,7 @@ def load_embeddings_model():
|
|
| 170 |
return model, tokenizer
|
| 171 |
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
|
| 176 |
-
|
|
|
|
| 18 |
shared = {
|
| 19 |
'answer_context': None,
|
| 20 |
'embeddings_dataset': None,
|
| 21 |
+
'full_text': None,
|
| 22 |
}
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
def get_nearest_examples(question: str, k: int):
|
| 25 |
print(['get_nearest_examples', 'start'])
|
| 26 |
question_embedding = get_embeddings([question]).cpu().detach().numpy()
|
|
|
|
| 65 |
response = requests.get(url)
|
| 66 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 67 |
text = '\n\n'.join(map(lambda p: p.text, soup.find_all('p')))
|
| 68 |
+
shared['full_text'] = text
|
| 69 |
print(['extract_text', 'end'])
|
| 70 |
return text
|
| 71 |
|
|
|
|
| 118 |
|
| 119 |
|
| 120 |
def answer_question(question: str):
|
| 121 |
+
# return ', '.join([len(shared['base_text']), len(question)])
|
| 122 |
print(['answer_question', 'start'])
|
| 123 |
if not shared['embeddings_dataset']:
|
| 124 |
+
build_faiss_index(shared['full_text'])
|
| 125 |
top_k_samples = get_nearest_examples(question, k=5)
|
| 126 |
|
| 127 |
context = '\n'.join(top_k_samples)
|
|
|
|
| 167 |
return model, tokenizer
|
| 168 |
|
| 169 |
|
| 170 |
+
model, tokenizer = load_model(
|
| 171 |
+
"hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
|
| 172 |
|
| 173 |
+
emb_model, emb_tokenizer = load_embeddings_model()
|