Spaces:
Runtime error
Runtime error
Create a version for summarization using LangChain tools.
Browse files — functions.py (+46 −7)
functions.py
CHANGED
|
@@ -1,11 +1,16 @@
|
|
| 1 |
import os
|
| 2 |
-
import requests
|
| 3 |
import random
|
|
|
|
|
|
|
| 4 |
import torch
|
| 5 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
from peft import PeftConfig, PeftModel
|
| 7 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer,
|
| 8 |
-
|
| 9 |
|
| 10 |
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 11 |
|
|
@@ -21,6 +26,8 @@ shared = {
|
|
| 21 |
'full_text': None,
|
| 22 |
}
|
| 23 |
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def get_nearest_examples(question: str, k: int):
|
| 26 |
print(['get_nearest_examples', 'start'])
|
|
@@ -81,7 +88,33 @@ def split_text(text: str):
|
|
| 81 |
return lines
|
| 82 |
|
| 83 |
|
| 84 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
print(['summarize_text', 'start'])
|
| 86 |
input_text = f'<s>Instruction: Elabora un resume del siguiente texto.\nInput: {text}\nOutput: '
|
| 87 |
batch = tokenizer(input_text, return_tensors='pt')
|
|
@@ -145,6 +178,7 @@ def answer_question(question: str):
|
|
| 145 |
max_new_tokens=256,
|
| 146 |
generation_config=generation_config)
|
| 147 |
output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
|
|
|
|
| 148 |
print(['answer_question', 'end'])
|
| 149 |
return output
|
| 150 |
|
|
@@ -165,7 +199,7 @@ def load_model(peft_model_id):
|
|
| 165 |
return model, tokenizer
|
| 166 |
|
| 167 |
|
| 168 |
-
def load_embeddings_model(model_ckpt:str):
|
| 169 |
print(['load_embeddings_model', 'start'])
|
| 170 |
print(['load_embeddings_model', 'loading tokenizer'])
|
| 171 |
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
|
|
@@ -176,5 +210,10 @@ def load_embeddings_model(model_ckpt:str):
|
|
| 176 |
return model, tokenizer
|
| 177 |
|
| 178 |
|
| 179 |
-
model, tokenizer = load_model(
|
| 180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import random
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
import torch
|
| 6 |
from bs4 import BeautifulSoup
|
| 7 |
+
from datasets import Dataset
|
| 8 |
+
from langchain.docstore.document import Document
|
| 9 |
+
from langchain.llms import HuggingFacePipeline
|
| 10 |
+
from langchain.text_splitter import CharacterTextSplitter
|
| 11 |
from peft import PeftConfig, PeftModel
|
| 12 |
+
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
|
| 13 |
+
GenerationConfig, pipeline)
|
| 14 |
|
| 15 |
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 16 |
|
|
|
|
| 26 |
'full_text': None,
|
| 27 |
}
|
| 28 |
|
| 29 |
+
text_splitter = CharacterTextSplitter()
|
| 30 |
+
|
| 31 |
|
| 32 |
def get_nearest_examples(question: str, k: int):
|
| 33 |
print(['get_nearest_examples', 'start'])
|
|
|
|
| 88 |
return lines
|
| 89 |
|
| 90 |
|
| 91 |
+
def remove_prompt(text: str) -> str:
    """Extract the model's answer from a generated string.

    Drops everything up to and including the ``'Output: '`` marker, trims
    surrounding whitespace, and removes any leftover ``'Input: '`` markers.

    Raises:
        ValueError: if ``'Output: '`` does not occur in ``text`` (callers
        rely on the prompt marker being present in the generation).
    """
    marker = 'Output: '
    # index() intentionally raises when the marker is missing.
    start = text.index(marker) + len(marker)
    answer = text[start:].strip()
    # Scrub any echoed input marker from the answer.
    return answer.replace('Input: ', '')
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def summarize_text(text: str) -> str:
    """Summarize ``text`` chunk by chunk with the LangChain-backed pipeline.

    Splits the input with the module-level ``text_splitter``, builds one
    instruction prompt per chunk, generates with the module-level ``pipe``,
    strips the echoed prompt from each generation, and joins the per-chunk
    summaries with blank lines.
    """
    print(['summarize_text', 'start'])

    print(['summarize_text', 'splitting text'])
    chunks = text_splitter.split_text(text)
    chunk_docs = [Document(page_content=chunk) for chunk in chunks]
    prompts = []
    for doc in chunk_docs:
        prompts.append(
            f'<s>Instruction: Elabora un resume del siguiente texto.\nInput: {doc.page_content}\nOutput: ')

    print(['summarize_text', 'generating'])
    generations = pipe(prompts)
    cleaned = [remove_prompt(gen['generated_text']) for gen in generations]
    result = '\n\n'.join(cleaned)

    print(['summarize_text', 'end'])
    return result
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def summarize_text_v1(text: str):
|
| 118 |
print(['summarize_text', 'start'])
|
| 119 |
input_text = f'<s>Instruction: Elabora un resume del siguiente texto.\nInput: {text}\nOutput: '
|
| 120 |
batch = tokenizer(input_text, return_tensors='pt')
|
|
|
|
| 178 |
max_new_tokens=256,
|
| 179 |
generation_config=generation_config)
|
| 180 |
output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
|
| 181 |
+
output = output.replace(input_text, '')
|
| 182 |
print(['answer_question', 'end'])
|
| 183 |
return output
|
| 184 |
|
|
|
|
| 199 |
return model, tokenizer
|
| 200 |
|
| 201 |
|
| 202 |
+
def load_embeddings_model(model_ckpt: str):
|
| 203 |
print(['load_embeddings_model', 'start'])
|
| 204 |
print(['load_embeddings_model', 'loading tokenizer'])
|
| 205 |
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
|
|
|
|
| 210 |
return model, tokenizer
|
| 211 |
|
| 212 |
|
| 213 |
+
# Eager import-time setup: download and load the PEFT-finetuned summarization
# model, wrap it in a transformers pipeline for LangChain, and load a separate
# sentence-embedding model (presumably used by get_nearest_examples; verify
# against the retrieval code). Heavy side effects: network + GPU/CPU memory.
model, tokenizer = load_model(
    "hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
# NOTE(review): the base checkpoint is loaded via AutoModelForCausalLM, but the
# pipeline task is "text2text-generation" (encoder-decoder) — confirm the task
# string matches the model class.
pipe = pipeline("text2text-generation", model=model,
                tokenizer=tokenizer, max_new_tokens=100)
llm = HuggingFacePipeline(pipeline=pipe)
emb_model, emb_tokenizer = load_embeddings_model(
    "sentence-transformers/multi-qa-mpnet-base-dot-v1")
|