# modules/language_processing/language_components.py
# Author: Garcia Estrada
# Depends on: gspread
# Revision: 5383e68
import os
import re

import gspread
import openai
from nltk import sent_tokenize, word_tokenize
class Sentence:
    """A single sentence together with the review feedback attached to it.

    Attributes:
        text: the raw sentence text.
        length: cached character count of ``text``.
        error_message: feedback message produced by the checking pipeline.
        suggestions: proposed corrections for this sentence.
    """

    def __init__(self, text, error_message, suggestions):
        self.text = text
        # Cache the length once so callers don't recompute it.
        self.length = len(text)
        self.error_message = error_message
        self.suggestions = suggestions

    def __repr__(self):
        # Debug-friendly representation; added for inspectability,
        # does not change any existing caller-visible behavior.
        return (f"{type(self).__name__}(text={self.text!r}, "
                f"error_message={self.error_message!r}, "
                f"suggestions={self.suggestions!r})")
class LanguageComponentSplitter:
    """Splits raw text into sentences after normalizing its whitespace."""

    # Pre-compiled, hoisted out of the per-call path. The raw string fixes
    # the original '\s+' literal, whose unescaped backslash is a
    # DeprecationWarning (invalid escape sequence) on modern Python.
    _WHITESPACE = re.compile(r"\s+")

    def remove_unwanted_characters(self, text):
        """Collapse every run of whitespace in *text* into a single space."""
        return self._WHITESPACE.sub(" ", text)

    def split_sentences(self, text):
        """Return the sentences of *text* as a list.

        Whitespace is normalized first, then NLTK's ``sent_tokenize``
        performs the actual sentence boundary detection.
        """
        return sent_tokenize(self.remove_unwanted_characters(text))
class Summarization_task:
    """Wraps third-party LLM services for Spanish legal copy-editing.

    Only the OpenAI backend is functional; the Bard backend is a stub
    pending an API token (it previously referenced an undefined ``Bard``
    name and could only raise ``NameError``).
    """

    def summarize_gpt(self, text):
        """Send *text* to the OpenAI completion API and return the model's
        Spanish list of errors and corrections.

        The API key is read from the ``OPENAI_API_KEY`` environment
        variable. SECURITY: the key that used to be hard-coded here was
        committed to source control and must be treated as leaked/revoked.
        """
        openai.api_key = os.environ.get("OPENAI_API_KEY", openai.api_key)
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt="Pretend you are a legal copywriter give me a list of the errors you find and then provide a correction in a list, only use spanish: " + text,
            temperature=0.7,
            max_tokens=1000,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=0.0,
        )
        return response["choices"][0]["text"]

    # To be implemented when API key is obtained.
    def summarize_bard(self, text):
        """Placeholder for the Bard backend.

        Raises:
            NotImplementedError: always — no Bard client or token is
            configured yet (the old body referenced an undefined ``Bard``
            name and would have crashed with ``NameError`` instead).
        """
        raise NotImplementedError(
            "Bard integration pending: no API token/client configured."
        )
class Utilities:
    """Helper routines: word tokenization plus Google Sheets persistence."""

    def specialized_words(self, sentence):
        """Tokenize *sentence* into individual words via NLTK."""
        return word_tokenize(sentence)

    def connect_to_cloud(self):
        """Authenticate with a service account and return the first
        worksheet of the training spreadsheet."""
        client = gspread.service_account(filename="creds.json")
        return client.open("entrenamiento_txt_jur").sheet1

    def write_on_sheet(self, text_raw, text_proc):
        """Append the raw/processed text pair on the next free row
        (raw in column A, processed in column B)."""
        worksheet = self.connect_to_cloud()
        row = self.find_next_available_cell(worksheet)
        for column, value in (("A", text_raw), ("B", text_proc)):
            worksheet.update(f"{column}{row}", value)
        return "Printed on db"

    def find_next_available_cell(self, sh):
        """Return, as a string, the index of the first empty row in
        column A of worksheet *sh* (empty/falsy cells are skipped)."""
        filled = [cell for cell in sh.col_values(1) if cell]
        return str(len(filled) + 1)