Spaces:
Runtime error
Runtime error
Commit ·
0ad143b
1
Parent(s): 6ebd1a5
Update app.py
Browse files
app.py
CHANGED
|
@@ -13,7 +13,7 @@ import yake
|
|
| 13 |
from summarizer import Summarizer,TransformerSummarizer
|
| 14 |
from transformers import pipelines
|
| 15 |
nltk.download('punkt')
|
| 16 |
-
from transformers import AutoTokenizer, AutoModelForPreTraining
|
| 17 |
# model_name = 'distilbert-base-uncased'
|
| 18 |
model_name = 'nlpaueb/legal-bert-base-uncased'
|
| 19 |
#model_name = 'laxya007/gpt2_legal'
|
|
@@ -29,13 +29,52 @@ print('Using model {}\n'.format(model_name))
|
|
| 29 |
|
| 30 |
|
| 31 |
|
| 32 |
-
def
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
iface = gr.Interface(
|
| 38 |
-
|
| 39 |
"text",
|
| 40 |
"text"
|
| 41 |
)
|
|
|
|
| 13 |
from summarizer import Summarizer,TransformerSummarizer
|
| 14 |
from transformers import pipelines
|
| 15 |
nltk.download('punkt')
|
| 16 |
+
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoConfig
|
| 17 |
# model_name = 'distilbert-base-uncased'
|
| 18 |
model_name = 'nlpaueb/legal-bert-base-uncased'
|
| 19 |
#model_name = 'laxya007/gpt2_legal'
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
|
| 32 |
def lincoln(content=input_text):
    """Produce a two-level extractive summary of *content*.

    Cleans each blank-line-separated paragraph, summarizes it with
    ``bert_legal_model`` at a tight ratio (the "larger summary", one entry
    per paragraph), then summarizes the whole document at a coarser ratio,
    and returns both joined into one string.

    NOTE(review): the default ``input_text`` is a module-level name defined
    elsewhere in the file — it is evaluated once at import time; verify it
    exists before this def runs.

    :param content: full document text; paragraphs separated by "\n\n".
    :return: coarse whole-document summary followed by the per-paragraph
        summaries, as a single string.
    """
    summary_text = ""
    for paragraph in content.split("\n\n"):
        # Normalize whitespace: fold newlines/tabs away and collapse runs
        # of spaces so tokenization sees one clean line.
        paragraph = paragraph.replace('\n', ' ')
        paragraph = paragraph.replace('\t', '')
        paragraph = ' '.join(paragraph.split())
        # Keep only alphabetic tokens; skip paragraphs that are empty or a
        # single word after cleanup — nothing meaningful to summarize.
        tokens = word_tokenize(paragraph)
        tokens = [word for word in tokens if word.isalpha()]
        if len(tokens) <= 1:
            continue
        # Recreate the paragraph from the words-only token list.
        paragraph = ' '.join(tokens)
        print("\nParagraph:")
        print(paragraph + "\n")
        # T5-style models need a 'summarize:' prefix; this model does not:
        # text = "summarize:" + paragraph
        text = paragraph
        # BUG FIX: the original summarized `content` (the whole document)
        # here, which ignored all the per-paragraph cleanup above and made
        # every loop iteration produce the same summary. Summarize the
        # cleaned paragraph instead.
        summary = bert_legal_model(text, ratio=0.01)
        summary_text += str(summary) + "\n\n"
        print("Summary:")
        print(summary)

    # Coarser summary over the full, uncleaned document.
    summary = bert_legal_model(content, ratio=0.1)

    all_text = str(summary) + "\n\n\n" \
        + "-------- The Larger Summary --------\n" + str(summary_text)

    # BUG FIX: the original `return output_text = all_text` is a
    # SyntaxError — assignment is not an expression target in a return.
    return all_text
| 75 |
|
# Gradio UI: a single free-text input box wired to lincoln(), whose
# returned summary string is shown in a text output box.
iface = gr.Interface(
    fn=lincoln,
    inputs="text",
    outputs="text",
)