Spaces:
Runtime error
Runtime error
Commit ·
0ad143b
1
Parent(s): 6ebd1a5
Update app.py
Browse files
app.py
CHANGED
|
@@ -13,7 +13,7 @@ import yake
|
|
| 13 |
from summarizer import Summarizer,TransformerSummarizer
|
| 14 |
from transformers import pipelines
|
| 15 |
nltk.download('punkt')
|
| 16 |
-
from transformers import AutoTokenizer, AutoModelForPreTraining
|
| 17 |
# model_name = 'distilbert-base-uncased'
|
| 18 |
model_name = 'nlpaueb/legal-bert-base-uncased'
|
| 19 |
#model_name = 'laxya007/gpt2_legal'
|
|
@@ -29,13 +29,52 @@ print('Using model {}\n'.format(model_name))
|
|
| 29 |
|
| 30 |
|
| 31 |
|
| 32 |
-
def
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
iface = gr.Interface(
|
| 38 |
-
|
| 39 |
"text",
|
| 40 |
"text"
|
| 41 |
)
|
|
|
|
| 13 |
from summarizer import Summarizer,TransformerSummarizer
|
| 14 |
from transformers import pipelines
|
| 15 |
nltk.download('punkt')
|
| 16 |
+
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoConfig
|
| 17 |
# model_name = 'distilbert-base-uncased'
|
| 18 |
model_name = 'nlpaueb/legal-bert-base-uncased'
|
| 19 |
#model_name = 'laxya007/gpt2_legal'
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
|
| 32 |
def lincoln(content=input_text):
    """Produce a two-level extractive summary of *content*.

    Cleans each blank-line-separated paragraph, summarizes it with
    ``bert_legal_model`` at a tight ratio (the "larger summary", one entry
    per paragraph), then summarizes the whole document at a coarser ratio,
    and returns both joined into one string.

    NOTE(review): the default ``input_text`` is a module-level name defined
    elsewhere in the file — it is evaluated once at import time; verify it
    exists before this def runs.

    :param content: full document text; paragraphs separated by "\n\n".
    :return: coarse whole-document summary followed by the per-paragraph
        summaries, as a single string.
    """
    summary_text = ""
    for paragraph in content.split("\n\n"):
        # Normalize whitespace: fold newlines/tabs away and collapse runs
        # of spaces so tokenization sees one clean line.
        paragraph = paragraph.replace('\n', ' ')
        paragraph = paragraph.replace('\t', '')
        paragraph = ' '.join(paragraph.split())
        # Keep only alphabetic tokens; skip paragraphs that are empty or a
        # single word after cleanup — nothing meaningful to summarize.
        tokens = word_tokenize(paragraph)
        tokens = [word for word in tokens if word.isalpha()]
        if len(tokens) <= 1:
            continue
        # Recreate the paragraph from the words-only token list.
        paragraph = ' '.join(tokens)
        print("\nParagraph:")
        print(paragraph + "\n")
        # T5-style models need a 'summarize:' prefix; this model does not:
        # text = "summarize:" + paragraph
        text = paragraph
        # BUG FIX: the original summarized `content` (the whole document)
        # here, which ignored all the per-paragraph cleanup above and made
        # every loop iteration produce the same summary. Summarize the
        # cleaned paragraph instead.
        summary = bert_legal_model(text, ratio=0.01)
        summary_text += str(summary) + "\n\n"
        print("Summary:")
        print(summary)

    # Coarser summary over the full, uncleaned document.
    summary = bert_legal_model(content, ratio=0.1)

    all_text = str(summary) + "\n\n\n" \
        + "-------- The Larger Summary --------\n" + str(summary_text)

    # BUG FIX: the original `return output_text = all_text` is a
    # SyntaxError — assignment is not an expression target in a return.
    return all_text
| 75 |
|
# Gradio UI: a single free-text input box wired to lincoln(), whose
# returned summary string is shown in a text output box.
iface = gr.Interface(
    fn=lincoln,
    inputs="text",
    outputs="text",
)