Spaces:
Sleeping
Sleeping
discard short sentences
Browse files- app.py +18 -1
- lexrank.py +1 -1
app.py
CHANGED
|
@@ -17,6 +17,24 @@ def summarize(in_text):
|
|
| 17 |
print("downloading punkt file")
|
| 18 |
nltk.download('punkt')
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
target_tokens = 1024
|
| 21 |
|
| 22 |
in_sents = metrics.num_sentences(in_text)
|
|
@@ -25,7 +43,6 @@ def summarize(in_text):
|
|
| 25 |
n_tokens= metrics.num_tokens(out_text)
|
| 26 |
prev_n_tokens=0
|
| 27 |
for sen in range(2, in_sents):
|
| 28 |
-
#print(sen,in_sents,n_tokens)
|
| 29 |
if n_tokens >= target_tokens:
|
| 30 |
n_tokens = prev_n_tokens
|
| 31 |
break
|
|
|
|
| 17 |
print("downloading punkt file")
|
| 18 |
nltk.download('punkt')
|
| 19 |
|
| 20 |
+
in_longtext = []
|
| 21 |
+
# Discard all sentences that have 10 or fewer words in them
|
| 22 |
+
in_text_sentenses = in_text.split('.')
|
| 23 |
+
print(in_text_sentenses)
|
| 24 |
+
for sen in in_text_sentenses:
|
| 25 |
+
print(sen)
|
| 26 |
+
print(len(sen.split()))
|
| 27 |
+
if len(sen.split()) > 10:
|
| 28 |
+
in_longtext.append(sen)
|
| 29 |
+
in_text = '.'.join(in_longtext)+'.'
|
| 30 |
+
print('strip')
|
| 31 |
+
print(in_text)
|
| 32 |
+
|
| 33 |
+
# The size of the summary is limited to 1024 tokens
|
| 34 |
+
# The LexRank algorithm accepts only a number of sentences as a limit
|
| 35 |
+
# We start with one sentence and check the token size
|
| 36 |
+
# Then increase the number of sentences until the token size
|
| 37 |
+
# of the next sentence exceeds the limit
|
| 38 |
target_tokens = 1024
|
| 39 |
|
| 40 |
in_sents = metrics.num_sentences(in_text)
|
|
|
|
| 43 |
n_tokens= metrics.num_tokens(out_text)
|
| 44 |
prev_n_tokens=0
|
| 45 |
for sen in range(2, in_sents):
|
|
|
|
| 46 |
if n_tokens >= target_tokens:
|
| 47 |
n_tokens = prev_n_tokens
|
| 48 |
break
|
lexrank.py
CHANGED
|
@@ -23,7 +23,7 @@ def get_Summary(in_text, nr_sentences):
|
|
| 23 |
# all items from the LexRank summary must be concatenated and split up by full stops.
|
| 24 |
concat_list_summary = '. '.join([str(item).replace('.','') for item in list_summary])#.split('. ')
|
| 25 |
concat_list_summary = concat_list_summary.replace('\\n','')
|
| 26 |
-
concat_list_summary = concat_list_summary.replace('. ','.\n')
|
| 27 |
|
| 28 |
return concat_list_summary
|
| 29 |
|
|
|
|
| 23 |
# all items from the LexRank summary must be concatenated and split up by full stops.
|
| 24 |
concat_list_summary = '. '.join([str(item).replace('.','') for item in list_summary])#.split('. ')
|
| 25 |
concat_list_summary = concat_list_summary.replace('\\n','')
|
| 26 |
+
concat_list_summary = concat_list_summary.replace('. ','.\n')+'.'
|
| 27 |
|
| 28 |
return concat_list_summary
|
| 29 |
|