Spaces:
Runtime error
Runtime error
Commit
·
cb047cb
1
Parent(s):
eda5d31
add summary cleaning function
Browse files
app.py
CHANGED
|
@@ -25,10 +25,19 @@ def main() -> None:
|
|
| 25 |
tos_pipeline = pipeline(task="summarization",
|
| 26 |
model="ML-unipi/bart-large-tos",
|
| 27 |
tokenizer="ML-unipi/bart-large-tos",
|
| 28 |
-
device=0
|
| 29 |
)
|
| 30 |
return tos_pipeline
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def display_summary(summary_sentences: list) -> None:
|
| 33 |
st.subheader("Summary :male-detective:")
|
| 34 |
for sentence in summary_sentences:
|
|
@@ -56,7 +65,6 @@ def main() -> None:
|
|
| 56 |
cumulative_token_length = 0
|
| 57 |
|
| 58 |
for sentence in sentences:
|
| 59 |
-
# token_list = [token for token in nltk.word_tokenize(sentence)]
|
| 60 |
token_list = tokenizer(sentence, max_length=1024, truncation=True)
|
| 61 |
token_length = len(token_list["input_ids"])
|
| 62 |
if token_length > 10:
|
|
@@ -103,16 +111,9 @@ def main() -> None:
|
|
| 103 |
split_token_length=1024
|
| 104 |
)
|
| 105 |
for sentence in sentences:
|
| 106 |
-
# token_list = [token for token in nltk.word_tokenize(sentence)]
|
| 107 |
-
# st.markdown(sentence)
|
| 108 |
-
# st.markdown(str(len(token_list)))
|
| 109 |
output = pipe(sentence)
|
| 110 |
summary = output[0]["summary_text"]
|
| 111 |
-
|
| 112 |
-
for line in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', summary):
|
| 113 |
-
if line.find(".") != -1:
|
| 114 |
-
line = line.replace("..", ".")
|
| 115 |
-
summary_sentences.append(line)
|
| 116 |
display_summary(summary_sentences)
|
| 117 |
|
| 118 |
|
|
|
|
| 25 |
tos_pipeline = pipeline(task="summarization",
|
| 26 |
model="ML-unipi/bart-large-tos",
|
| 27 |
tokenizer="ML-unipi/bart-large-tos",
|
|
|
|
| 28 |
)
|
| 29 |
return tos_pipeline
|
| 30 |
|
| 31 |
+
def clean_summaries(text: str) -> list:
    """Split a summary blob into individual sentences and tidy them.

    The regex splits on whitespace that follows a sentence-ending '.' or '?',
    while the negative lookbehinds avoid splitting after common abbreviation
    shapes (e.g. "e.g." style "w.x." patterns and "Mr."-style "Aa." patterns).

    Args:
        text: Raw summary text produced by the summarization pipeline.

    Returns:
        A list of sentence strings, in original order, with doubled
        periods ("..") collapsed to a single "." in sentences that
        contain a period.
    """
    result = []
    # Sentence-boundary split; pattern matches the splitter used elsewhere
    # in this app for the pipeline output.
    lines = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    for line in lines:
        # Only sentences containing a period need the ".." -> "." cleanup;
        # question-only fragments are appended untouched.
        if "." in line:
            line = line.replace("..", ".")
        result.append(line)
    return result
|
| 39 |
+
|
| 40 |
+
|
| 41 |
def display_summary(summary_sentences: list) -> None:
|
| 42 |
st.subheader("Summary :male-detective:")
|
| 43 |
for sentence in summary_sentences:
|
|
|
|
| 65 |
cumulative_token_length = 0
|
| 66 |
|
| 67 |
for sentence in sentences:
|
|
|
|
| 68 |
token_list = tokenizer(sentence, max_length=1024, truncation=True)
|
| 69 |
token_length = len(token_list["input_ids"])
|
| 70 |
if token_length > 10:
|
|
|
|
| 111 |
split_token_length=1024
|
| 112 |
)
|
| 113 |
for sentence in sentences:
|
|
|
|
|
|
|
|
|
|
| 114 |
output = pipe(sentence)
|
| 115 |
summary = output[0]["summary_text"]
|
| 116 |
+
summary_sentences += clean_summaries(summary)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
display_summary(summary_sentences)
|
| 118 |
|
| 119 |
|