Spaces:
Runtime error
Runtime error
Commit
·
69f90b2
1
Parent(s):
92212fb
working on text splitting
Browse files
app.py
CHANGED
|
@@ -98,20 +98,21 @@ def main() -> None:
|
|
| 98 |
# return tuple(summarizer.abstractive_summary(list(summary_sentence)))
|
| 99 |
|
| 100 |
def split_text(text: str) -> list:
|
| 101 |
-
sentences = sent_tokenize(text)
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
|
|
|
| 115 |
|
| 116 |
pipe = create_pipeline()
|
| 117 |
|
|
|
|
| 98 |
# return tuple(summarizer.abstractive_summary(list(summary_sentence)))
|
| 99 |
|
| 100 |
def split_text(text: str, max_tokens: int = 500) -> list:
    """Split *text* into blocks of whole sentences under a token budget.

    The text is split into sentences with ``sent_tokenize``; sentences are
    then packed greedily, in order, into blocks whose ``word_tokenize``
    token count stays strictly below ``max_tokens``.

    Args:
        text: The text to split.
        max_tokens: Exclusive upper bound on the number of word tokens per
            block. Defaults to 500, the limit the original code hard-coded.

    Returns:
        A list of strings, each made of one or more consecutive sentences
        joined by a single space. Empty input yields an empty list. A single
        sentence that alone reaches ``max_tokens`` is emitted as its own
        block, because sentences are never split.
    """
    sentences = sent_tokenize(text, language="english")

    result = []
    block_sentences = []  # sentences collected for the block being built
    token_count = 0  # word tokens already in block_sentences
    for sentence in sentences:
        tokens = word_tokenize(sentence, language="english", preserve_line=True)
        # Flush only a non-empty block, so an oversized first sentence does
        # not produce a spurious empty string in the result.
        if block_sentences and token_count + len(tokens) >= max_tokens:
            result.append(" ".join(block_sentences))
            block_sentences = []
            token_count = 0
        block_sentences.append(sentence)
        token_count += len(tokens)

    # The final block never hits the flush branch inside the loop; emit it
    # here so the tail of the text is not lost.
    if block_sentences:
        result.append(" ".join(block_sentences))
    return result
|
| 116 |
|
| 117 |
pipe = create_pipeline()
|
| 118 |
|