Spaces:
Running
Running
Upload 3 files
Browse files
transformers_rec/transformers_recognizer.py
CHANGED
|
@@ -224,14 +224,18 @@ class TransformersRecognizer(EntityRecognizer):
|
|
| 224 |
model_max_length = self.pipeline.tokenizer.model_max_length
|
| 225 |
# calculate inputs based on the text
|
| 226 |
text_length = len(text)
|
| 227 |
-
# split text into chunks
|
| 228 |
-
logger.info(
|
| 229 |
-
f"splitting the text into chunks, length {text_length} > {model_max_length*2}"
|
| 230 |
-
)
|
| 231 |
predictions = list()
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
# iterate over text chunks and run inference
|
| 237 |
for chunk_start, chunk_end in chunk_indexes:
|
|
|
|
| 224 |
model_max_length = self.pipeline.tokenizer.model_max_length
|
| 225 |
# calculate inputs based on the text
|
| 226 |
text_length = len(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
predictions = list()
|
| 228 |
+
if text_length > model_max_length*2:
|
| 229 |
+
# split text into chunks
|
| 230 |
+
logger.info(
|
| 231 |
+
f"splitting the text into chunks, length {text_length} > {model_max_length*2}"
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
+
chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
|
| 235 |
+
text_length, self.chunk_length, self.text_overlap_length
|
| 236 |
+
)
|
| 237 |
+
else:
|
| 238 |
+
chunk_indexes = [[0, text_length]]
|
| 239 |
|
| 240 |
# iterate over text chunks and run inference
|
| 241 |
for chunk_start, chunk_end in chunk_indexes:
|