Commit
·
9336d99
1
Parent(s):
916d4c2
Updated
Browse files
app.py
CHANGED
|
@@ -534,17 +534,24 @@ def vary_sentence_structure(
|
|
| 534 |
return f"{connector.capitalize()}, {curr[0].lower() + curr[1:]}"
|
| 535 |
|
| 536 |
doc = nlp(text)
|
| 537 |
-
|
| 538 |
modified = []
|
| 539 |
|
| 540 |
-
for
|
|
|
|
| 541 |
words = sent.split()
|
| 542 |
-
|
| 543 |
# SPLIT
|
| 544 |
if len(words) > min_split_length and random.random() < split_prob:
|
| 545 |
-
|
| 546 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 547 |
|
|
|
|
| 548 |
if split_positions:
|
| 549 |
sp = random.choice(split_positions)
|
| 550 |
tokens = list(nlp(sent))
|
|
|
|
| 534 |
return f"{connector.capitalize()}, {curr[0].lower() + curr[1:]}"
|
| 535 |
|
| 536 |
doc = nlp(text)
|
| 537 |
+
doc_sents = list(doc.sents) # real spaCy sentence spans
|
| 538 |
modified = []
|
| 539 |
|
| 540 |
+
for idx, sent_span in enumerate(doc_sents):
|
| 541 |
+
sent = sent_span.text.strip()
|
| 542 |
words = sent.split()
|
| 543 |
+
|
| 544 |
# SPLIT
|
| 545 |
if len(words) > min_split_length and random.random() < split_prob:
|
| 546 |
+
tokens = list(sent_span) # tokens inside this sentence span
|
| 547 |
+
|
| 548 |
+
# find split points inside sentence (no sentence-start confusion)
|
| 549 |
+
split_positions = [
|
| 550 |
+
j for j, tok in enumerate(tokens)
|
| 551 |
+
if tok.dep_ in ("cc", "mark") # coordinating conj / subordinate clause marker
|
| 552 |
+
]
|
| 553 |
|
| 554 |
+
|
| 555 |
if split_positions:
|
| 556 |
sp = random.choice(split_positions)
|
| 557 |
tokens = list(nlp(sent))
|