onkar-waghmode committed on
Commit
9336d99
·
1 Parent(s): 916d4c2
Files changed (1) hide show
  1. app.py +12 -5
app.py CHANGED
@@ -534,17 +534,24 @@ def vary_sentence_structure(
534
  return f"{connector.capitalize()}, {curr[0].lower() + curr[1:]}"
535
 
536
  doc = nlp(text)
537
- sents = [s.text.strip() for s in doc.sents]
538
  modified = []
539
 
540
- for sent in sents:
 
541
  words = sent.split()
542
-
543
  # SPLIT
544
  if len(words) > min_split_length and random.random() < split_prob:
545
- split_positions = [tok.i - doc[list(doc.sents).index(sent)].start
546
- for tok in nlp(sent) if tok.dep_ in ("cc", "mark")]
 
 
 
 
 
547
 
 
548
  if split_positions:
549
  sp = random.choice(split_positions)
550
  tokens = list(nlp(sent))
 
534
  return f"{connector.capitalize()}, {curr[0].lower() + curr[1:]}"
535
 
536
  doc = nlp(text)
537
+ doc_sents = list(doc.sents) # real spaCy sentence spans
538
  modified = []
539
 
540
+ for idx, sent_span in enumerate(doc_sents):
541
+ sent = sent_span.text.strip()
542
  words = sent.split()
543
+
544
  # SPLIT
545
  if len(words) > min_split_length and random.random() < split_prob:
546
+ tokens = list(sent_span) # tokens inside this sentence span
547
+
548
+ # find split points inside sentence (no sentence-start confusion)
549
+ split_positions = [
550
+ j for j, tok in enumerate(tokens)
551
+ if tok.dep_ in ("cc", "mark") # coordinating conj / subordinate clause marker
552
+ ]
553
 
554
+
555
  if split_positions:
556
  sp = random.choice(split_positions)
557
  tokens = list(nlp(sent))