humanizer

Running

onkar-waghmode committed on Nov 7

Commit

9336d99

1 Parent(s): 916d4c2

Updated

Files changed (1) hide show

app.py CHANGED Viewed

@@ -534,17 +534,24 @@ def vary_sentence_structure(
         return f"{connector.capitalize()}, {curr[0].lower() + curr[1:]}"
     doc = nlp(text)
-    sents = [s.text.strip() for s in doc.sents]
     modified = []
-    for sent in sents:
         words = sent.split()
         # SPLIT
         if len(words) > min_split_length and random.random() < split_prob:
-            split_positions = [tok.i - doc[list(doc.sents).index(sent)].start
-                               for tok in nlp(sent) if tok.dep_ in ("cc", "mark")]
             if split_positions:
                 sp = random.choice(split_positions)
                 tokens = list(nlp(sent))

         return f"{connector.capitalize()}, {curr[0].lower() + curr[1:]}"
     doc = nlp(text)
+    doc_sents = list(doc.sents)  # real spaCy sentence spans
     modified = []
+    for idx, sent_span in enumerate(doc_sents):
+        sent = sent_span.text.strip()
         words = sent.split()
         # SPLIT
         if len(words) > min_split_length and random.random() < split_prob:
+            tokens = list(sent_span)  # tokens inside this sentence span
+            # find split points inside sentence (no sentence-start confusion)
+            split_positions = [
+                j for j, tok in enumerate(tokens)
+                if tok.dep_ in ("cc", "mark")     # coordinating conj / subordinate clause marker
+            ]
             if split_positions:
                 sp = random.choice(split_positions)
                 tokens = list(nlp(sent))