Update app.py

app.py
CHANGED

@@ -352,6 +352,26 @@ class HumanLikeVariations:
             return variation(sentence)
         except:
             return sentence
+
+    def split_into_sentences_advanced(self, text):
+        """Advanced sentence splitting using spaCy or NLTK"""
+        if SPACY_AVAILABLE:
+            try:
+                nlp = spacy.load("en_core_web_sm")
+                doc = nlp(text)
+                sentences = [sent.text.strip() for sent in doc.sents]
+            except:
+                sentences = sent_tokenize(text)
+        else:
+            # Fallback to NLTK
+            try:
+                sentences = sent_tokenize(text)
+            except:
+                # Final fallback to regex
+                sentences = re.split(r'(?<=[.!?])\s+', text)
+
+        # Clean up sentences
+        return [s for s in sentences if s and len(s.strip()) > 0]
 
 class SelectiveGrammarFixer:
     """Minimal grammar fixes to maintain human-like quality while fixing critical errors"""
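The new splitter prefers spaCy, falls back to NLTK's sent_tokenize, and only drops to a bare regex if neither works. A quick sketch of the practical difference, assuming a humanizer-style instance h that exposes the method (the instance name is illustrative):

    text = "Dr. Smith arrived. He was late! Was the meeting over?"
    print(h.split_into_sentences_advanced(text))
    # With spaCy or NLTK available, abbreviation-aware splitting gives:
    #   ['Dr. Smith arrived.', 'He was late!', 'Was the meeting over?']
    # The regex fallback r'(?<=[.!?])\s+' is cruder: it also splits after the
    # period in "Dr.", yielding ['Dr.', 'Smith arrived.', ...]

Note that the method reloads en_core_web_sm on every call; caching the spaCy pipeline once in __init__ would avoid that repeated cost.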

@@ -642,69 +662,6 @@ class EnhancedDipperHumanizer:
 
         return text
 
-    def preserve_keywords(self, text, keywords):
-        """Mark keywords to preserve them during paraphrasing - SIMPLIFIED"""
-        if not keywords:
-            return text, {}
-
-        # Create a mapping of placeholders to keywords
-        keyword_map = {}
-        modified_text = text
-
-        # Sort keywords by length (longest first) to avoid partial replacements
-        sorted_keywords = sorted(keywords, key=len, reverse=True)
-
-        for i, keyword in enumerate(sorted_keywords):
-            # Use unique markers that won't be confused
-            placeholder = f"KWPH{i:04d}"  # e.g., KWPH0001
-
-            # Find all occurrences of the keyword (case-insensitive)
-            pattern = r'\b' + re.escape(keyword) + r'\b'
-            matches = list(re.finditer(pattern, modified_text, flags=re.IGNORECASE))
-
-            if matches:
-                # Replace all occurrences with the placeholder
-                for match in reversed(matches):  # Reverse to maintain positions
-                    original_keyword = match.group(0)
-                    start, end = match.span()
-                    modified_text = modified_text[:start] + placeholder + modified_text[end:]
-                    # Store the original case version
-                    if placeholder not in keyword_map:
-                        keyword_map[placeholder] = original_keyword
-
-        return modified_text, keyword_map
-
-    def restore_keywords_robust(self, text, keyword_map):
-        """Restore keywords with simple direct replacement"""
-        if not keyword_map:
-            return text
-
-        restored_text = text
-
-        # Simple direct replacement
-        for placeholder, keyword in keyword_map.items():
-            # Direct replacement
-            restored_text = restored_text.replace(placeholder, keyword)
-
-            # Also try with potential variations that might occur
-            restored_text = restored_text.replace(f" {placeholder} ", f" {keyword} ")
-            restored_text = restored_text.replace(f"{placeholder}.", f"{keyword}.")
-            restored_text = restored_text.replace(f"{placeholder},", f"{keyword},")
-            restored_text = restored_text.replace(f"{placeholder}!", f"{keyword}!")
-            restored_text = restored_text.replace(f"{placeholder}?", f"{keyword}?")
-            restored_text = restored_text.replace(f"{placeholder}:", f"{keyword}:")
-            restored_text = restored_text.replace(f"{placeholder};", f"{keyword};")
-            restored_text = restored_text.replace(f"({placeholder})", f"({keyword})")
-            restored_text = restored_text.replace(f'"{placeholder}"', f'"{keyword}"')
-            restored_text = restored_text.replace(f"'{placeholder}'", f"'{keyword}'")
-
-            # Handle case variations
-            restored_text = restored_text.replace(placeholder.lower(), keyword)
-            restored_text = restored_text.replace(placeholder.upper(), keyword)
-            restored_text = restored_text.replace(placeholder.capitalize(), keyword)
-
-        return restored_text.strip()
-
     def should_skip_element(self, element, text):
         """Determine if an element should be skipped from paraphrasing"""
         if not text or len(text.strip()) < 3:
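For context on what is being dropped here: the pair above round-tripped keywords through opaque placeholders so the paraphrasers would leave them untouched. A minimal sketch of that flow (the instance name h is illustrative):

    masked, kmap = h.preserve_keywords(
        "Prep for the GMAT Focus Edition early.", ["GMAT Focus Edition"])
    # masked -> "Prep for the KWPH0000 early."
    # kmap   -> {"KWPH0000": "GMAT Focus Edition"}
    paraphrased = "Get a KWPH0000 prep plan going early."  # ideal model output
    print(h.restore_keywords_robust(paraphrased, kmap))
    # -> "Get a GMAT Focus Edition prep plan going early."

The long tail of .replace() calls in restore_keywords_robust existed because the models sometimes re-cased or re-punctuated the placeholder token itself, which is exactly the fragility this commit removes.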

@@ -895,10 +852,7 @@ class EnhancedDipperHumanizer:
         text = re.sub(r'\s+', ' ', text)
 
         # Remove leading non-letter characters carefully
-
-        if not re.match(r'^(KWPH\d+)', text):
-            # Only remove if it doesn't start with a placeholder
-            text = re.sub(r'^[^a-zA-Z_]+', '', text)
+        text = re.sub(r'^[^a-zA-Z_]+', '', text)
 
         # If we accidentally removed too much, use original
         if len(text) < len(original) * 0.5:
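With the placeholder guard gone, the leading-character cleanup now runs unconditionally. Its effect, for reference:

    import re
    print(re.sub(r'^[^a-zA-Z_]+', '', '## 3. Benefits of exercise'))
    # -> 'Benefits of exercise'  (leading markup, digits, dots, spaces dropped)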

@@ -906,16 +860,13 @@ class EnhancedDipperHumanizer:
 
         return text.strip()
 
-    def paraphrase_with_dipper(self, text, lex_diversity=60, order_diversity=20, keywords=None):
+    def paraphrase_with_dipper(self, text, lex_diversity=60, order_diversity=20):
         """Paraphrase text using Dipper model with sentence-level processing"""
         if not text or len(text.strip()) < 3:
             return text
 
-        # Preserve keywords
-        text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)
-
         # Split into sentences for better control
-        sentences = self.split_into_sentences_advanced(text_with_placeholders)
+        sentences = self.split_into_sentences_advanced(text)
         paraphrased_sentences = []
 
         # Track sentence patterns to avoid repetition

@@ -928,11 +879,7 @@ class EnhancedDipperHumanizer:
 
             try:
                 # ULTRA-HIGH diversity for Originality AI
-
-                if has_keywords:
-                    lex_diversity = 30  # Moderate for keywords
-                    order_diversity = 10
-                elif len(sentence.split()) < 10:
+                if len(sentence.split()) < 10:
                     lex_diversity = 40  # Very high for short
                     order_diversity = 15
                 else:
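The lex_diversity/order_diversity knobs follow the upstream DIPPER convention, where diversity is steered by control codes prepended to the encoder input. A sketch of that encoding, on the assumption that app.py mirrors the reference implementation (kalpeshk2011/dipper-paraphraser); the exact prompt handling in the elided code may differ:

    # Higher requested diversity maps to a lower control code in the prompt.
    lex_code = 100 - lex_diversity        # lex_diversity=40  -> "lexical = 60"
    order_code = 100 - order_diversity    # order_diversity=15 -> "order = 85"
    prompt = f"lexical = {lex_code}, order = {order_code} <sent> {sentence} </sent>"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids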

@@ -969,7 +916,7 @@ class EnhancedDipperHumanizer:
                 max_new_length = int(original_length * 1.4)
 
                 # High variation parameters
-                temp = 0.
+                temp = 0.8
                 top_p_val = 0.9
 
                 with torch.no_grad():
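temp and top_p_val are standard sampling knobs; the generate call inside the torch.no_grad() block presumably wires them through along these lines (a sketch of the usual Hugging Face API, not the literal call):

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            do_sample=True,                 # sample instead of greedy decoding
            temperature=temp,               # 0.8: flatter distribution, more variety
            top_p=top_p_val,                # 0.9: nucleus sampling cutoff
            max_new_tokens=max_new_length,
        )
    paraphrase = tokenizer.decode(outputs[0], skip_special_tokens=True)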

@@ -1017,9 +964,6 @@ class EnhancedDipperHumanizer:
             # Join sentences back
             result = ' '.join(paraphrased_sentences)
 
-            # Restore keywords AFTER joining all sentences
-            result = self.restore_keywords_robust(result, keyword_map)
-
             # Apply natural human patterns
             result = self.add_natural_human_patterns(result)
 

@@ -1089,8 +1033,8 @@ class EnhancedDipperHumanizer:
                 generated += '.'
 
                 # Ensure first letter is capitalized ONLY if it's sentence start
                 # Don't capitalize words like "iPhone" or "eBay"
-                if generated and generated[0].islower() and not self.is_likely_acronym_or_proper_noun(generated.split()[0])
+                if generated and generated[0].islower() and not self.is_likely_acronym_or_proper_noun(generated.split()[0]):
                     generated = generated[0].upper() + generated[1:]
 
                 return generated

@@ -1111,17 +1055,14 @@ class EnhancedDipperHumanizer:
         # Clean up sentences
         return [s for s in sentences if s and len(s.strip()) > 0]
 
-    def paraphrase_with_bart(self, text, keywords=None):
+    def paraphrase_with_bart(self, text):
         """Additional paraphrasing with BART for more variation"""
         if not self.use_bart or not text or len(text.strip()) < 3:
             return text
 
         try:
-            # Preserve keywords
-            text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)
-
             # Process in smaller chunks for BART
-            sentences = self.split_into_sentences_advanced(text_with_placeholders)
+            sentences = self.split_into_sentences_advanced(text)
             paraphrased_sentences = []
 
             for sentence in sentences:

@@ -1166,9 +1107,6 @@ class EnhancedDipperHumanizer:
 
             result = ' '.join(paraphrased_sentences)
 
-            # Restore keywords AFTER joining all sentences
-            result = self.restore_keywords_robust(result, keyword_map)
-
             # Apply minimal grammar fixes
             result = self.grammar_fixer.smart_fix(result)
 

@@ -1279,8 +1217,7 @@ class EnhancedDipperHumanizer:
             first_word = words[0]
             # Check if it's not an acronym or proper noun that should stay lowercase
             if (first_word[0].islower() and
-                not self.is_likely_acronym_or_proper_noun(first_word) and
-                not first_word.startswith('KWPH')):
+                not self.is_likely_acronym_or_proper_noun(first_word)):
                 # Only capitalize if it's a regular word
                 sentence = first_word[0].upper() + first_word[1:] + ' ' + ' '.join(words[1:])
 

@@ -1427,57 +1364,6 @@ class EnhancedDipperHumanizer:
 
         return html_text
 
-    def wrap_keywords_in_bold(self, html_content, keywords):
-        """Wrap all keyword occurrences with <strong> tags - FIXED VERSION"""
-        if not keywords:
-            return html_content
-
-        # Parse the HTML
-        soup = BeautifulSoup(html_content, 'html.parser')
-
-        # Process each keyword
-        for keyword in keywords:
-            # Find all text nodes that contain this keyword
-            for element in soup.find_all(string=re.compile(re.escape(keyword), re.IGNORECASE)):
-                # Skip if already inside certain tags
-                parent = element.parent
-                if parent and parent.name in ['script', 'style', 'strong', 'b', 'a', 'button',
-                                              'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
-                    continue
-
-                # Get the text content
-                text = str(element)
-
-                # Replace all occurrences of the keyword with <strong> wrapped version
-                # Use a regex to preserve the original case
-                pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
-
-                # Find all matches
-                matches = list(pattern.finditer(text))
-
-                if matches:
-                    # Build the new text with <strong> tags
-                    new_text = ""
-                    last_end = 0
-
-                    for match in matches:
-                        # Add text before the match
-                        new_text += text[last_end:match.start()]
-                        # Add the wrapped keyword (preserving original case)
-                        new_text += f"<strong>{match.group(0)}</strong>"
-                        last_end = match.end()
-
-                    # Add remaining text
-                    new_text += text[last_end:]
-
-                    # Replace the text node with new HTML
-                    new_soup = BeautifulSoup(new_text, 'html.parser')
-                    for new_element in reversed(list(new_soup.contents)):
-                        element.insert_after(new_element)
-                    element.extract()
-
-        return str(soup)
-
     def add_natural_flow_variations(self, text):
         """Add more natural flow and rhythm variations for Originality AI"""
         sentences = self.split_into_sentences_advanced(text)

@@ -1540,7 +1426,7 @@ class EnhancedDipperHumanizer:
 
         return ' '.join(enhanced_sentences)
 
-    def process_html(self, html_content, primary_keywords="", secondary_keywords="", progress_callback=None):
+    def process_html(self, html_content, progress_callback=None):
         """Main processing function with progress callback"""
         if not html_content.strip():
             return "Please provide HTML content."
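After this change the public entry point takes only the HTML plus an optional progress hook. A hypothetical call (the (done, total) callback signature is an assumption; the actual contract is defined in the elided body):

    html_out = humanizer.process_html(
        "<p>Regular exercise is essential for maintaining good health.</p>",
        progress_callback=lambda done, total: print(f"{done}/{total} elements"),
    )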

@@ -1569,37 +1455,12 @@ class EnhancedDipperHumanizer:
         # Get the modified HTML
         html_content = str(soup_temp)
 
-        # Combine keywords and clean them
-        all_keywords = []
-        if primary_keywords:
-            # Clean and validate each keyword
-            for k in primary_keywords.split(','):
-                cleaned = k.strip()
-                if cleaned and len(cleaned) > 1:  # Skip empty or single-char keywords
-                    all_keywords.append(cleaned)
-        if secondary_keywords:
-            for k in secondary_keywords.split(','):
-                cleaned = k.strip()
-                if cleaned and len(cleaned) > 1:
-                    all_keywords.append(cleaned)
-
-        # Remove duplicates while preserving order
-        seen = set()
-        unique_keywords = []
-        for k in all_keywords:
-            if k.lower() not in seen:
-                seen.add(k.lower())
-                unique_keywords.append(k)
-        all_keywords = unique_keywords
-
         try:
             # Extract text elements
             soup, text_elements = self.extract_text_from_html(html_content)
 
             total_elements = len(text_elements)
             print(f"Found {total_elements} text elements to process (after filtering)")
-            if all_keywords:
-                print(f"Preserving keywords: {all_keywords}")
 
             # Process each text element
             processed_count = 0

@@ -1615,20 +1476,18 @@ class EnhancedDipperHumanizer:
                 if len(original_text.split()) < 3:
                     continue
 
                 # First pass with Dipper
                 paraphrased_text = self.paraphrase_with_dipper(
                     original_text,
-                    keywords=all_keywords
+                    lex_diversity=60,
+                    order_diversity=20
                 )
 
                 # Second pass with BART for longer texts (increased probability)
                 if self.use_bart and len(paraphrased_text.split()) > 8:
-                    # 50% chance to use BART for more variation
+                    # 20% chance to use BART for more variation
                     if random.random() < 0.2:
-                        paraphrased_text = self.paraphrase_with_bart(
-                            paraphrased_text,
-                            keywords=all_keywords
-                        )
+                        paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
 
                 # Apply sentence variation
                 paraphrased_text = self.apply_sentence_variation(paraphrased_text)

@@ -1665,9 +1524,6 @@ class EnhancedDipperHumanizer:
                 placeholder = style_placeholder.format(idx)
                 result = result.replace(placeholder, style_content)
 
-        # NOW wrap keywords in bold tags after all processing is complete
-        result = self.wrap_keywords_in_bold(result, all_keywords)
-
         # Post-process the entire HTML to fix bold/strong formatting
         result = self.post_process_html(result)
 

@@ -1802,7 +1658,7 @@ class EnhancedDipperHumanizer:
 # Initialize the humanizer
 humanizer = EnhancedDipperHumanizer()
 
-def humanize_html(html_input, primary_keywords="", secondary_keywords="", progress=gr.Progress()):
+def humanize_html(html_input, progress=gr.Progress()):
     """Gradio interface function with progress updates"""
     if not html_input:
         return "Please provide HTML content to humanize."

@@ -1818,8 +1674,6 @@ def humanize_html(html_input, primary_keywords="", secondary_keywords="", progress=gr.Progress()):
         # Pass progress callback to process_html
         result = humanizer.process_html(
             html_input,
-            primary_keywords,
-            secondary_keywords,
             progress_callback=progress_callback
         )
 
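The gr.Progress object Gradio injects is adapted to the internal progress_callback somewhere in the elided lines above, plausibly along these lines (an assumed bridge, not the actual body):

    def progress_callback(done, total):
        # gr.Progress instances are callable with a 0..1 fraction plus a label
        progress(done / max(total, 1), desc=f"Humanizing element {done}/{total}")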

@@ -1837,14 +1691,6 @@ iface = gr.Interface(
             lines=10,
             placeholder="Paste your HTML content here...",
             label="HTML Input"
-        ),
-        gr.Textbox(
-            placeholder="Enter primary keywords separated by commas (e.g., GMAT Focus Edition, MBA, Data Insights)",
-            label="Primary Keywords (preserved exactly)"
-        ),
-        gr.Textbox(
-            placeholder="Enter secondary keywords separated by commas (e.g., test preparation, business school)",
-            label="Secondary Keywords (preserved exactly)"
         )
     ],
     outputs=gr.Textbox(

@@ -1861,8 +1707,6 @@ iface = gr.Interface(
     - Natural typos, contractions, and conversational flow
     - Stream-of-consciousness elements and rhetorical questions
     - Originality AI-specific optimizations: varied sentence starters, emphatic repetitions
-    - Fixed placeholder system that preserves keywords
-    - Keywords are automatically wrapped with <strong> tags
     - Skips content in <strong>, <b>, and heading tags (including inside tables)
     - Designed to pass the strictest AI detection systems
 

@@ -1876,7 +1720,7 @@ iface = gr.Interface(
     <div class="author-intro">By John Doe, Fitness Expert | 10 years experience</div>
     <p>Regular exercise is essential for maintaining good health. It helps improve cardiovascular fitness, strengthens muscles, and enhances mental well-being. Studies have shown that people who exercise regularly have lower risks of chronic diseases.</p>
     <p>Additionally, exercise can boost mood and energy levels. It releases endorphins, which are natural mood elevators. Even moderate activities like walking can make a significant difference in overall health.</p>
-    </article>"""
+    </article>"""]
     ],
     theme="default"
 )