Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -23,22 +23,31 @@ def clean_text(text):
|
|
| 23 |
return ' '.join(cleaned_sentences)
|
| 24 |
|
| 25 |
def process_text(text):
|
| 26 |
-
"""Insert a newline after periods, except for titles and ."
|
|
|
|
| 27 |
# Split text into words
|
| 28 |
words = text.split()
|
| 29 |
-
|
|
|
|
| 30 |
|
| 31 |
for i, word in enumerate(words):
|
| 32 |
# Check if the word is a title (e.g., Mr., Mrs.)
|
| 33 |
if word in TITLES:
|
| 34 |
-
|
| 35 |
# Check if the word ends with a period and is not followed by a quote
|
| 36 |
elif word.endswith('.') and not word.endswith('."'):
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
else:
|
| 39 |
-
|
| 40 |
|
| 41 |
-
|
|
|
|
| 42 |
|
| 43 |
def combine_dataset_texts(dataset_name, split, text_column):
|
| 44 |
try:
|
|
@@ -56,6 +65,7 @@ def combine_dataset_texts(dataset_name, split, text_column):
|
|
| 56 |
cleaned_text = clean_text(combined_text)
|
| 57 |
|
| 58 |
# Process the text: insert newlines after periods, except for titles and ."
|
|
|
|
| 59 |
processed_text = process_text(cleaned_text)
|
| 60 |
|
| 61 |
# Create a temporary file
|
|
|
|
| 23 |
return ' '.join(cleaned_sentences)
|
| 24 |
|
| 25 |
def process_text(text):
    """Reflow text to one sentence per line and number chapter headings.

    The text is split on whitespace and rebuilt word by word:

    * a word found in ``TITLES`` (e.g. "Mr.", "Mrs.") is followed by a space,
      so an abbreviation never ends a line;
    * any other word ending in a period is followed by a newline;
    * the three-word sequence ``### Simplified Version`` is replaced by
      ``Chapter N``, where N starts at 1 and increments per occurrence;
    * every other word (including one ending in a period plus a closing
      double quote) is followed by a space.

    Args:
        text: The input string to process.

    Returns:
        The rebuilt string with leading/trailing whitespace stripped.
    """
    words = text.split()
    pieces = []  # collected fragments; joined once at the end
    chapter_counter = 1
    skip = 0  # number of upcoming words already consumed by a heading

    for i, word in enumerate(words):
        if skip:
            # These words belonged to a '### Simplified Version' heading.
            # Skipping them (rather than emitting empty strings) avoids
            # stray extra spaces after 'Chapter N'.
            skip -= 1
            continue

        if word in TITLES:
            # Titles keep the sentence going on the same line.
            pieces.append(word + " ")
        elif word.endswith("."):
            # A word ending in '."' ends with the quote character, not the
            # period, so it is not matched here and falls through to the
            # default branch below.
            pieces.append(word + "\n")
        elif word == "###" and words[i + 1:i + 3] == ["Simplified", "Version"]:
            pieces.append(f"Chapter {chapter_counter} ")
            chapter_counter += 1
            skip = 2
        else:
            pieces.append(word + " ")

    # Remove trailing spaces and newlines
    return "".join(pieces).strip()
|
| 51 |
|
| 52 |
def combine_dataset_texts(dataset_name, split, text_column):
|
| 53 |
try:
|
|
|
|
| 65 |
cleaned_text = clean_text(combined_text)
|
| 66 |
|
| 67 |
# Process the text: insert newlines after periods, except for titles and ."
|
| 68 |
+
# Also replace '### Simplified Version' with 'Chapter N'
|
| 69 |
processed_text = process_text(cleaned_text)
|
| 70 |
|
| 71 |
# Create a temporary file
|