Spaces:

TeacherPuffy
/

CreateBookPackage

Sleeping

TeacherPuffy committed on Jan 21, 2025

Commit

4e3915c

verified ·

1 Parent(s): 1153ecb

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,6 +1,26 @@
 import gradio as gr
 from datasets import load_dataset
 import tempfile
 def combine_dataset_texts(dataset_name, split, text_column):
     try:
@@ -11,12 +31,18 @@ def combine_dataset_texts(dataset_name, split, text_column):
         if text_column not in dataset.column_names:
             raise gr.Error(f"Column '{text_column}' not found in dataset")
-        # Combine all texts
-        combined_text = "\n\n".join([example[text_column] for example in dataset])
         # Create a temporary file
         with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
-            f.write(combined_text)
             return f.name
     except Exception as e:

 import gradio as gr
 from datasets import load_dataset
 import tempfile
+import re
+from langdetect import detect
+def is_english(text):
+    """Check if the text is in English."""
+    try:
+        return detect(text) == 'en'
+    except:
+        return False
+def clean_text(text):
+    """Remove non-English text and ** from the text."""
+    # Remove **
+    text = re.sub(r'\*\*', '', text)
+    # Split text into sentences and filter out non-English sentences
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    cleaned_sentences = [s for s in sentences if is_english(s)]
+    return ' '.join(cleaned_sentences)
 def combine_dataset_texts(dataset_name, split, text_column):
     try:
         if text_column not in dataset.column_names:
             raise gr.Error(f"Column '{text_column}' not found in dataset")
+        # Combine all texts into a single string without separating datapoints
+        combined_text = " ".join([example[text_column] for example in dataset])
+        # Clean the text: remove non-English and **
+        cleaned_text = clean_text(combined_text)
+        # Insert a newline after each period (.) except for ."
+        processed_text = re.sub(r'\.(?!")', '.\n', cleaned_text)
         # Create a temporary file
         with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
+            f.write(processed_text)
             return f.name
     except Exception as e: