Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 2 |
from sklearn.decomposition import LatentDirichletAllocation
|
|
|
|
| 3 |
from minivectordb.embedding_model import EmbeddingModel
|
| 4 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
-
import tiktoken, nltk, numpy as np, fasttext, pickle
|
| 6 |
from nltk.tokenize import sent_tokenize
|
| 7 |
import gradio as gr
|
| 8 |
|
|
@@ -22,6 +22,62 @@ def detect_language(text):
|
|
| 22 |
detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
|
| 23 |
return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
| 26 |
def calculate_similarity(embed1, embed2):
|
| 27 |
return cosine_similarity([embed1], [embed2])[0][0]
|
|
@@ -94,7 +150,9 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
|
| 94 |
# Reorder sentences to maintain original flow
|
| 95 |
compressed_text.sort(key=lambda x: sentences.index(x))
|
| 96 |
|
| 97 |
-
|
|
|
|
|
|
|
| 98 |
|
| 99 |
async def predict(text, word_reduction_factor):
|
| 100 |
if len(text.split()) > 5000:
|
|
|
|
| 1 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 2 |
from sklearn.decomposition import LatentDirichletAllocation
|
| 3 |
+
import tiktoken, nltk, numpy as np, fasttext, pickle, re
|
| 4 |
from minivectordb.embedding_model import EmbeddingModel
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
| 6 |
from nltk.tokenize import sent_tokenize
|
| 7 |
import gradio as gr
|
| 8 |
|
|
|
|
| 22 |
detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
|
| 23 |
return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
|
| 24 |
|
| 25 |
+
def clean_and_standardize_text(text):
    """Normalize whitespace, punctuation, and numeric artifacts in extracted text.

    Runs a fixed pipeline of regex/string passes: punctuation spacing,
    whitespace collapsing, sentence re-capitalization, number/date
    re-formatting, removal of stray isolated numbers, and final cleanup of
    doubled periods/spaces left behind by sentence extraction.

    Args:
        text: Raw (possibly badly spaced) text to clean.

    Returns:
        The cleaned, single-spaced text.
    """
    # 1. Standardize spacing around punctuation (" , " -> ", ")
    text = re.sub(r'\s([.,;:!?])\s', r'\1 ', text)

    # 2. Collapse all whitespace runs to single spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # 3. Capitalize sentences.
    # NOTE(review): str.capitalize() also lowercases the rest of each
    # sentence, which mangles proper nouns and acronyms — left as-is to
    # preserve existing output; confirm whether that is intended.
    sentences = sent_tokenize(text)
    # FIX: join with a plain space. sent_tokenize keeps each sentence's
    # terminal punctuation, so the previous '. ' separator produced
    # artifacts like "Hello?. How" that no later pass removed ('?.' is not
    # matched by the '..' loop or the step-13 lookbehind).
    text = ' '.join(sentence.capitalize() for sentence in sentences)

    # 4. Standardize number formatting ("1 234" -> "1.234")
    text = re.sub(r'(\d+)\s+(\d+)', r'\1.\2', text)

    # 5. Ensure proper spacing after closing parentheses
    text = re.sub(r'\)\s*([a-zA-Z])', r') \1', text)

    # 6. Preserve bullet points (exactly one space after each bullet)
    text = re.sub(r'•\s*', '• ', text)

    # 7. Preserve numbered lists (exactly one space after "N.")
    text = re.sub(r'(\d+)\.\s*', r'\1. ', text)

    # 8. Standardize date formatting ("01 02 2020" -> "01/02/2020")
    text = re.sub(r'(\d{2})\s+(\d{2})\s+(\d{4})', r'\1/\2/\3', text)

    # 9. Remove extra periods (". ." -> ". ")
    text = re.sub(r'\.\s+\.', '. ', text)

    # 10. Remove spacing just inside parentheses
    text = re.sub(r'\(\s*', '(', text)
    text = re.sub(r'\s*\)', ')', text)

    # 11. Remove isolated numbers without meaning.
    # FIX: the old r'\b(\d+)\b' stripped *every* number, destroying the
    # dates built in step 8 ("01/02/2020" -> "//") and the list markers
    # preserved in step 7 ("1. item" -> ". item"). Skip digits that touch
    # '/' or '.' so only genuinely free-standing numbers are dropped; the
    # leftover double space is collapsed in step 12.
    text = re.sub(r'(?<![\d/.])\b\d+\b(?![\d/.])', '', text)

    # 12. Tighten spacing/punctuation left over by the removals above.
    # (The collapse loops terminate because each replacement strictly
    # shortens the text while its pattern is present.)
    while ' .' in text:
        text = text.replace(' .', '.')

    while '..' in text:
        text = text.replace('..', '.')

    while '  ' in text:
        text = text.replace('  ', ' ')

    text = text.replace(' :', ':')
    text = text.replace('- -', '-')
    text = text.replace('. -', '.')

    # 13. Collapse runs of duplicate punctuation, keeping the last mark
    # (the backreference resolves to the final repetition captured)
    text = re.sub(r'([.,]){2,}', r'\1', text)
    text = re.sub(r'(?<=[:.])[:.]+', '', text)

    return text
|
| 80 |
+
|
| 81 |
def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
| 82 |
def calculate_similarity(embed1, embed2):
|
| 83 |
return cosine_similarity([embed1], [embed2])[0][0]
|
|
|
|
| 150 |
# Reorder sentences to maintain original flow
|
| 151 |
compressed_text.sort(key=lambda x: sentences.index(x))
|
| 152 |
|
| 153 |
+
joined_compressed_text = ' '.join(compressed_text)
|
| 154 |
+
joined_compressed_text_cleaned = clean_and_standardize_text(joined_compressed_text)
|
| 155 |
+
return joined_compressed_text_cleaned
|
| 156 |
|
| 157 |
async def predict(text, word_reduction_factor):
|
| 158 |
if len(text.split()) > 5000:
|