Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 2 |
from sklearn.decomposition import LatentDirichletAllocation
|
|
|
|
| 3 |
from minivectordb.embedding_model import EmbeddingModel
|
| 4 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
-
import tiktoken, nltk, numpy as np, fasttext, pickle
|
| 6 |
from nltk.tokenize import sent_tokenize
|
| 7 |
import gradio as gr
|
| 8 |
|
|
@@ -22,6 +22,62 @@ def detect_language(text):
|
|
| 22 |
detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
|
| 23 |
return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
| 26 |
def calculate_similarity(embed1, embed2):
|
| 27 |
return cosine_similarity([embed1], [embed2])[0][0]
|
|
@@ -94,7 +150,9 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
|
| 94 |
# Reorder sentences to maintain original flow
|
| 95 |
compressed_text.sort(key=lambda x: sentences.index(x))
|
| 96 |
|
| 97 |
-
|
|
|
|
|
|
|
| 98 |
|
| 99 |
async def predict(text, word_reduction_factor):
|
| 100 |
if len(text.split()) > 5000:
|
|
|
|
| 1 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 2 |
from sklearn.decomposition import LatentDirichletAllocation
|
| 3 |
+
import tiktoken, nltk, numpy as np, fasttext, pickle, re
|
| 4 |
from minivectordb.embedding_model import EmbeddingModel
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
| 6 |
from nltk.tokenize import sent_tokenize
|
| 7 |
import gradio as gr
|
| 8 |
|
|
|
|
| 22 |
detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
|
| 23 |
return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
|
| 24 |
|
| 25 |
+
def clean_and_standardize_text(text):
    """Normalize whitespace, punctuation, and numeric artifacts in extracted text.

    Runs a fixed pipeline of regex/string passes: punctuation spacing,
    whitespace collapsing, sentence re-capitalization, number/date
    re-formatting, removal of stray isolated numbers, and final cleanup of
    doubled periods/spaces left behind by sentence extraction.

    Args:
        text: Raw (possibly badly spaced) text to clean.

    Returns:
        The cleaned, single-spaced text.
    """
    # 1. Standardize spacing around punctuation (" , " -> ", ")
    text = re.sub(r'\s([.,;:!?])\s', r'\1 ', text)

    # 2. Collapse all whitespace runs to single spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # 3. Capitalize sentences.
    # NOTE(review): str.capitalize() also lowercases the rest of each
    # sentence, which mangles proper nouns and acronyms — left as-is to
    # preserve existing output; confirm whether that is intended.
    sentences = sent_tokenize(text)
    # FIX: join with a plain space. sent_tokenize keeps each sentence's
    # terminal punctuation, so the previous '. ' separator produced
    # artifacts like "Hello?. How" that no later pass removed ('?.' is not
    # matched by the '..' loop or the step-13 lookbehind).
    text = ' '.join(sentence.capitalize() for sentence in sentences)

    # 4. Standardize number formatting ("1 234" -> "1.234")
    text = re.sub(r'(\d+)\s+(\d+)', r'\1.\2', text)

    # 5. Ensure proper spacing after closing parentheses
    text = re.sub(r'\)\s*([a-zA-Z])', r') \1', text)

    # 6. Preserve bullet points (exactly one space after each bullet)
    text = re.sub(r'•\s*', '• ', text)

    # 7. Preserve numbered lists (exactly one space after "N.")
    text = re.sub(r'(\d+)\.\s*', r'\1. ', text)

    # 8. Standardize date formatting ("01 02 2020" -> "01/02/2020")
    text = re.sub(r'(\d{2})\s+(\d{2})\s+(\d{4})', r'\1/\2/\3', text)

    # 9. Remove extra periods (". ." -> ". ")
    text = re.sub(r'\.\s+\.', '. ', text)

    # 10. Remove spacing just inside parentheses
    text = re.sub(r'\(\s*', '(', text)
    text = re.sub(r'\s*\)', ')', text)

    # 11. Remove isolated numbers without meaning.
    # FIX: the old r'\b(\d+)\b' stripped *every* number, destroying the
    # dates built in step 8 ("01/02/2020" -> "//") and the list markers
    # preserved in step 7 ("1. item" -> ". item"). Skip digits that touch
    # '/' or '.' so only genuinely free-standing numbers are dropped; the
    # leftover double space is collapsed in step 12.
    text = re.sub(r'(?<![\d/.])\b\d+\b(?![\d/.])', '', text)

    # 12. Tighten spacing/punctuation left over by the removals above.
    # (The collapse loops terminate because each replacement strictly
    # shortens the text while its pattern is present.)
    while ' .' in text:
        text = text.replace(' .', '.')

    while '..' in text:
        text = text.replace('..', '.')

    while '  ' in text:
        text = text.replace('  ', ' ')

    text = text.replace(' :', ':')
    text = text.replace('- -', '-')
    text = text.replace('. -', '.')

    # 13. Collapse runs of duplicate punctuation, keeping the last mark
    # (the backreference resolves to the final repetition captured)
    text = re.sub(r'([.,]){2,}', r'\1', text)
    text = re.sub(r'(?<=[:.])[:.]+', '', text)

    return text
|
| 80 |
+
|
| 81 |
def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
| 82 |
def calculate_similarity(embed1, embed2):
|
| 83 |
return cosine_similarity([embed1], [embed2])[0][0]
|
|
|
|
| 150 |
# Reorder sentences to maintain original flow
|
| 151 |
compressed_text.sort(key=lambda x: sentences.index(x))
|
| 152 |
|
| 153 |
+
joined_compressed_text = ' '.join(compressed_text)
|
| 154 |
+
joined_compressed_text_cleaned = clean_and_standardize_text(joined_compressed_text)
|
| 155 |
+
return joined_compressed_text_cleaned
|
| 156 |
|
| 157 |
async def predict(text, word_reduction_factor):
|
| 158 |
if len(text.split()) > 5000:
|