Spaces:

cnmoro
/

SemanticCompression

Running

App Files Community

cnmoro committed on Jun 21, 2024

Commit

96d2af2

verified ·

1 Parent(s): 4dd98b8

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -118

app.py CHANGED Viewed

@@ -1,147 +1,102 @@
-import gradio as gr
 from minivectordb.embedding_model import EmbeddingModel
-from minivectordb.vector_database import VectorDatabase
-from multiprocessing import cpu_count
-from functools import lru_cache
-import fasttext, random, tiktoken, os, pickle
-import concurrent.futures
-os.environ['TOKENIZERS_PARALLELISM'] = 'true'
 langdetect_model = fasttext.load_model('lid.176.ftz')
-embedding_model = EmbeddingModel(onnx_model_cpu_core_count=1)
-en_stop_words = pickle.load(open("en_stopwords.pkl", "rb"))
-pt_stop_words = pickle.load(open("pt_stopwords.pkl", "rb"))
 tokenizer = tiktoken.encoding_for_model("gpt-4")
 def count_tokens_tiktoken(text):
     return len(tokenizer.encode(text))
-def detect_language_en_pt(text):
     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
-    result = str(detected_lang).replace('__label__', '')
-    if result == 'pt':
-        return 'pt'
-    return 'en'
-def generate_combinations(text, word_reduction_factor, stopwords, semantic_embeddings, num_samples=100, keep_tokens=None):
-    if keep_tokens is None:
-        keep_tokens = {"\n", ".", ",", ";", "!", "?"}
-    if word_reduction_factor is None:
-        word_reduction_factor = 0.5
-    words = text.split()
-    total_words = len(words)
-    num_remove = int(total_words * word_reduction_factor)
-    # Update index identification to exclude keep_tokens
-    stopword_indices = [i for i, word in enumerate(words) if word.lower() in stopwords and word not in keep_tokens]
-    non_stopword_indices = [i for i, word in enumerate(words) if word.lower() not in stopwords and word not in keep_tokens]
-    non_stopword_words = [word for i, word in enumerate(words) if i in non_stopword_indices]
-    # Get the embeddings for the non-stopword words
-    non_stopword_embeddings = extract_embeddings_batch(non_stopword_words)
-    # Calculate the cosine similarity between the original text embedding and the non-stopword words
-    original_text_embedding = semantic_embeddings
-    # Calculate the cosine similarity between the original text embedding and the non-stopword words
-    semantic_db = VectorDatabase()
-    ids = [i for i in range(len(non_stopword_words))]
-    metadata_dicts = [{"w": word} for word in non_stopword_words]
-    semantic_db.store_embeddings_batch(ids, non_stopword_embeddings, metadata_dicts)
-    _, _, ordered_words_metadata = semantic_db.find_most_similar(original_text_embedding, k=len(non_stopword_words))
-    ordered_words = [meta['w'] for meta in ordered_words_metadata]
-    # Create a mapping from word to index for quick lookup
-    word_to_index = {word: i for i, word in enumerate(words)}
-    # Get the ordered indices based on semantic importance (less important words last)
-    ordered_indices = [word_to_index[word] for word in ordered_words if word in word_to_index]
-    # Determine the high-priority words to always keep
-    high_priority_count = len(ordered_indices) - num_remove
-    high_priority_count = max(high_priority_count, 0)  # Ensure it's not negative
-    high_priority_indices = ordered_indices[:high_priority_count]
-    combinations = []
-    for _ in range(num_samples):
-        # Calculate remaining words to remove
-        remaining_remove = num_remove
-        # Ensure we don't try to sample more items than exist
-        if len(stopword_indices) > 0:
-            num_stop = random.randint(0, min(remaining_remove, len(stopword_indices)))
-        else:
-            num_stop = 0
-        remaining_remove -= num_stop
-        if remaining_remove > 0:
-            lower_priority_indices = ordered_indices[high_priority_count:]
-            num_non_stop = min(remaining_remove, len(lower_priority_indices))  # Ensure we don't sample more than available
-            prioritized_non_stop_indices = random.sample(lower_priority_indices, num_non_stop) if num_non_stop > 0 else []
         else:
-            prioritized_non_stop_indices = []
-        stop_comb = random.sample(stopword_indices, num_stop) if num_stop > 0 else []
-        combination = set(stop_comb + prioritized_non_stop_indices)
-        new_string = [word for i, word in enumerate(words) if i not in combination or i in high_priority_indices]
-        combinations.append(' '.join(new_string))
-    return list(set(combinations))
-@lru_cache(maxsize=50000)
-def extract_embeddings(text):
-    return embedding_model.extract_embeddings(text)
-def extract_embeddings_batch(texts):
-    return [extract_embeddings(text) for text in texts]
-def compress_semantically(input_text, word_reduction_factor=0.35):
-    num_samples = 500
-    word_count = len(input_text.split())
-    thresholds = [(1500, 80), (1000, 90), (700, 110), (500, 130), (250, 160)]
-    for threshold, value in thresholds:
-        if word_count > threshold:
-            num_samples = value
-            break
-    semantic_embeddings = extract_embeddings(input_text)
-    text_lang = detect_language_en_pt(input_text)
-    stopwords = en_stop_words if text_lang == 'en' else pt_stop_words
-    text_combinations = generate_combinations(input_text, word_reduction_factor, stopwords, semantic_embeddings, num_samples=num_samples)
-    n = int(num_samples / cpu_count())
-    # Aggregate text_combinations into blocks of "n"
-    text_combinations_chunks = [text_combinations[i:i + n] for i in range(0, len(text_combinations), n)]
-    # Calculate the embeddings for each combination
-    combinations_embeddings = []
-    with concurrent.futures.ProcessPoolExecutor(max_workers=cpu_count()) as executor:
-        for embeddings in executor.map(extract_embeddings_batch, text_combinations_chunks):
-            combinations_embeddings.extend(embeddings)
-    semantic_db = VectorDatabase()
-    unique_ids = [ i for i in range(len(text_combinations)) ]
-    metadata_dicts = [ {"text": text} for text in text_combinations ]
-    semantic_db.store_embeddings_batch(unique_ids, combinations_embeddings, metadata_dicts)
-    _, _, result = semantic_db.find_most_similar(semantic_embeddings, k=1)
-    best_compressed_sentence = result[0]['text']
-    return best_compressed_sentence
 async def predict(text, word_reduction_factor):
     if len(text.split()) > 700:
         return "Text is too long for this demo. Please provide a text with less than 700 words."
-    compressed = compress_semantically(text, word_reduction_factor = word_reduction_factor)
     perc_reduction = round(100 - (count_tokens_tiktoken(compressed) / count_tokens_tiktoken(text)) * 100, 2)
     return f"{compressed}\n\nToken Reduction: {perc_reduction}%"
@@ -162,7 +117,7 @@ reduction_factor = gr.Slider(
     value=0.5,
     step=0.05,
     interactive=True,
-    label="Word Reduction Factor"
 )
 # Create the gradio interface
 gr.Interface(

+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.decomposition import LatentDirichletAllocation
 from minivectordb.embedding_model import EmbeddingModel
+from sklearn.metrics.pairwise import cosine_similarity
+import tiktoken, nltk, numpy as np, fasttext, pickle
+from nltk.tokenize import sent_tokenize
+import gradio as gr
+# NLTK data: 'punkt' backs sent_tokenize. The 'stopwords' corpus is fetched as
+# well, although the code shown here takes its stop words from the pickles below.
+nltk.download('punkt')
+nltk.download('stopwords')
 langdetect_model = fasttext.load_model('lid.176.ftz')
+embedding_model = EmbeddingModel(onnx_model_cpu_core_count=2)
+# Load the stop-word collections inside `with` blocks so the file handles are
+# closed deterministically instead of being left to the garbage collector.
+# NOTE: pickle.load can execute arbitrary code; only ship trusted .pkl files.
+with open("en_stopwords.pkl", "rb") as stopwords_file:
+    english_stopwords = pickle.load(stopwords_file)
+with open("pt_stopwords.pkl", "rb") as stopwords_file:
+    portuguese_stopwords = pickle.load(stopwords_file)
 tokenizer = tiktoken.encoding_for_model("gpt-4")
 def count_tokens_tiktoken(text):
+    """Return how many tokens the module-level tiktoken encoder yields for ``text``."""
+    token_ids = tokenizer.encode(text)
+    return len(token_ids)
+def detect_language(text):
+    """Classify ``text`` as Portuguese (``'pt'``) or, for anything else, English (``'en'``).
+
+    Newlines are replaced by spaces before the text is handed to the fastText
+    language-identification model; only the top-1 label is inspected.
+    """
     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
+    is_portuguese = str(detected_lang) in ('__label__pt', 'portuguese')
+    return 'pt' if is_portuguese else 'en'
+def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
+    """Extractively compress ``full_text`` by keeping its highest-scoring sentences.
+
+    Each sentence is scored as a weighted sum of three signals:
+
+    * 0.4 x cosine similarity between the sentence embedding and the
+      embedding of the whole text,
+    * 0.4 x the largest LDA topic probability of the sentence,
+    * 0.2 x the ratio of unique non-stopword tokens to all words.
+
+    Sentences are accepted in descending score order until the next one would
+    exceed the word budget, then emitted in their original order.
+
+    Args:
+        full_text: Text to compress; split into sentences with NLTK's
+            ``sent_tokenize``.
+        compression_rate: Fraction of the original word count to keep.
+        num_topics: Number of topics for the LDA model fitted on the sentences.
+
+    Returns:
+        The kept sentences joined by single spaces. Empty when the text has no
+        sentences or when the best-ranked sentence alone exceeds the budget.
+    """
+    def calculate_similarity(embed1, embed2):
+        return cosine_similarity([embed1], [embed2])[0][0]
+
+    def create_lda_model(texts, stopwords):
+        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stopwords)
+        doc_term_matrix = vectorizer.fit_transform(texts)
+        lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
+        lda.fit(doc_term_matrix)
+        return lda, vectorizer
+
+    def get_topic_distribution(text, lda, vectorizer):
+        vec = vectorizer.transform([text])
+        return lda.transform(vec)[0]
+
+    def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
+        sentence_embedding = embedding_model.extract_embeddings(sentence)
+        semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
+        if lda_model is None:
+            # No topic model could be fitted (see below): drop the topic signal.
+            topic_importance = 0.0
+        else:
+            topic_importance = np.max(get_topic_distribution(sentence, lda_model, vectorizer))
+        # Lexical diversity: unique non-stopword tokens relative to all words.
+        words = sentence.split()
+        unique_words = {word.lower() for word in words if word.lower() not in stopwords}
+        lexical_diversity = len(unique_words) / len(words) if words else 0
+        # Combine factors (weights can be adjusted as needed).
+        return (0.4 * semantic_similarity) + (0.4 * topic_importance) + (0.2 * lexical_diversity)
+
+    # Split the text into sentences; nothing to compress when there are none.
+    sentences = sent_tokenize(full_text)
+    if not sentences:
+        return ''
+    text_lang = detect_language(full_text)
+    # Pick the stop-word collection once and reuse it for both consumers.
+    stopwords = portuguese_stopwords if text_lang == 'pt' else english_stopwords
+    stopword_set = set(stopwords)  # O(1) membership tests while scoring
+    # CountVectorizer raises ValueError when min_df=2 / max_df=0.95 leave no
+    # usable vocabulary, which always happens for texts of one or two
+    # sentences. Fall back to scoring without the topic signal instead of
+    # aborting. stop_words is passed as a list, the documented container type.
+    try:
+        lda_model, vectorizer = create_lda_model(sentences, list(stopwords))
+    except ValueError:
+        lda_model = vectorizer = None
+    # Get document-level embedding
+    doc_embedding = embedding_model.extract_embeddings(full_text)
+    # Score every sentence, remembering its position so that repeated
+    # sentences keep their own place when the original order is restored.
+    scored_sentences = [
+        (position, sentence,
+         sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopword_set))
+        for position, sentence in enumerate(sentences)
+    ]
+    # Sort sentences by importance (stable: ties keep document order)
+    ranked_sentences = sorted(scored_sentences, key=lambda item: item[2], reverse=True)
+    # Determine how many words to keep
+    total_words = sum(len(sentence.split()) for sentence in sentences)
+    target_words = int(total_words * compression_rate)
+    # Take sentences in score order until one no longer fits the budget
+    kept = []
+    current_words = 0
+    for position, sentence, _ in ranked_sentences:
+        sentence_words = len(sentence.split())
+        if current_words + sentence_words <= target_words:
+            kept.append((position, sentence))
+            current_words += sentence_words
         else:
+            break
+    # Reorder sentences to maintain original flow
+    kept.sort(key=lambda item: item[0])
+    return ' '.join(sentence for _, sentence in kept)
 async def predict(text, word_reduction_factor):
+    """Compress ``text`` and append the percentage of tokens saved.
+
+    Args:
+        text: Input text; rejected with a message when it has more than 700
+            whitespace-separated words.
+        word_reduction_factor: Fraction of the words to remove.
+
+    Returns:
+        The compressed text followed by a ``Token Reduction: N%`` line, or the
+        length-limit message.
+    """
     if len(text.split()) > 700:
         return "Text is too long for this demo. Please provide a text with less than 700 words."
+    # semantic_compress_text expects the fraction of words to KEEP, under the
+    # keyword `compression_rate`; passing `word_reduction_factor=` raised
+    # TypeError on every call.
+    compressed = semantic_compress_text(text, compression_rate=1 - word_reduction_factor)
+    original_tokens = count_tokens_tiktoken(text)
+    if original_tokens == 0:
+        # Empty input: nothing to compare against, so avoid dividing by zero.
+        perc_reduction = 0.0
+    else:
+        perc_reduction = round(100 - (count_tokens_tiktoken(compressed) / original_tokens) * 100, 2)
     return f"{compressed}\n\nToken Reduction: {perc_reduction}%"
     value=0.5,
     step=0.05,
     interactive=True,
+    label="Reduction Factor"
 )
 # Create the gradio interface
 gr.Interface(