Spaces:

mechtnet
/

analyse_text

Sleeping

App Files Community

mechtnet committed on Jan 14, 2025

Commit

906785a

verified ·

1 Parent(s): f6dc68f

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -31

app.py CHANGED Viewed

@@ -8,6 +8,14 @@ import numpy as np
 from collections import defaultdict
 from sklearn.cluster import DBSCAN
 from sklearn.metrics.pairwise import cosine_similarity
 # Конфигурация
 MODELS = {
@@ -160,13 +168,7 @@ class TextAnalyzer:
         return float(scores[1]) - float(scores[0])
     def _get_context_embedding(self, text):
-        model = self.models['context']
-        tokenizer = self.tokenizers['context']
-        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)
-        with torch.no_grad():
-            outputs = model(**inputs)
-            embeddings = outputs.last_hidden_state.mean(dim=1)
-        return embeddings.cpu().numpy()
 class TextBlockAnalyzer:
     def __init__(self):
@@ -184,7 +186,7 @@ class TextBlockAnalyzer:
     def analyze_text(self, text):
         cleaned_text = self.preprocess_text(text)
-        blocks = self.split_into_dynamic_blocks(cleaned_text, self.EMOTION_LABELS, threshold=ANALYSIS_PARAMS['similarity_threshold'])
         if not blocks:
             return self._empty_result()
         print(f"Found {len(blocks)} blocks")
@@ -211,36 +213,22 @@ class TextBlockAnalyzer:
                 lines.append(cleaned_line)
         return '\n'.join(lines)
-    def split_into_dynamic_blocks(self, text, emotion_labels, threshold=0.7, min_block_lines=2, max_block_lines=4):
-        lines = text.split('\n')
         blocks = []
         current_block = []
-        block_emotion_vectors = []
-        for idx, line in enumerate(lines):
-            line_analysis = self.analyzer.analyze_text_block(line)
-            if line_analysis:
-                line_emotion_vector = emotion_analysis_to_vector(line_analysis['emotions'], emotion_labels)
-                if not current_block:
-                    current_block = [line]
-                    block_emotion_vectors = [line_emotion_vector]
-                else:
-                    block_vector = np.mean(block_emotion_vectors, axis=0)
-                    similarity = cosine_sim(block_vector, line_emotion_vector)
-                    if similarity >= threshold and len(current_block) < max_block_lines:
-                        current_block.append(line)
-                        block_emotion_vectors.append(line_emotion_vector)
-                    else:
-                        if len(current_block) >= min_block_lines:
-                            blocks.append('\n'.join(current_block))
-                        current_block = [line]
-                        block_emotion_vectors = [line_emotion_vector]
-        if len(current_block) >= min_block_lines:
             blocks.append('\n'.join(current_block))
         return blocks
     def _cluster_blocks(self, block_analyses):
         try:
-            embeddings = np.array([analysis['context_embedding'][0] for analysis in block_analyses])
             clustering = DBSCAN(eps=ANALYSIS_PARAMS['clustering_eps'], min_samples=ANALYSIS_PARAMS['clustering_min_samples']).fit(embeddings)
             self.emotion_clusters.clear()
             for idx, label in enumerate(clustering.labels_):

 from collections import defaultdict
 from sklearn.cluster import DBSCAN
 from sklearn.metrics.pairwise import cosine_similarity
+import spacy
+from sentence_transformers import SentenceTransformer
+# Загрузка модели для контекстных эмбеддингов
+sentence_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+# Загрузка spaCy для обработки текста
+nlp = spacy.load("ru_core_news_sm")
 # Конфигурация
 MODELS = {
         return float(scores[1]) - float(scores[0])
     def _get_context_embedding(self, text):
+        return sentence_model.encode(text)
 class TextBlockAnalyzer:
     def __init__(self):
     def analyze_text(self, text):
         cleaned_text = self.preprocess_text(text)
+        blocks = self.split_into_meaningful_blocks(cleaned_text)
         if not blocks:
             return self._empty_result()
         print(f"Found {len(blocks)} blocks")
                 lines.append(cleaned_line)
         return '\n'.join(lines)
+    def split_into_meaningful_blocks(self, text):
+        doc = nlp(text)
         blocks = []
         current_block = []
+        for sent in doc.sents:
+            current_block.append(sent.text)
+            if len(current_block) >= ANALYSIS_PARAMS['min_block_lines']:
+                blocks.append('\n'.join(current_block))
+                current_block = []
+        if current_block:
             blocks.append('\n'.join(current_block))
         return blocks
     def _cluster_blocks(self, block_analyses):
         try:
+            embeddings = np.array([analysis['context_embedding'] for analysis in block_analyses])
             clustering = DBSCAN(eps=ANALYSIS_PARAMS['clustering_eps'], min_samples=ANALYSIS_PARAMS['clustering_min_samples']).fit(embeddings)
             self.emotion_clusters.clear()
             for idx, label in enumerate(clustering.labels_):