Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,14 @@ import numpy as np
|
|
| 8 |
from collections import defaultdict
|
| 9 |
from sklearn.cluster import DBSCAN
|
| 10 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Конфигурация
|
| 13 |
MODELS = {
|
|
@@ -160,13 +168,7 @@ class TextAnalyzer:
|
|
| 160 |
return float(scores[1]) - float(scores[0])
|
| 161 |
|
| 162 |
def _get_context_embedding(self, text):
|
| 163 |
-
|
| 164 |
-
tokenizer = self.tokenizers['context']
|
| 165 |
-
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)
|
| 166 |
-
with torch.no_grad():
|
| 167 |
-
outputs = model(**inputs)
|
| 168 |
-
embeddings = outputs.last_hidden_state.mean(dim=1)
|
| 169 |
-
return embeddings.cpu().numpy()
|
| 170 |
|
| 171 |
class TextBlockAnalyzer:
|
| 172 |
def __init__(self):
|
|
@@ -184,7 +186,7 @@ class TextBlockAnalyzer:
|
|
| 184 |
|
| 185 |
def analyze_text(self, text):
|
| 186 |
cleaned_text = self.preprocess_text(text)
|
| 187 |
-
blocks = self.
|
| 188 |
if not blocks:
|
| 189 |
return self._empty_result()
|
| 190 |
print(f"Found {len(blocks)} blocks")
|
|
@@ -211,36 +213,22 @@ class TextBlockAnalyzer:
|
|
| 211 |
lines.append(cleaned_line)
|
| 212 |
return '\n'.join(lines)
|
| 213 |
|
| 214 |
-
def
|
| 215 |
-
|
| 216 |
blocks = []
|
| 217 |
current_block = []
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
current_block = [line]
|
| 225 |
-
block_emotion_vectors = [line_emotion_vector]
|
| 226 |
-
else:
|
| 227 |
-
block_vector = np.mean(block_emotion_vectors, axis=0)
|
| 228 |
-
similarity = cosine_sim(block_vector, line_emotion_vector)
|
| 229 |
-
if similarity >= threshold and len(current_block) < max_block_lines:
|
| 230 |
-
current_block.append(line)
|
| 231 |
-
block_emotion_vectors.append(line_emotion_vector)
|
| 232 |
-
else:
|
| 233 |
-
if len(current_block) >= min_block_lines:
|
| 234 |
-
blocks.append('\n'.join(current_block))
|
| 235 |
-
current_block = [line]
|
| 236 |
-
block_emotion_vectors = [line_emotion_vector]
|
| 237 |
-
if len(current_block) >= min_block_lines:
|
| 238 |
blocks.append('\n'.join(current_block))
|
| 239 |
return blocks
|
| 240 |
|
| 241 |
def _cluster_blocks(self, block_analyses):
|
| 242 |
try:
|
| 243 |
-
embeddings = np.array([analysis['context_embedding']
|
| 244 |
clustering = DBSCAN(eps=ANALYSIS_PARAMS['clustering_eps'], min_samples=ANALYSIS_PARAMS['clustering_min_samples']).fit(embeddings)
|
| 245 |
self.emotion_clusters.clear()
|
| 246 |
for idx, label in enumerate(clustering.labels_):
|
|
|
|
| 8 |
from collections import defaultdict
|
| 9 |
from sklearn.cluster import DBSCAN
|
| 10 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 11 |
+
import spacy
|
| 12 |
+
from sentence_transformers import SentenceTransformer
|
| 13 |
+
|
| 14 |
+
# Загрузка модели для контекстных эмбеддингов
|
| 15 |
+
sentence_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
|
| 16 |
+
|
| 17 |
+
# Загрузка spaCy для обработки текста
|
| 18 |
+
nlp = spacy.load("ru_core_news_sm")
|
| 19 |
|
| 20 |
# Конфигурация
|
| 21 |
MODELS = {
|
|
|
|
| 168 |
return float(scores[1]) - float(scores[0])
|
| 169 |
|
| 170 |
def _get_context_embedding(self, text):
|
| 171 |
+
return sentence_model.encode(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
class TextBlockAnalyzer:
|
| 174 |
def __init__(self):
|
|
|
|
| 186 |
|
| 187 |
def analyze_text(self, text):
|
| 188 |
cleaned_text = self.preprocess_text(text)
|
| 189 |
+
blocks = self.split_into_meaningful_blocks(cleaned_text)
|
| 190 |
if not blocks:
|
| 191 |
return self._empty_result()
|
| 192 |
print(f"Found {len(blocks)} blocks")
|
|
|
|
| 213 |
lines.append(cleaned_line)
|
| 214 |
return '\n'.join(lines)
|
| 215 |
|
| 216 |
+
def split_into_meaningful_blocks(self, text):
|
| 217 |
+
doc = nlp(text)
|
| 218 |
blocks = []
|
| 219 |
current_block = []
|
| 220 |
+
for sent in doc.sents:
|
| 221 |
+
current_block.append(sent.text)
|
| 222 |
+
if len(current_block) >= ANALYSIS_PARAMS['min_block_lines']:
|
| 223 |
+
blocks.append('\n'.join(current_block))
|
| 224 |
+
current_block = []
|
| 225 |
+
if current_block:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
blocks.append('\n'.join(current_block))
|
| 227 |
return blocks
|
| 228 |
|
| 229 |
def _cluster_blocks(self, block_analyses):
|
| 230 |
try:
|
| 231 |
+
embeddings = np.array([analysis['context_embedding'] for analysis in block_analyses])
|
| 232 |
clustering = DBSCAN(eps=ANALYSIS_PARAMS['clustering_eps'], min_samples=ANALYSIS_PARAMS['clustering_min_samples']).fit(embeddings)
|
| 233 |
self.emotion_clusters.clear()
|
| 234 |
for idx, label in enumerate(clustering.labels_):
|