mechtnet committed on
Commit
906785a
·
verified ·
1 Parent(s): f6dc68f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -31
app.py CHANGED
@@ -8,6 +8,14 @@ import numpy as np
8
  from collections import defaultdict
9
  from sklearn.cluster import DBSCAN
10
  from sklearn.metrics.pairwise import cosine_similarity
 
 
 
 
 
 
 
 
11
 
12
  # Конфигурация
13
  MODELS = {
@@ -160,13 +168,7 @@ class TextAnalyzer:
160
  return float(scores[1]) - float(scores[0])
161
 
162
  def _get_context_embedding(self, text):
163
- model = self.models['context']
164
- tokenizer = self.tokenizers['context']
165
- inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)
166
- with torch.no_grad():
167
- outputs = model(**inputs)
168
- embeddings = outputs.last_hidden_state.mean(dim=1)
169
- return embeddings.cpu().numpy()
170
 
171
  class TextBlockAnalyzer:
172
  def __init__(self):
@@ -184,7 +186,7 @@ class TextBlockAnalyzer:
184
 
185
  def analyze_text(self, text):
186
  cleaned_text = self.preprocess_text(text)
187
- blocks = self.split_into_dynamic_blocks(cleaned_text, self.EMOTION_LABELS, threshold=ANALYSIS_PARAMS['similarity_threshold'])
188
  if not blocks:
189
  return self._empty_result()
190
  print(f"Found {len(blocks)} blocks")
@@ -211,36 +213,22 @@ class TextBlockAnalyzer:
211
  lines.append(cleaned_line)
212
  return '\n'.join(lines)
213
 
214
- def split_into_dynamic_blocks(self, text, emotion_labels, threshold=0.7, min_block_lines=2, max_block_lines=4):
215
- lines = text.split('\n')
216
  blocks = []
217
  current_block = []
218
- block_emotion_vectors = []
219
- for idx, line in enumerate(lines):
220
- line_analysis = self.analyzer.analyze_text_block(line)
221
- if line_analysis:
222
- line_emotion_vector = emotion_analysis_to_vector(line_analysis['emotions'], emotion_labels)
223
- if not current_block:
224
- current_block = [line]
225
- block_emotion_vectors = [line_emotion_vector]
226
- else:
227
- block_vector = np.mean(block_emotion_vectors, axis=0)
228
- similarity = cosine_sim(block_vector, line_emotion_vector)
229
- if similarity >= threshold and len(current_block) < max_block_lines:
230
- current_block.append(line)
231
- block_emotion_vectors.append(line_emotion_vector)
232
- else:
233
- if len(current_block) >= min_block_lines:
234
- blocks.append('\n'.join(current_block))
235
- current_block = [line]
236
- block_emotion_vectors = [line_emotion_vector]
237
- if len(current_block) >= min_block_lines:
238
  blocks.append('\n'.join(current_block))
239
  return blocks
240
 
241
  def _cluster_blocks(self, block_analyses):
242
  try:
243
- embeddings = np.array([analysis['context_embedding'][0] for analysis in block_analyses])
244
  clustering = DBSCAN(eps=ANALYSIS_PARAMS['clustering_eps'], min_samples=ANALYSIS_PARAMS['clustering_min_samples']).fit(embeddings)
245
  self.emotion_clusters.clear()
246
  for idx, label in enumerate(clustering.labels_):
 
8
  from collections import defaultdict
9
  from sklearn.cluster import DBSCAN
10
  from sklearn.metrics.pairwise import cosine_similarity
11
+ import spacy
12
+ from sentence_transformers import SentenceTransformer
13
+
14
+ # Загрузка модели для контекстных эмбеддингов
15
+ sentence_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
16
+
17
+ # Загрузка spaCy для обработки текста
18
+ nlp = spacy.load("ru_core_news_sm")
19
 
20
  # Конфигурация
21
  MODELS = {
 
168
  return float(scores[1]) - float(scores[0])
169
 
170
  def _get_context_embedding(self, text):
171
+ return sentence_model.encode(text)
 
 
 
 
 
 
172
 
173
  class TextBlockAnalyzer:
174
  def __init__(self):
 
186
 
187
  def analyze_text(self, text):
188
  cleaned_text = self.preprocess_text(text)
189
+ blocks = self.split_into_meaningful_blocks(cleaned_text)
190
  if not blocks:
191
  return self._empty_result()
192
  print(f"Found {len(blocks)} blocks")
 
213
  lines.append(cleaned_line)
214
  return '\n'.join(lines)
215
 
216
+ def split_into_meaningful_blocks(self, text):
217
+ doc = nlp(text)
218
  blocks = []
219
  current_block = []
220
+ for sent in doc.sents:
221
+ current_block.append(sent.text)
222
+ if len(current_block) >= ANALYSIS_PARAMS['min_block_lines']:
223
+ blocks.append('\n'.join(current_block))
224
+ current_block = []
225
+ if current_block:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  blocks.append('\n'.join(current_block))
227
  return blocks
228
 
229
  def _cluster_blocks(self, block_analyses):
230
  try:
231
+ embeddings = np.array([analysis['context_embedding'] for analysis in block_analyses])
232
  clustering = DBSCAN(eps=ANALYSIS_PARAMS['clustering_eps'], min_samples=ANALYSIS_PARAMS['clustering_min_samples']).fit(embeddings)
233
  self.emotion_clusters.clear()
234
  for idx, label in enumerate(clustering.labels_):