Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -326,19 +326,25 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
|
|
| 326 |
texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
|
| 327 |
all_emotions = []
|
| 328 |
|
| 329 |
-
# Get embeddings
|
| 330 |
embeddings = []
|
| 331 |
for i, text in enumerate(texts):
|
| 332 |
-
#
|
| 333 |
-
|
| 334 |
chunk_embeddings = []
|
| 335 |
|
| 336 |
-
for
|
| 337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
chunk_embeddings.append(chunk_embedding)
|
| 339 |
|
| 340 |
-
# Combine
|
| 341 |
-
full_embedding = np.mean(chunk_embeddings, axis=0)
|
| 342 |
embeddings.append(full_embedding)
|
| 343 |
|
| 344 |
progress = (i + 1) / len(texts) * 0.4
|
|
|
|
| 326 |
texts = [clean_arabic_text(poem) for poem in group['poem'].dropna()]
|
| 327 |
all_emotions = []
|
| 328 |
|
| 329 |
+
# Compute one embedding per poem, splitting long texts into chunks so no content is dropped
|
| 330 |
embeddings = []
|
| 331 |
for i, text in enumerate(texts):
|
| 332 |
+
# Tokenize the full text first
|
| 333 |
+
full_tokens = bert_tokenizer.tokenize(text)
|
| 334 |
chunk_embeddings = []
|
| 335 |
|
| 336 |
+
# Create chunks of 510 tokens (leaving room for special tokens)
|
| 337 |
+
for start_idx in range(0, len(full_tokens), 510):
|
| 338 |
+
end_idx = start_idx + 510
|
| 339 |
+
chunk_tokens = full_tokens[start_idx:end_idx]
|
| 340 |
+
chunk_text = bert_tokenizer.convert_tokens_to_string(chunk_tokens)
|
| 341 |
+
|
| 342 |
+
# Get embedding for this chunk
|
| 343 |
+
chunk_embedding = get_embedding_for_text(chunk_text, bert_tokenizer, bert_model)
|
| 344 |
chunk_embeddings.append(chunk_embedding)
|
| 345 |
|
| 346 |
+
# Average the chunk embeddings into one poem-level vector (zero vector if the text produced no chunks)
|
| 347 |
+
full_embedding = np.mean(chunk_embeddings, axis=0) if chunk_embeddings else np.zeros(bert_model.config.hidden_size)
|
| 348 |
embeddings.append(full_embedding)
|
| 349 |
|
| 350 |
progress = (i + 1) / len(texts) * 0.4
|