# Day28-Transformer-Explorer / analysis_engine.py
# tiheli's picture
# Upload 4 files
# 945614f verified
from transformers import AutoTokenizer, AutoModel
import torch
from functools import lru_cache
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
@lru_cache(maxsize=4)
def get_model_and_tokenizer(model_name):
    """Return the ``(model, tokenizer)`` pair for a Hugging Face checkpoint.

    Results are memoised per ``model_name`` (at most four entries are kept by
    the LRU cache), so repeated requests for the same name reuse the objects
    that were already loaded.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            tokenizer and the model. Must be hashable, since it is the
            cache key.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Only printed on a cache miss: cached calls never reach the body.
    print(f"Loading model: {model_name}")
    # Tokenizer is loaded first, then the model.
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    return AutoModel.from_pretrained(model_name), loaded_tokenizer
def analyze_sentence(model_name, sentence):
    """Run ``sentence`` through a transformer model and collect its internal states.

    Args:
        model_name: Checkpoint identifier passed to ``get_model_and_tokenizer``.
        sentence: Text to analyse.

    Returns:
        ``None`` when ``sentence`` or ``model_name`` is empty/falsy. Otherwise
        a dict with three keys:

        * ``"tokens"`` - token strings for the first (only) encoded sequence.
        * ``"attention"`` - ``outputs.attentions`` as returned by the model.
        * ``"hidden_states"`` - ``outputs.hidden_states`` as returned by the
          model.
    """
    if not sentence or not model_name:
        return None

    model, tokenizer = get_model_and_tokenizer(model_name)

    # Encode the sentence as PyTorch tensors. Per the Hugging Face tokenizer
    # API, ``truncation=True`` clips the input to the model's maximum length
    # when the tokenizer defines one, so an over-long sentence is shortened
    # instead of being passed to the model at a length it may not support.
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)

    # Inference only: skip gradient tracking to save memory.
    with torch.no_grad():
        # Ask the model to also return attentions and hidden states.
        outputs = model(**inputs, output_hidden_states=True, output_attentions=True)

    return {
        "tokens": tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]),
        "attention": outputs.attentions,
        "hidden_states": outputs.hidden_states,
    }
def find_closest_words(embeddings, tokens):
    """Report the two tokens of a sentence whose embeddings are most similar.

    The tokens ``[CLS]``, ``[SEP]`` and the punctuation marks ``. , ? !`` are
    excluded before comparing. Similarity is the cosine similarity between
    the remaining embedding rows.

    Args:
        embeddings: Per-token vectors. Must support indexing with a list of
            row positions and be accepted by ``cosine_similarity``
            (presumably one row per entry of ``tokens`` - confirm with the
            caller).
        tokens: Token strings, positionally aligned with ``embeddings``.

    Returns:
        A user-facing (Turkish) message string: either a notice that fewer
        than two comparable tokens were found, or a sentence naming the
        closest pair together with its similarity score (two decimals).
    """
    # Special tokens and punctuation would only add noise to the comparison.
    ignored_tokens = ("[CLS]", "[SEP]", ".", ",", "?", "!")
    kept_positions = [
        position for position, token in enumerate(tokens) if token not in ignored_tokens
    ]
    if len(kept_positions) < 2:
        return "Anlamsal yakınlık analizi için yeterli kelime bulunamadı."

    kept_tokens = [tokens[position] for position in kept_positions]

    # Pairwise cosine similarity between every remaining token.
    similarity = cosine_similarity(embeddings[kept_positions])

    # Mask self-similarity with -inf rather than -1: -1 is a legitimate cosine
    # value, so with -1 on the diagonal a sentence whose best off-diagonal
    # score is <= -1 could "win" with a diagonal cell and pair a token with
    # itself. -inf can never beat a real score.
    np.fill_diagonal(similarity, -np.inf)

    # Row/column of the single highest remaining score.
    row, col = np.unravel_index(np.argmax(similarity), similarity.shape)
    word1 = kept_tokens[row]
    word2 = kept_tokens[col]
    similarity_score = similarity[row, col]
    return f"💡 **Dinamik Analiz:** Model, bu cümlede anlamsal olarak birbirine en yakın iki kelimeyi **'{word1}'** ve **'{word2}'** olarak belirledi (Benzerlik Skoru: {similarity_score:.2f})."