Spaces:
Build error
Build error
| """ | |
| Updated bow_analysis.py to include similarity metrics. | |
| Preprocessing here is more advanced than n-gram version. | |
| Pipeline: lowercase, tokenize, remove stopwords, drop non-alphabetic tokens, drop short words (fewer than 3 characters), lemmatize. | |
| """ | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from nltk.corpus import stopwords | |
| from nltk.stem import WordNetLemmatizer | |
| from nltk.tokenize import word_tokenize | |
| from processors.metrics import calculate_similarity | |
| # Currently unused imports, kept in case they are needed again | |
| import numpy as np | |
| from collections import Counter | |
| import re | |
| import nltk | |
| # Define the compare_bow_across_texts function directly in this file | |
def _preprocess_texts(texts):
    """
    Turn each raw text into a space-joined string of cleaned tokens.

    A token is kept only if it is purely alphabetic, is not an English
    stopword and is longer than two characters; kept tokens are lemmatized.

    Args:
        texts (list): Raw text responses (None is treated as empty text)

    Returns:
        list: One preprocessed string per input text, in the same order
    """
    # Nothing to tokenize: avoid loading the NLTK stopword/WordNet resources
    if not any(text and text.strip() for text in texts):
        return ["" for _ in texts]

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    preprocessed = []
    for text in texts:
        tokens = word_tokenize((text or "").lower())
        preprocessed.append(" ".join(
            lemmatizer.lemmatize(token)
            for token in tokens
            if token.isalpha() and token not in stop_words and len(token) > 2
        ))
    return preprocessed


def _count_words_per_model(preprocessed_texts, model_names):
    """
    Count word occurrences for every model that has a matching text.

    Args:
        preprocessed_texts (list): Output of _preprocess_texts
        model_names (list): Model names, positionally matched to the texts

    Returns:
        dict: {model: {word: count}} holding only words that occur at least
            once in that model's text
    """
    # A model name without a text at the same position cannot be counted;
    # indexing the count matrix with it would raise IndexError.
    paired_models = model_names[:len(preprocessed_texts)]
    no_words = {model: {} for model in paired_models}

    # CountVectorizer refuses to fit when there is no vocabulary at all
    if not any(preprocessed_texts):
        return no_words

    vectorizer = CountVectorizer(max_features=1000)
    try:
        X = vectorizer.fit_transform(preprocessed_texts)
    except ValueError:
        # Remaining tokens were all rejected by the vectorizer's own
        # tokenization, which again leaves an empty vocabulary.
        return no_words
    feature_names = vectorizer.get_feature_names_out()

    word_counts = {}
    for i, model in enumerate(paired_models):
        counts = X[i].toarray()[0]
        word_counts[model] = {
            word: int(count)
            for word, count in zip(feature_names, counts)
            if count > 0
        }
    return word_counts


def compare_bow_across_texts(texts, model_names, top_n=25):
    """
    Compare bag of words representations across multiple texts.

    Texts that contain no usable words (blank, stopwords only, ...) are
    reported with empty word lists instead of raising, and model names that
    have no text at the same position are left out of the per-model results.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
        top_n (int): Number of top words to consider

    Returns:
        dict: Bag of words analysis results with the keys "models",
            "important_words" ({model: [{"word", "count"}, ...]}),
            "word_count_matrix" ({word: {model: count}}),
            "differential_words" (list of words) and, when at least two
            model names are given, "comparisons"
            ({"<model1> vs <model2>": {"common_word_count": int}})
    """
    result = {
        "models": model_names,
        "important_words": {},
        "word_count_matrix": {},
        "differential_words": []
    }

    # Make sure we have texts to analyze
    if not texts:
        return result

    preprocessed_texts = _preprocess_texts(texts)
    word_counts = _count_words_per_model(preprocessed_texts, model_names)

    # Pivot the per-model counts into word -> {model: count}
    word_count_matrix = result["word_count_matrix"]
    for model, word_freq in word_counts.items():
        for word, count in word_freq.items():
            word_count_matrix.setdefault(word, {})[model] = count

    # Top N most frequent words for each model
    for model, word_freq in word_counts.items():
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        result["important_words"][model] = [
            {"word": word, "count": count}
            for word, count in sorted_words[:top_n]
        ]

    # Differential words and overlap are only computed for the first two models
    if len(model_names) >= 2:
        model1, model2 = model_names[0], model_names[1]

        # Absolute frequency difference per word; a missing count is 0
        diff_scores = {
            word: abs(per_model.get(model1, 0) - per_model.get(model2, 0))
            for word, per_model in word_count_matrix.items()
        }
        sorted_diffs = sorted(diff_scores.items(), key=lambda x: x[1], reverse=True)
        result["differential_words"] = [word for word, _ in sorted_diffs[:top_n]]

        # Overlap statistics: words used by both models
        model1_words = set(word_counts.get(model1, {}))
        model2_words = set(word_counts.get(model2, {}))
        common_words = model1_words & model2_words

        comparison_key = f"{model1} vs {model2}"
        result.setdefault("comparisons", {})[comparison_key] = {
            "common_word_count": len(common_words)
        }

    return result
def add_similarity_metrics(bow_results, response_texts, model_names):
    """
    Attach pairwise similarity scores to an existing bag of words result.

    Only the first two responses/models are compared. The scores are stored
    under bow_results["comparisons"]["<model1> vs <model2>"], and the dict
    is modified in place.

    Args:
        bow_results (dict): The bag of words analysis results
        response_texts (list): List of response texts to compare
        model_names (list): List of model names corresponding to responses

    Returns:
        dict: The same bow_results object, updated with similarity metrics
    """
    # Guard: a pairwise comparison needs two texts and two model names
    enough_inputs = len(response_texts) >= 2 and len(model_names) >= 2
    if not enough_inputs:
        print("Need at least two responses to calculate similarity metrics")
        return bow_results

    # Current implementation only handles a two-way comparison
    first_text, second_text = response_texts[:2]
    first_model, second_model = model_names[:2]
    pair_key = f"{first_model} vs {second_model}"

    # Create the nested comparison entry on demand
    all_pairs = bow_results.setdefault("comparisons", {})
    pair_entry = all_pairs.setdefault(pair_key, {})

    # Copy the three expected scores, defaulting to 0 when one is missing
    scores = calculate_similarity(first_text, second_text)
    for metric_name in ("cosine_similarity", "jaccard_similarity", "semantic_similarity"):
        pair_entry[metric_name] = scores.get(metric_name, 0)

    # Keep an existing common_word_count; otherwise fall back to the overlap
    # of the stored important words of the two models
    if "common_word_count" not in pair_entry and "important_words" in bow_results:
        top_words = bow_results["important_words"]
        vocab_first = {entry["word"] for entry in top_words.get(first_model, [])}
        vocab_second = {entry["word"] for entry in top_words.get(second_model, [])}
        pair_entry["common_word_count"] = len(vocab_first & vocab_second)

    return bow_results
def compare_bow(texts, model_names, top_n=25):
    """
    Run the bag of words comparison and, when possible, add similarity metrics.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
        top_n (int): Number of top words to consider

    Returns:
        dict: Comparative analysis
    """
    analysis = compare_bow_across_texts(texts, model_names, top_n)

    # Similarity metrics need a pair of texts and a pair of model names
    if len(texts) < 2 or len(model_names) < 2:
        return analysis

    return add_similarity_metrics(analysis, texts, model_names)