omerariel123 committed on
Commit
7aa6e48
·
verified ·
1 Parent(s): 56168b8

Upload taylor's_version_data_science_–_assignment_3_(group_project).py

Browse files
taylor's_version_data_science_–_assignment_3_(group_project).py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""Taylor's Version Data Science – Assignment 3 (Group Project).ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1UINZIWnD3atFdcs5TWUxZ_Af7c5gLhrZ
"""

# Install the Hugging Face dataset/hub clients.
# NOTE(review): `!pip` lines are Jupyter/Colab magics — they are not valid
# plain-Python and this script only runs inside a notebook environment.
!pip install -q datasets huggingface_hub
!pip install -U datasets

from google.colab import userdata
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token stored in Colab's
# secrets manager, so no credential is hard-coded in the notebook.
hf_token = userdata.get("HF_TOKEN")  # pulls the token saved in Colab secrets
login(hf_token)
18
import pandas as pd

# CSV file name for each split of the hita/social-behavior-emotions dataset.
splits = {'train': 'train_clean.csv', 'validation': 'val_clean.csv', 'test': 'test_clean.csv'}
# Load only the training split, streamed from the Hub via the hf:// protocol.
df = pd.read_csv("hf://datasets/hita/social-behavior-emotions/" + splits["train"])

# Visual sanity check of the first rows (rendered by the notebook).
df.head(10)
25
+
26
+ # מנקה את שם העמודה עם שגיאת הכתיב (אם צריך)
27
+ df = df.rename(columns={"lable": "Label"})
28
+
29
+ # הצגת מידע כללי על הדאטאסט
30
+ print("=== Dataset Info ===")
31
+ print(df.info())
32
+
33
+ print("\n=== Missing Values ===")
34
+ print(df.isnull().sum())
35
+
36
+ print("\n=== Duplicate Rows ===")
37
+ print(df.duplicated().sum())
38
+
39
+ print("\n=== Label Distribution ===")
40
+ print(df['Label'].value_counts())
41
+
42
# Bar chart of the emotion-label distribution, most frequent first.
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='Label', order=df['Label'].value_counts().index)
plt.title("Distribution of Emotion Labels")
plt.xlabel("Emotion")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()

# Text length in words for every review (NaNs become the string "nan" → length 1).
df['text_length'] = df['CleanReview'].apply(lambda x: len(str(x).split()))

plt.figure(figsize=(8, 5))
sns.histplot(df['text_length'], bins=30, kde=True)
plt.title("Distribution of Text Lengths (Word Count)")
plt.xlabel("Number of Words")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()
66
+
67
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Split all review texts into lowercase words (no NLTK needed).
all_words = " ".join(df['CleanReview'].dropna()).lower().split()
word_freq = Counter(all_words)

# Take the 20 most common words.
top_words = word_freq.most_common(20)
words, counts = zip(*top_words)

# Horizontal bar chart of word frequencies.
plt.figure(figsize=(10, 5))
sns.barplot(x=list(counts), y=list(words))
plt.title("Top 20 Most Common Words")
plt.xlabel("Frequency")
plt.ylabel("Word")
plt.tight_layout()
plt.show()
87
+
88
from collections import Counter


def _sentence_length_distribution(frame):
    """Return a table mapping word count -> number of rows with that count.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'CleanReview' text column; NaN rows are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ["Word Count", "Sentence Count"], sorted ascending by word count.
    """
    word_counts = frame['CleanReview'].dropna().apply(lambda x: len(str(x).split()))
    distribution = Counter(word_counts)
    out = pd.DataFrame(distribution.items(), columns=["Word Count", "Sentence Count"])
    return out.sort_values(by="Word Count").reset_index(drop=True)


# Length distribution before filtering.
print(_sentence_length_distribution(df))

# Drop one-word (and empty) reviews: they carry too little signal to classify.
df = df[df['text_length'] > 1].reset_index(drop=True)

# Length distribution after filtering. The original notebook repeated the whole
# Counter -> DataFrame -> print pipeline verbatim (and re-imported pandas);
# the duplicate is folded into the helper above.
print(_sentence_length_distribution(df))
120
+
121
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# 1. Drop exact duplicate rows.
df = df.drop_duplicates()
print(f"✅ DataFrame shape after dropping duplicates: {df.shape}")

# 2. Text length by emotion (box plot per label).
sns.boxplot(x='Label', y='text_length', data=df)
plt.title("Text Length by Emotion")
plt.xlabel("Emotion")
plt.ylabel("Number of Words")
plt.tight_layout()
plt.show()

# 3. One WordCloud per emotion, with standard English stopwords removed.
emotions = df['Label'].unique()

for emotion in emotions:
    text = " ".join(df[df['Label'] == emotion]['CleanReview'].dropna()).lower()
    text = " ".join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"WordCloud for '{emotion}'")
    plt.show()
153
+
154
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Custom stopword list: standard English stopwords plus generic
#    high-frequency words that otherwise dominate every emotion's cloud.
custom_stopwords = ENGLISH_STOP_WORDS.union({'feel', 'know', 'make', 'think', 'time','really','want'})

# 2. One WordCloud per emotion, with the custom stopwords filtered out.
print("✅ WordClouds per emotion (without stopwords):")
emotions = df['Label'].unique()

for emotion in emotions:
    text = " ".join(df[df['Label'] == emotion]['CleanReview'].dropna()).lower()
    words = [word for word in text.split() if word not in custom_stopwords]
    filtered_text = " ".join(words)

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(filtered_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"WordCloud for '{emotion}' (No Stopwords)")
    plt.show()
178
+
179
# 3. Balance the emotions by undersampling every class down to the rarest one.
min_count = df['Label'].value_counts().min()

# GroupBy.sample replaces the deprecated groupby().apply(lambda x: x.sample(...))
# pattern (modern pandas warns that apply operates on the grouping column) and
# draws the same per-class sample sizes with the same seed semantics.
df_balanced = df.groupby('Label').sample(n=min_count, random_state=42).reset_index(drop=True)

print("\n✅ New label distribution after balancing:")
print(df_balanced['Label'].value_counts())
186
+
187
+ """# **2. Embeddings**"""
188
+
189
+ !pip install -q datasets sentence-transformers umap-learn scikit-learn matplotlib pandas
190
+ !pip install -q scikit-learn umap-learn
191
+
192
+ # שלב 2: יצירת embeddings
193
+ from sentence_transformers import SentenceTransformer
194
+
195
+ texts = list(df["CleanReview"])
196
+ labels = df["Label"]
197
+ label_to_id = {label: idx for idx, label in enumerate(set(labels))}
198
+ true_ids = [label_to_id[label] for label in labels]
199
+ model = SentenceTransformer("j-hartmann/emotion-english-distilroberta-base")
200
+ embeddings = model.encode(texts, show_progress_bar=True)
201
+
202
import umap.umap_ as umap

# Project the embeddings down to 2-D (for visualization only — the clustering
# below runs on the full-dimensional embeddings).
reducer = umap.UMAP(n_components=2, random_state=42)
embeddings_2d = reducer.fit_transform(embeddings)

from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# KMeans with as many clusters as there are distinct emotion labels.
kmeans = KMeans(n_clusters=len(set(labels)), random_state=42)
kmeans_labels = kmeans.fit_predict(embeddings)

# Agreement between clusters and true labels (both metrics are
# permutation-invariant, so cluster numbering doesn't matter).
ari_kmeans = adjusted_rand_score(true_ids, kmeans_labels)
nmi_kmeans = normalized_mutual_info_score(true_ids, kmeans_labels)

# Print results.
print(f"KMeans:\n - ARI: {ari_kmeans:.3f}\n - NMI: {nmi_kmeans:.3f}")
221
+
222
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Number of mixture components = number of distinct emotions.
n_components = len(set(true_ids))

# Fit a Gaussian Mixture Model on the full-dimensional embeddings.
gmm = GaussianMixture(n_components=n_components, random_state=42)
gmm_labels = gmm.fit_predict(embeddings)

# Cluster/label agreement scores (permutation-invariant).
ari_gmm = adjusted_rand_score(true_ids, gmm_labels)
nmi_gmm = normalized_mutual_info_score(true_ids, gmm_labels)

print(f"GMM:\n - ARI: {ari_gmm:.3f}\n - NMI: {nmi_gmm:.3f}")
237
+
238
import matplotlib.pyplot as plt
import pandas as pd

# Gather the 2-D projection and all three labelings into a single frame.
df_plot = pd.DataFrame(embeddings_2d, columns=["x", "y"])
df_plot["true"] = true_ids
df_plot["kmeans"] = kmeans_labels
df_plot["gmm"] = gmm_labels

# Three side-by-side scatter panels: ground truth vs. the two clusterings.
# Each panel is colored solely by its own label column.
panels = [
    ("true", "True Labels"),
    ("kmeans", "KMeans (mpnet)"),
    ("gmm", "GMM (mpnet)"),
]
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (column, caption) in zip(axes, panels):
    ax.scatter(df_plot["x"], df_plot["y"], c=df_plot[column], cmap="tab10", alpha=0.6)
    ax.set_title(caption)

plt.tight_layout()
plt.show()
260
+
261
from collections import Counter

# Assign each GMM cluster the majority true emotion among its members.
cluster_to_label = {}
for cid in range(gmm.n_components):
    member_labels = [true_id for true_id, assigned in zip(true_ids, gmm_labels) if assigned == cid]
    # Most frequent true-label id inside this cluster wins.
    cluster_to_label[cid] = Counter(member_labels).most_common(1)[0][0]

# Invert label_to_id so cluster ids can be translated back to emotion names.
id_to_label = {v: k for k, v in label_to_id.items()}
cluster_to_emotion = {cluster: id_to_label[label_id] for cluster, label_id in cluster_to_label.items()}

print("מיפוי קלאסטרים → רגשות:")
print(cluster_to_emotion)
281
+
282
import joblib

# Record the sentence-embedding model's Hub name (the model itself is
# re-downloaded by name at inference time, so only the name is persisted).
embedding_model_name = "j-hartmann/emotion-english-distilroberta-base"

# Persist the fitted GMM.
joblib.dump(gmm, "gmm_model.pkl")

# Persist the cluster -> emotion mapping.
# NOTE(review): JSON stringifies the integer cluster keys; loaders must
# convert keys back to int if they index by cluster id.
import json
with open("cluster_to_emotion.json", "w") as f:
    json.dump(cluster_to_emotion, f)
294
+
295
+ """# 3. Inputs & outputs."""
296
+
297
+ !pip install -q transformers datasets gradio sentencepiece
298
+ !pip install -q sentence-transformers gradio
299
+
300
+ """song list & lyrics dataset setup"""
301
+
302
+ import pandas as pd
303
+
304
+ song_db = pd.read_parquet("hf://datasets/johanf/taylor-swift/data/train-00000-of-00001.parquet")
305
+ # Keep only needed columns and drop rows with missing values
306
+ song_db = song_db[["lyrics", "title"]].dropna().drop_duplicates()
307
+
308
+ # Optional: remove extra whitespace
309
+ song_db["lyrics"] = song_db["lyrics"].str.strip()
310
+ song_db["title"] = song_db["title"].str.strip()
311
+
312
+ # Reset index for convenience
313
+ song_db = song_db.reset_index(drop=True)
314
+
315
+ # Preview
316
+ song_db.head()
317
+
318
from sentence_transformers import SentenceTransformer

# Load a pre-trained model for semantic similarity
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Compute embeddings for lyrics (takes ~10–20 sec)
lyrics_list = song_db["lyrics"].tolist()
lyrics_embeddings = embedding_model.encode(lyrics_list, show_progress_bar=True)
326
+
327
+ """Function that finds the best-fitting song
328
+ The find_matching_song function uses a semantic similarity approach to match user input with Taylor Swift lyrics. First, it transforms the user's sentence into a numerical vector using a pre-trained sentence embedding model (all-MiniLM-L6-v2). Then, it compares this vector to pre-computed embeddings of all song lyrics using cosine similarity, which measures how close the meanings are in semantic space. Finally, it ranks the songs by similarity and returns the top matches, showing their titles, a short lyric snippet, and a similarity score.
329
+
330
+
331
+
332
+
333
+ """
334
+
335
import numpy as np


def find_matching_song(user_input, top_k=3, *, model=None, embeddings=None, db=None):
    """Find the songs whose lyrics are semantically closest to a sentence.

    Encodes ``user_input`` with the sentence-embedding model, ranks every song
    by cosine similarity against the precomputed lyric embeddings, and returns
    a Markdown-formatted string with the ``top_k`` best matches.

    Parameters
    ----------
    user_input : str
        Free-form sentence describing the user's thought or mood.
    top_k : int, default 3
        Number of top matches to include.
    model, embeddings, db : keyword-only, optional
        Encoder, precomputed lyric embedding matrix, and song table.
        Default to the module-level globals (``embedding_model``,
        ``lyrics_embeddings``, ``song_db``); injectable for testing/reuse.

    Returns
    -------
    str
        Markdown blocks (title, similarity score, 200-char lyric snippet),
        joined by blank lines, best match first.
    """
    if model is None:
        model = embedding_model
    if embeddings is None:
        embeddings = lyrics_embeddings
    if db is None:
        db = song_db

    # Encode the user input into the same embedding space as the lyrics.
    user_vec = np.asarray(model.encode([user_input]), dtype=float).reshape(-1)
    matrix = np.asarray(embeddings, dtype=float)

    # Cosine similarity with every song, computed with the NumPy already in
    # scope (replaces the single sklearn cosine_similarity call); zero norms
    # are guarded so there is never a divide-by-zero.
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(user_vec)
    similarities = (matrix @ user_vec) / np.where(norms == 0.0, 1.0, norms)

    # Indices of the top_k most similar songs, best first.
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    # Build the Markdown output list.
    results = []
    for idx in top_indices:
        title = db.loc[idx, "title"]
        lyric_snippet = db.loc[idx, "lyrics"][:200].replace("\n", " ") + "..."
        score = similarities[idx]
        results.append(f"🎵 **{title}** — _match: {score:.2f}_ \n`{lyric_snippet}`")

    return "\n\n".join(results)
357
+
358
+ """the app"""
359
+
360
+ import gradio as gr
361
+
362
+ demo = gr.Interface(
363
+ fn=find_matching_song,
364
+ inputs=gr.Textbox(
365
+ lines=2,
366
+ placeholder="What's on your mind? (e.g. I had ice cream today!)",
367
+ label="Enter your thought or mood"
368
+ ),
369
+ outputs=gr.Markdown(),
370
+ title="🎤 Taylor Swift Lyric Matcher",
371
+ description="Enter a sentence, and get the Taylor Swift song with lyrics most semantically similar.",
372
+ examples=[
373
+ ["I just broke up with my boyfriend"],
374
+ ["I feel nostalgic about my childhood"],
375
+ ["I got revenge on someone"],
376
+ ["I'm in love and the world feels perfect"],
377
+ ["I had ice cream today!"]
378
+ ]
379
+ )
380
+
381
+ #app launch
382
+ demo.launch(share=True)