Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- eda.py +12 -0
- eda_functions.py +117 -0
eda.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# EDA entry script: download the Frugal AI Challenge text dataset and export
# its train/test splits as semicolon-separated CSV files under ``outputs/``.
import os

from datasets import load_dataset
from src.utils.eda_functions import process_text, generate_word_clouds_by_category

dataset = load_dataset("quotaclimat/frugalaichallenge-text-train")
train = dataset["train"].to_pandas()
test = dataset["test"].to_pandas()

# Optional preprocessing step (disabled): tokenize/lemmatize every quote.
# train["processed_quote"] = train["quote"].apply(process_text)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("outputs", exist_ok=True)
train.to_csv("outputs/train_v1.csv", sep=";", index=False)
test.to_csv("outputs/test.csv", sep=";", index=False)

# Generate word clouds (disabled; needs the "processed_quote" column above)
# generate_word_clouds_by_category(train)
|
eda_functions.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
from nltk.corpus import stopwords
|
| 5 |
+
from wordcloud import WordCloud
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
from collections import defaultdict
|
| 8 |
+
import spacy
|
| 9 |
+
import string
|
| 10 |
+
from sklearn.decomposition import PCA
|
| 11 |
+
from sklearn.manifold import TSNE
|
| 12 |
+
|
| 13 |
+
# Load the spaCy model for English once at import time.
nlp = spacy.load("en_core_web_sm")

# Get English stop words from NLTK, stored as a set for fast membership tests.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    # The stop-word corpus is a separate download from the nltk package;
    # fetch it once on first use instead of failing at import time.
    import nltk

    nltk.download("stopwords", quiet=True)
    stop_words = set(stopwords.words("english"))
|
| 18 |
+
|
| 19 |
+
def process_text(text):
    """
    Turn a raw text into a list of cleaned, lemmatized tokens.

    Processing steps:
    1. Lowercase the text and tokenize it with the module-level spaCy pipeline.
    2. Drop tokens whose text is a stop word or punctuation.
    3. Replace each remaining token by its lemma with every character
       outside ``[a-zA-Z0-9]`` removed.
    4. Drop tokens that end up empty after that cleanup.

    Args:
        text: The text to process. Anything that is not a string (for
            example ``None`` or a NaN from a pandas column) is treated as
            having no content.

    Returns:
        list[str]: The cleaned tokens in their original order; an empty
        list for non-string or blank input.
    """
    # Missing values would otherwise crash on ``text.lower()``; blank text
    # yields no tokens anyway, so skip the spaCy call entirely.
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text.lower())

    processed_tokens = []
    for token in doc:
        if token.text in stop_words or token.text in string.punctuation:
            continue
        # Strip non-alphanumeric characters from the lemma.
        cleaned = re.sub(r'[^a-zA-Z0-9]', '', token.lemma_)
        if cleaned:
            processed_tokens.append(cleaned)

    return processed_tokens
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def generate_word_clouds_by_category(df, output_dir="wordclouds",
                                     text_column="processed_quote",
                                     label_column="label"):
    """
    Generates and saves one word cloud image per category in the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame holding, per row, an iterable of tokens
            in ``text_column`` and a category in ``label_column``.
        output_dir (str): Directory to save the word cloud images. It is
            created if it does not exist.
        text_column (str): Column with the token lists. Defaults to
            "processed_quote".
        label_column (str): Column with the category of each row. Defaults
            to "label".

    Returns:
        None. Images are written to
        ``<output_dir>/<category>_wordcloud.png`` (category lowercased,
        spaces replaced by underscores). Categories without any word are
        skipped.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Group words by category
    category_word_map = defaultdict(list)
    for category, tokens in zip(df[label_column], df[text_column]):
        category_word_map[category].extend(tokens)

    # Generate and save word clouds
    for category, words in category_word_map.items():
        if not words:
            # A word cloud cannot be built from an empty text.
            print(f"No words for category '{category}', skipping word cloud")
            continue

        # Labels are not guaranteed to be strings (e.g. integer class ids).
        category_name = str(category)
        word_cloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(words))

        # Plot and save the word cloud
        plt.figure(figsize=(10, 5))
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(category_name)

        # Save the plot as an image
        filename = os.path.join(output_dir, f"{category_name.replace(' ', '_').lower()}_wordcloud.png")
        plt.savefig(filename, bbox_inches='tight')
        print(f"Word cloud saved for category '{category}' at {filename}")

        plt.close()  # Close the figure to avoid memory issues
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def plot_embeddings_2d(df, embedding_column, label_column, method='PCA', random_state=42):
    """
    Reduce high-dimensional embeddings to 2D and show them as a scatter plot.

    Points are colored by class. Labels are encoded as integer codes first,
    so string labels work as well as numeric ones, and the legend and
    colorbar show the class names.

    Args:
        df (pd.DataFrame): DataFrame containing the embeddings and labels.
        embedding_column (str): The column containing the embeddings
            (one vector per row).
        label_column (str): The column containing the labels.
        method (str): The dimensionality reduction method ('PCA' or 'tSNE').
        random_state (int): Random state for reproducibility.

    Raises:
        ValueError: If ``method`` is neither 'PCA' nor 'tSNE'.
    """
    # Step 1: Use dimensionality reduction (PCA or t-SNE)
    if method == 'PCA':
        reducer = PCA(n_components=2, random_state=random_state)
    elif method == 'tSNE':
        reducer = TSNE(n_components=2, random_state=random_state)
    else:
        raise ValueError("Invalid method. Use 'PCA' or 'tSNE'.")

    # Reduce the embeddings to 2D
    embeddings_2d = reducer.fit_transform(df[embedding_column].tolist())

    # Encode labels as integer codes; matplotlib cannot map arbitrary strings
    # passed as ``c`` through a colormap.
    codes, classes = pd.factorize(df[label_column], sort=True)
    class_names = [str(name) for name in classes]
    tick_codes = list(range(len(class_names)))
    if (codes == -1).any():
        # factorize marks missing labels with -1, which sorts first.
        class_names.insert(0, "missing")
        tick_codes.insert(0, -1)

    # Step 2: Plot the 2D embeddings
    plt.figure(figsize=(10, 8))

    # Scatter plot, coloring points by their label
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=codes, cmap='viridis')

    # Create a legend for the classes: num=None yields one handle per unique
    # code, in ascending order, which matches the order of class_names.
    handles, _ = scatter.legend_elements(num=None)
    plt.legend(handles, class_names, title="Classes")

    # Adding labels and title
    plt.title("2D Visualization of Embeddings")
    plt.xlabel("Principal Component 1" if method == 'PCA' else "t-SNE Dimension 1")
    plt.ylabel("Principal Component 2" if method == 'PCA' else "t-SNE Dimension 2")

    # Label the colorbar ticks with class names instead of raw codes.
    colorbar = plt.colorbar(scatter, ticks=tick_codes)
    colorbar.ax.set_yticklabels(class_names)
    plt.show()
|