#!/usr/bin/env python
# coding: utf-8

# Notebook export: loads the marketing sample data, normalises its columns,
# derives keyword tags with NLTK and builds a simple rating-based top-10 list.
# Bare expressions such as `data.shape` are notebook display leftovers and
# have no effect when this file is run as a script.

# In[1]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity  # measure similarity between two vectors, to assess how alike two data points are
from sklearn.feature_extraction.text import TfidfVectorizer  # TfidfVectorizer converts text data into a matrix of numerical features
import os  # Importing the os module makes these functions available in your code.
from scipy.sparse import coo_matrix

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required resources if missing
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Punkt tokenizer: probe it once and download the models only when the probe fails
try:
    word_tokenize("test sentence")
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')

# In[2]:

data = pd.read_csv("marketing_sample_data.tsv", sep='\t')  # tab-separated file

# In[3]:

data.columns

# In[4]:

# Keep only the columns used below. The explicit copy makes `data` an
# independent frame, so the in-place edits that follow never act on a view of
# the frame returned by read_csv.
data = data[['Uniq Id', 'Crawl Timestamp', 'Product Id', 'Product Rating',
             'Product Reviews Count', 'Product Category', 'Product Brand',
             'Product Name', 'Product Image Url', 'Product Url',
             'Product Description', 'Product Available Inventory',
             'Product Tags']].copy()

# In[5]:

data["Product Tags"]

# In[6]:

data.shape

# In[7]:

# Missing numeric values become 0, missing text values become ''.
data["Product Rating"] = data["Product Rating"].fillna(0)
data["Product Reviews Count"] = data["Product Reviews Count"].fillna(0)
data["Product Category"] = data["Product Category"].fillna('')
data["Product Brand"] = data["Product Brand"].fillna('')
data["Product Description"] = data["Product Description"].fillna('')
data["Product Url"] = data["Product Url"].fillna('')

# In[8]:

data.isnull().sum()

# In[9]:

data.duplicated().sum()  # no duplicate value in data

# In[10]:

# Short column names used by the rest of the script. Keys that are not present
# in the frame (e.g. 'Product Contents') are ignored by rename().
column_name = {
    'Uniq Id': 'ID',
    'Crawl Timestamp': 'Time',
    'Product Id': 'ProdID',
    'Product Rating': 'Rating',
    'Product Reviews Count': 'ReviewCount',
    'Product Category': 'Category',
    'Product Brand': 'Brand',
    'Product Name': 'Name',
    'Product Image Url': 'ImageURL',
    'Product Url': 'ProductURL',
    'Product Description': 'Description',
    'Product Available Inventory': 'ProductInventory',
    'Product Tags': 'Tags',
    'Product Contents': 'Contents',
}
data.rename(columns=column_name, inplace=True)

# In[11]:

# Keep the first run of digits of each identifier as a float.
# expand=False makes extract() return a Series, so a Series (not a one-column
# DataFrame) is assigned to the column.
data['ID'] = data['ID'].str.extract(r'(\d+)', expand=False).astype(float)
data['ProdID'] = data['ProdID'].str.extract(r'(\d+)', expand=False).astype(float)

# In[12]:

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download NLTK data if not already
#nltk.download('punkt')
#nltk.download('stopwords')

stop_words = set(stopwords.words('english'))


def clean_and_extract_tags_nltk(text):
    """Return the keyword tags of *text* as one comma-separated string.

    The text is lower-cased and tokenised with NLTK's ``word_tokenize``;
    tokens that are not purely alphanumeric or that are English stop words
    are dropped. Non-string input (e.g. NaN) yields an empty string.
    """
    if not isinstance(text, str):
        return ''
    # Lowercase, tokenize
    tokens = word_tokenize(text.lower())
    # Remove punctuation and stopwords, keep alphanumeric tokens only
    tags = [token for token in tokens if token.isalnum() and token not in stop_words]
    return ', '.join(tags)


columns_to_extract_tags_from = ['Category', 'Brand', 'Description']

for column in columns_to_extract_tags_from:
    data[column] = data[column].fillna('')
    data[column + '_tags'] = data[column].apply(clean_and_extract_tags_nltk)

# In[13]:

# Build 'Tags' from the cleaned '<column>_tags' columns computed above (the
# raw text columns were joined here before, which left the cleaned columns
# unused). Empty parts are skipped so no stray ', ' separators appear.
tag_columns = [column + '_tags' for column in columns_to_extract_tags_from]
data['Tags'] = data[tag_columns].apply(
    lambda row: ', '.join(part for part in row if part), axis=1
)

# In[14]:

average_ratings = data.groupby(['Name', 'ReviewCount', 'Brand', 'ImageURL'])['Rating'].mean().reset_index()
top_rated_items = average_ratings.sort_values(by='Rating', ascending=False)

# astype() returns a new frame, so the integer cast is applied to an
# independent top-10 table instead of being written through .loc into the
# result of head().
rating_base_recommendation = top_rated_items.head(10).astype(
    {'Rating': int, 'ReviewCount': int}
)

print("Rating Base Recommendation System: (Trending Products)")
rating_base_recommendation[['Name', 'Rating', 'ReviewCount', 'Brand', 'ImageURL']]

# In[15]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib import pandas as pd # 1. Prepare combined text features column (Name + Category + Brand + Description) data['combined_features'] = ( data['Name'].fillna('') + ' ' + data['Category'].fillna('') + ' ' + data['Brand'].fillna('') + ' ' + data['Description'].fillna('') ) # 2. Create TF-IDF matrix and cosine similarity matrix def create_tfidf_matrix(data, feature_column='combined_features'): tfidf_vectorizer = TfidfVectorizer(stop_words='english') tfidf_matrix = tfidf_vectorizer.fit_transform(data[feature_column]) cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix) return tfidf_vectorizer, tfidf_matrix, cosine_sim tfidf_vectorizer, tfidf_matrix, cosine_sim = create_tfidf_matrix(data) # 3. Fuzzy match function for close product names def find_closest_match(name, choices, cutoff=0.6): matches = difflib.get_close_matches(name, choices, n=1, cutoff=cutoff) return matches[0] if matches else None # 5. Content-based recommendation function with fallback to keyword search def content_based_recommendations(data, item_name, cosine_sim, top_n=10): if item_name not in data['Name'].values: close_match = find_closest_match(item_name, data['Name'].tolist()) if close_match: print(f"Item '{item_name}' not found. Did you mean '{close_match}'? 
Recommending based on '{close_match}'.") item_name = close_match else: print(f"Showing top rated items matching keyword '{item_name}':") return search_top_rated_by_keyword(data, item_name, top_n) item_index = data[data['Name'] == item_name].index[0] similarity_scores = list(enumerate(cosine_sim[item_index])) similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True) top_similar = similarity_scores[1:top_n+1] # skip the item itself indices = [i[0] for i in top_similar] return data.iloc[indices][['Name', 'ReviewCount', 'Brand', 'ImageURL','ProductURL', 'Rating']] def search_top_rated_by_keyword(data, keyword, top_n=10): # Try exact substring match first (case-insensitive) filtered = data[data['combined_features'].str.contains(keyword, case=False, na=False)] if filtered.empty: # If no exact substring found, try fuzzy match on unique words in combined_features all_words = set() for text in data['combined_features']: all_words.update(text.lower().split()) close_match = find_closest_match(keyword.lower(), list(all_words)) if close_match: print(f"No exact match found for '{keyword}'. Using closest keyword '{close_match}' for search.") filtered = data[data['combined_features'].str.contains(close_match, case=False, na=False)] else: print(f"No matches found for '{keyword}' or similar keywords.") return pd.DataFrame() # empty result filtered_sorted = filtered.sort_values(by='Rating', ascending=False) return filtered_sorted[['Name', 'ReviewCount', 'Brand', 'ImageURL','ProductURL', 'Rating']].head(top_n) # Usage example: # result = content_based_recommendations(data, "lipstick", cosine_sim, top_n=10) # print(result) # In[16]: import gradio as gr import html import uuid # --- Clean image slider function --- def build_image_slider(image_urls, slider_id): total = len(image_urls) if total == 1: return f'''