Initial Commit

- .gitattributes +5 -0
- about_me.py +162 -0
- base.py +603 -0
- collaborative_item.py +211 -0
- collaborative_user.py +235 -0
- content_based.py +157 -0
- data/cache/svd_incremental_cache.joblib +3 -0
- data/grouped_tags.csv +3 -0
- data/links.csv +0 -0
- data/merger.csv +3 -0
- data/movies.csv +0 -0
- data/ratings.csv +3 -0
- data/tags.csv +3 -0
- item_based.py +49 -0
- main.py +16 -0
- movies_final.csv +3 -0
- nltk_data/corpora/stopwords.zip +3 -0
- nltk_data/corpora/stopwords/english +198 -0
- nltk_data/corpora/wordnet.zip +3 -0
- nltk_data/tokenizers/punkt.zip +3 -0
- nltk_data/tokenizers/punkt/.DS_Store +0 -0
- nltk_data/tokenizers/punkt/PY3/README +98 -0
- nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
- nltk_data/tokenizers/punkt/README +98 -0
- nltk_data/tokenizers/punkt/english.pickle +3 -0
- popularity.py +129 -0
- requirements.txt +10 -0
.gitattributes
CHANGED

```diff
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/grouped_tags.csv filter=lfs diff=lfs merge=lfs -text
+data/merger.csv filter=lfs diff=lfs merge=lfs -text
+data/ratings.csv filter=lfs diff=lfs merge=lfs -text
+data/tags.csv filter=lfs diff=lfs merge=lfs -text
+movies_final.csv filter=lfs diff=lfs merge=lfs -text
```
about_me.py
ADDED

```python
import streamlit as st

#===============================
import pandas as pd
from pathlib import Path
import os
import sys

# Define the data directory outside the function
data_dir = Path(__file__).parent / 'data'

# 1. Use st.cache_data to run this function only once
@st.cache_data
def load_ratings_data():
    chunks = []
    # Nullable integers ('Int32') let rows with missing ids survive the read
    dtype_ratings = {'userId': 'Int32', 'movieId': 'Int32', 'rating': 'float32'}

    # Add a spinner to show the user the file is loading
    with st.spinner('Loading ratings data (please bear with me)...'):
        for chunk in pd.read_csv(
            data_dir / 'ratings.csv',
            dtype=dtype_ratings,
            chunksize=1_000_000
        ):
            # Drop rows where 'userId' or 'movieId' is NaN
            chunk = chunk.dropna(subset=['userId', 'movieId'])
            chunks.append(chunk)

        # Concatenate all chunks into the final dataframe
        ratings = pd.concat(chunks, ignore_index=True)
    return ratings

# 2. Call the function to load and cache the data.
# This DataFrame is now available globally or can be passed to functions.
ratings_df = load_ratings_data()
st.session_state['ratings_df'] = ratings_df  # Use session_state to share it easily

#==============================
```
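Every other page reads this cached table back out of `st.session_state`, so a minimal sketch of the retrieval side may help; the existence check and warning below are illustrative assumptions, not part of the committed app:

```python
import streamlit as st

# A minimal sketch: later pages reuse the cached DataFrame instead of
# re-reading ratings.csv. The guard is a hypothetical fallback for when
# the entry page has not populated session_state yet.
if 'ratings_df' in st.session_state:
    ratings = st.session_state['ratings_df']
    st.text(f"Loaded {len(ratings):,} ratings from session_state")
else:
    st.warning("Open the About page first so the ratings data gets cached.")
```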
(about_me.py, continued)

```python
# Basic page styling
st.markdown(
    """
    <style>
    .stApp {
        background-color: white;
        color: #116A91;
    }
    .title {
        text-align: center;
        color: #116A91;
        font-size: 36px;
        margin-bottom: 10px;
    }
    .subtitle {
        text-align: center;
        color: #444444;
        font-size: 18px;
        margin-bottom: 30px;
    }
    .section {
        background-color: #f9f9f9;
        padding: 20px;
        margin-bottom: 20px;
        border-radius: 6px;
    }
    .header {
        color: #116A91;
        font-size: 20px;
        margin-bottom: 10px;
    }
    .content {
        color: #333333;
        font-size: 16px;
        line-height: 1.6;
    }
    </style>
    """, unsafe_allow_html=True
)

# Page Title and Subtitle
st.markdown("<div class='title'>Movie Recommender System</div>", unsafe_allow_html=True)
st.markdown("<div class='subtitle'>Discover movies you'll love using different recommendation techniques 🎬</div>", unsafe_allow_html=True)

# Creator Section
st.markdown("""
<div class='section'>
  <div class='header'>👤 About the Creator</div>
  <div class='content'>
    <strong>SJ</strong> is the creator of this Movie Recommender System.
    <br><br>
    Currently working as a <strong>Product Manager</strong>, SJ specializes in leading cross-functional teams to build and launch user-centric digital products. With a strong foundation in both business strategy and data, SJ is passionate about solving real-world problems through thoughtful product design.
    <br><br>
    Prior to becoming a Product Manager, SJ held roles as:
    <ul>
      <li><strong>Data Analyst</strong> – Experienced in SQL, Excel, Tableau, Python, Consumer Bureau data, and Microfinance analytics</li>
      <li><strong>Data Engineer</strong> – Worked with IBM DataStage, Informatica, Ataccama, and Collibra</li>
    </ul>
    This diverse background is what inspired SJ to build tools like this — combining the power of data with simple, intuitive products.
  </div>
</div>
""", unsafe_allow_html=True)

# Purpose Section
st.markdown("""
<div class='section'>
  <div class='header'>📌 Purpose of This App</div>
  <div class='content'>
    This Movie Recommender System helps users discover movies they might enjoy based on different recommendation techniques:
    <ul>
      <li>🔥 Popularity Based Filtering – Recommend trending movies</li>
      <li>🎬 Content-Based Filtering – Recommend movies similar to your favorites</li>
      <li>👥 Collaborative User-Based – Recommend movies liked by users similar to you</li>
      <li>🛍️ Collaborative Item-Based – Recommend movies similar to ones you've rated highly</li>
    </ul>
    The goal is to provide a simple, interactive way to explore movie recommendations tailored to your preferences.
  </div>
</div>
""", unsafe_allow_html=True)

# Technology Section
st.markdown("""
<div class='section'>
  <div class='header'>🛠️ Technology Used</div>
  <div class='content'>
    The application is developed using <strong>Streamlit</strong>, a lightweight and efficient Python framework for building data apps.
    <br><br>
    Key Python libraries used include <strong>Pandas</strong>, <strong>NumPy</strong>, <strong>Scikit-Learn</strong>, and <strong>Scipy</strong> for data processing and recommendation algorithms.
  </div>
</div>
""", unsafe_allow_html=True)

# Features Section
st.markdown("""
<div class='section'>
  <div class='header'>🔍 What You Can Do with It</div>
  <div class='content'>
    With this app, users can:
    <ul>
      <li>Explore movie recommendations based on popularity, content similarity, or collaborative filtering</li>
      <li>View detailed recommendations for a specific user</li>
      <li>Filter movies based on ratings and user activity</li>
      <li>Understand hidden patterns in user preferences using latent features</li>
    </ul>
    The system helps both casual viewers and movie enthusiasts find movies they are likely to enjoy.
  </div>
</div>
""", unsafe_allow_html=True)

# Footer / Call-to-action
st.markdown("""
<div class='section'>
  <div class='header'>🎯 Ready to Discover Movies?</div>
  <div class='content'>
    Choose a recommendation technique from the sidebar and start exploring movies tailored to your tastes.
    <br><br>
    Enjoy discovering new favorites!
  </div>
</div>
""", unsafe_allow_html=True)
```
base.py
ADDED

```python
import pandas as pd
import numpy as np
import streamlit as st
import sys
import os
import time
from pathlib import Path
import io
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)

def download_nltk_resources():
    resources = ['wordnet', 'stopwords', 'punkt']
    for res in resources:
        try:
            nltk.data.find(res)
        except LookupError:
            try:
                nltk.download(res, download_dir=nltk_data_dir)
            except Exception as e:
                st.error(f"Error downloading {res}: {e}")
```
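One caveat worth noting: `nltk.data.find()` expects a category-qualified path such as `'corpora/stopwords'`, so probing with the bare names above falls through to the download branch on every run. A variant sketch with qualified paths (an adjustment for illustration, not the committed code):

```python
import nltk

# Hypothetical variant of download_nltk_resources: look up each resource by
# its category-qualified path so an already-installed copy is actually found.
RESOURCE_PATHS = {
    'wordnet': 'corpora/wordnet',
    'stopwords': 'corpora/stopwords',
    'punkt': 'tokenizers/punkt',
}

def ensure_nltk_resources(download_dir="nltk_data"):
    for name, path in RESOURCE_PATHS.items():
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(name, download_dir=download_dir)
```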
(base.py, continued)

```python
# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("Data Preprocessing")
st.markdown('---')

# ====================== Importing the datasets ===========================
sys.path.append(os.path.dirname(__file__))
data_dir = Path(__file__).parent / 'data'

# links dataset
links = data_dir / 'links.csv'
links = pd.read_csv(links)

# tags dataset
tags = data_dir / 'tags.csv'
tags = pd.read_csv(tags)

# movies dataset
movies = data_dir / 'movies.csv'
movies = pd.read_csv(movies)

# ratings dataset
# chunks = []
# dtype_ratings = {'userId': 'Int32', 'movieId': 'Int32', 'rating': 'float32'}

# for chunk in pd.read_csv(data_dir / 'ratings.csv', dtype=dtype_ratings, chunksize=1_000_000):
#     chunk = chunk.dropna(subset=['userId', 'movieId'])
#     chunks.append(chunk)

# # Concatenate all chunks into the final dataframe
# ratings = pd.concat(chunks, ignore_index=True)

ratings = st.session_state['ratings_df']

# ratings = ratings[ratings['movieId'].isin(movies['movieId'])]
# ratings = ratings.drop(columns=['timestamp'], errors='ignore')
# loc = data_dir / 'ratings.csv'
# ratings.to_csv(loc, index=False)

#============================ Descriptions of the datasets ===========================
st.subheader("ℹ️ MovieLens Datasets")

st.markdown("""
The **MovieLens dataset** is a benchmark dataset widely used for building and testing movie recommendation systems.
Below is a brief overview of the four main data files included:
""")

# --- Row 1: Movies and Tags ---
row1_col1, row1_col2 = st.columns(2)

with row1_col1:
    st.subheader("🎬 movies.csv")
    st.markdown("""
Contains metadata about each movie.
**Columns:**
- 'movieId': Unique identifier for each movie
- 'title': Movie title (usually includes release year)
- 'genres': Pipe-separated list of genres (e.g., *Action|Comedy|Drama*)
""")

with row1_col2:
    st.subheader("🏷️ tags.csv")
    st.markdown("""
Contains user-generated movie tags.
**Columns:**
- 'userId': User who applied the tag
- 'movieId': Tagged movie
- 'tag': User-generated keyword or phrase
- 'timestamp': When the tag was added (UNIX time)
""")

# --- Row 2: Ratings and Links ---
row2_col1, row2_col2 = st.columns(2)

with row2_col1:
    st.subheader("⭐ ratings.csv")
    st.markdown("""
Contains user ratings for movies.
**Columns:**
- 'userId': Unique identifier for each user
- 'movieId': Rated movie
- 'rating': Rating given (typically from 0.5 to 5.0)
- 'timestamp': When the rating was recorded (UNIX time)
""")
    st.markdown('*Please note that the "ratings" file is quite large, and I have already performed some preprocessing on it. In this section, I will only be demonstrating the steps involved in the preprocessing process.*')

with row2_col2:
    st.subheader("🔗 links.csv")
    st.markdown("""
Maps MovieLens movies to external movie databases.
**Columns:**
- 'movieId': Unique identifier for each movie
- 'imdbId': IMDb movie ID (for cross-referencing)
- 'tmdbId': TMDb movie ID (for fetching posters or metadata)
""")

st.markdown('---')

# ========================= Displaying the dataset ===================================
st.subheader('📊 Sample Datasets Preview')
col1, col2 = st.columns(2)
with col1:
    st.subheader('Movies dataset')
    st.text(f"Shape : {movies.shape}")
    st.dataframe(movies.head())
with col2:
    st.subheader('Tags dataset')
    st.text(f"Shape : {tags.shape}")
    st.dataframe(tags.head())
col1, col2 = st.columns(2)
with col1:
    st.subheader('Ratings dataset')
    st.text(f"Shape : {ratings.shape}")
    st.dataframe(ratings.head())
with col2:
    st.subheader('Links dataset')
    st.text(f"Shape : {links.shape}")
    st.dataframe(links.head())

st.markdown('---')

# ====================================================== Preprocessing ================================================================

# ============================= Dropping columns that are not required =============================

st.subheader('📉 Dropping the columns that are not required')
st.text("Dropped 'timestamp' and 'tmdbId' columns from the Ratings and Links datasets")

links = links.drop(columns=['tmdbId'])

saved_ratings_movie_unique = ratings['movieId'].nunique()
saved_ratings_duplicates = ratings[ratings.duplicated()].shape[0]

st.text('After Removal')
col1, col2 = st.columns(2)
with col1:
    st.subheader('Ratings dataset')
    st.text(f"Shape : {ratings.shape}")
    st.dataframe(ratings.head())
with col2:
    st.subheader('Links dataset')
    st.text(f"Shape : {links.shape}")
    st.dataframe(links.head())
st.markdown('---')

# ============================= Information about the dataset =============================

st.subheader('ℹ️ Dataset Information')
col1, col2 = st.columns(2)
with col1:
    st.text('Movies')
    movies_info = pd.DataFrame({
        "Column Name": movies.columns,
        "Non-Null Count": movies.notnull().sum().values,
        "Data Type": movies.dtypes.values
    })
    st.dataframe(movies_info)

    st.text('Ratings')
    ratings_info = pd.DataFrame({
        "Column Name": ratings.columns,
        "Non-Null Count": ratings.notnull().sum().values,
        "Data Type": ratings.dtypes.values
    })
    st.dataframe(ratings_info)

with col2:
    st.text('Tags')
    tags_info = pd.DataFrame({
        "Column Name": tags.columns,
        "Non-Null Count": tags.notnull().sum().values,
        "Data Type": tags.dtypes.values
    })
    st.dataframe(tags_info)

    st.text('Links')
    links_info = pd.DataFrame({
        "Column Name": links.columns,
        "Non-Null Count": links.notnull().sum().values,
        "Data Type": links.dtypes.values
    })
    st.dataframe(links_info)

st.markdown('---')

del(ratings)

# ============================= Checking for duplicates =============================
st.subheader('🧐 Checking for Duplicates')
col1, col2, col3, col4 = st.columns(4)
with col1:
    st.text('Movies')
    st.text(f"# Duplicates found : {movies[movies.duplicated()].shape[0]}")
with col2:
    st.text('Ratings')
    st.text(f"# Duplicates found : {saved_ratings_duplicates}")
with col3:
    st.text('Tags')
    st.text(f"# Duplicates found : {tags[tags.duplicated()].shape[0]}")
    st.table(tags[tags.duplicated()])
with col4:
    st.text('Links')
    st.text(f"# Duplicates found : {links[links.duplicated()].shape[0]}")

st.markdown("*Removing duplicate rows detected in the dataset.*")

st.markdown("---")

# ============================= Unique counts =============================

st.subheader('🔍 Checking for consistency across Dataframes')
st.text(f"Unique movies in Movies Dataset : {movies['movieId'].nunique()}")
st.text(f"Unique movies in Tags Dataset : {tags['movieId'].nunique()}")
st.text(f"Unique movies in Ratings Dataset : {saved_ratings_movie_unique}")
st.text(f"Unique movies in Links Dataset : {links['movieId'].nunique()}")
st.text(f"Unique imdbId in Links Dataset : {links['imdbId'].nunique()}")

st.markdown("*The Tags dataset has tags missing for most of the movies*")
st.markdown("---")

# ============================= Grouping the tags data ====================================

st.subheader('Merging Tags and Links Datasets')
st.markdown("""
- Grouping the Tags dataset based on 'movieId' and merging with the Movies dataset
- Merging the Links dataset with the Movies dataset based on 'imdbId'
""")

# grouping tags data
grouped_tags = tags.groupby('movieId')['tag'].agg(list).reset_index()

del(tags)

# Merging the datasets
movies_tag = pd.merge(movies, grouped_tags, how='left', on='movieId')
movies_tag = pd.merge(movies_tag, links, how='left', on='movieId')

# movies_tag.to_csv('tags_check.csv')

st.dataframe(movies_tag.head())

del(links)
del(movies)

st.markdown('---')
```
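To make the groupby-and-merge step concrete, a toy demo with made-up rows (not the real MovieLens data):

```python
import pandas as pd

# Toy demo of the tag-grouping and left-merge steps above (made-up rows).
tags = pd.DataFrame({'movieId': [1, 1, 2], 'tag': ['funny', 'classic', 'dark']})
movies = pd.DataFrame({'movieId': [1, 2, 3], 'title': ['A', 'B', 'C']})

grouped_tags = tags.groupby('movieId')['tag'].agg(list).reset_index()
print(pd.merge(movies, grouped_tags, how='left', on='movieId'))
# movieId 1 -> [funny, classic]; movieId 2 -> [dark]; movieId 3 -> NaN (no tags)
```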
(base.py, continued)

```python
# ============================= Extracting Year and Title =============================

# st.text('Extracting the Year from the Title of the movies')

import re
def extract_year_and_clean_title(title):
    # Use regex to find all years in parentheses and extract the last one
    years = re.findall(r'\((\d{4})\)', title)

    if years:
        # The last year is the one we want
        year = int(years[-1])
        # Remove the last year (and parentheses) from the title
        cleaned_title = title.replace(f"({years[-1]})", "").strip()
        # Optionally, remove any extra parentheses and clean up
        cleaned_title = re.sub(r'\(.*\)', '', cleaned_title).strip()
        return year, cleaned_title
    return 0, title  # Return 0 as the year if no year is found


# Apply the function to the 'title' column to populate the 'year' and cleaned 'title' columns
movies_tag[['year', 'title']] = movies_tag['title'].apply(lambda x: pd.Series(extract_year_and_clean_title(x)))


# ============================= Removing the data after the parenthesis, trimming the title =============================

def bracket_title(title):
    sp_title = title.split(' (')
    return sp_title[0]

movies_tag['bracket_title'] = movies_tag['title'].apply(bracket_title)


def imp_title(title):
    sp_title = title.split(',')
    return len(sp_title)

movies_tag['imp_title'] = movies_tag['bracket_title'].apply(imp_title)


def rearrange_last_word(text):
    # Split by comma, preserving the commas
    parts = text.split(', ')

    # If there is more than one part, rearrange
    if len(parts) > 1:
        last_word = parts[-1]        # Get the last part after the last comma
        remaining_parts = parts[:-1]  # Get the rest of the parts
        # Move the last part to the front and join the rest with commas
        rearranged_text = f'{last_word} ' + ', '.join(remaining_parts)
    else:
        rearranged_text = text  # If there's only one part, return as is

    return rearranged_text


movies_tag['clean_title'] = movies_tag['bracket_title'].apply(rearrange_last_word)


# st.text(movies_tag['year'].unique())

movies_tag['year'] = movies_tag['year'].astype(int)

movies_tag = movies_tag[movies_tag['imp_title'].isin([1, 2])]

movies_tag['genres'] = movies_tag['genres'].str.replace('|', ',')
import ast
def to_list_conv(row):
    return row.split(',')


movies_tag['genres'] = movies_tag['genres'].apply(to_list_conv)

st.subheader('✅ After cleaning & merging the datasets')
st.markdown("""
- Extracting the Year from the Title of the movies
- Converting the Year column to integer data type
- Replacing non-integer values in the Year column with 0
- Converting the Genres column to a list after replacing the '|' separator
""")

st.dataframe(movies_tag[['movieId', 'clean_title', 'year', 'genres', 'tag']].head(100))

st.markdown("""- 📌 Number of movies with missing tags: {0}""".format(movies_tag['tag'].isna().sum()))


st.markdown('---')
```
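A worked example of the title-cleaning helpers above, run on two hypothetical MovieLens-style titles (self-contained copies of the relevant logic):

```python
import re

def extract_year_and_clean_title(title):
    # Same logic as above: the last '(YYYY)' is the year; any remaining
    # parenthesised alternative title is dropped.
    years = re.findall(r'\((\d{4})\)', title)
    if years:
        cleaned = title.replace(f"({years[-1]})", "").strip()
        return int(years[-1]), re.sub(r'\(.*\)', '', cleaned).strip()
    return 0, title

def rearrange_last_word(text):
    # 'Shawshank Redemption, The' -> 'The Shawshank Redemption'
    parts = text.split(', ')
    return f'{parts[-1]} ' + ', '.join(parts[:-1]) if len(parts) > 1 else text

for t in ["Shawshank Redemption, The (1994)",
          "City of Lost Children, The (Cité des enfants perdus, La) (1995)"]:
    year, cleaned = extract_year_and_clean_title(t)
    print(year, '->', rearrange_last_word(cleaned))
# 1994 -> The Shawshank Redemption
# 1995 -> The City of Lost Children
```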
(base.py, continued)

```python
# ============================================ Plot Dataset extraction =========================================

st.subheader("🎬 Extracting Movie Details from IMDb")
st.text("Extracting the Plot, Year & Genres of the movies from the 'imdbinfo' package with the help of the 'imdbId' column in the Links dataset")

movies_merge = data_dir / 'merger.csv'
movies_merge = pd.read_csv(movies_merge)
st.dataframe(movies_merge[['title', 'year', 'plot', 'genres']])

st.text(movies_merge.shape)

# ============================================ Plot Dataset Check =========================================

st.markdown('- **Checking for NULL values in the dataset (a NULL means no match was found during extraction)**')
col1, col2 = st.columns(2)
with col1:
    st.text('# Empty fields')
    st.dataframe(movies_merge[['title', 'year', 'plot', 'genres']].isna().sum())
with col2:
    st.text('After removing the rows with an empty title - no match found')
    movies_merge = movies_merge.dropna(subset=['title'])
    st.dataframe(movies_merge[['title', 'year', 'plot', 'genres']].isna().sum())


st.markdown("- **As in the Movies dataset, converting the Year column to integer and replacing NULLs with 0**")
# st.text(movies_merge['year'].unique())
movies_merge['year'] = movies_merge['year'].fillna(0)
movies_merge['year'] = movies_merge['year'].astype(int)

movies_merge['title'] = movies_merge['title'].str.strip()


# - Null IMDb Ids (no match found) : {movies_merge['imdbId'].isna().sum()}
st.markdown(f"""
- **Checking for duplicates in the extracted file, keeping the NULLs aside**
    - Duplicate Title : {movies_merge['title'].dropna().duplicated().sum()}
    - Duplicate Title + Year : {movies_merge.dropna(subset=['title', 'year']).duplicated(['title', 'year']).sum()}
    - Duplicate Title + Year + Plot : {movies_merge.dropna(subset=['title', 'year', 'plot']).duplicated(['title', 'year', 'plot']).sum()}
    - Duplicate Plot : {movies_merge['plot'].dropna().duplicated().sum()}
""")


st.markdown('- **Checking why there are duplicates present in the title column**')
st.dataframe(movies_merge[movies_merge[['title', 'year']].duplicated(keep=False)].sort_values('title').reset_index(drop=True))
st.text('Titles are duplicated either because of a difference in year or because of multiple descriptions for the plot field')

st.markdown("- **Removing the duplicates based on the Title and Year columns**")


movies_merge = movies_merge.drop_duplicates(subset=['title', 'year'], keep='first')

st.markdown(f"""
- **Checking for duplicates in the extracted file, keeping the NULLs aside**
    - Duplicate Title : {movies_merge['title'].dropna().duplicated().sum()}
    - Duplicate Title + Year : {movies_merge.dropna(subset=['title', 'year']).duplicated(['title', 'year']).sum()}
    - Duplicate Title + Year + Plot : {movies_merge.dropna(subset=['title', 'year', 'plot']).duplicated(['title', 'year', 'plot']).sum()}
    - Duplicate Plot : {movies_merge['plot'].dropna().duplicated().sum()}
""")

st.markdown('- **Checking why there are duplicates present in the plot column apart from NULL values**')
st.dataframe(movies_merge[movies_merge['plot'].notna() & movies_merge['plot'].duplicated(keep=False)].sort_values('plot').reset_index(drop=True))


# note: NaN plots compare equal in duplicated(), so rows with missing plots are dropped here too
movies_merge = movies_merge[~movies_merge['plot'].duplicated(keep=False)]
st.markdown("- **Removing the duplicates based on the Plot column, as we don't know which movie the plot is correct for**")
# movies_merge.drop_duplicates(subset='plot', keep='first', inplace=True)


st.markdown(f"""
- **Checking for duplicates in the extracted file, keeping the NULLs aside**
    - Duplicate Title : {movies_merge['title'].dropna().duplicated().sum()}
    - Duplicate Title + Year : {movies_merge.dropna(subset=['title', 'year']).duplicated(['title', 'year']).sum()}
    - Duplicate Title + Year + Plot : {movies_merge.dropna(subset=['title', 'year', 'plot']).duplicated(['title', 'year', 'plot']).sum()}
    - Duplicate Plot : {movies_merge['plot'].dropna().duplicated().sum()}
""")




st.markdown('---')
```
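The difference between `keep='first'` and `keep=False` above is easy to miss; a toy illustration with made-up rows:

```python
import pandas as pd

# Toy illustration of the two de-duplication strategies above (made-up rows).
df = pd.DataFrame({'title': ['X', 'X', 'Y', 'Z'],
                   'plot':  ['p1', 'p1', 'p1', 'p2']})

# keep='first' retains one representative per duplicated title.
print(df.drop_duplicates(subset=['title'], keep='first'))   # rows X, Y, Z

# ~duplicated(keep=False) drops *every* row whose plot repeats, which is why
# movies with ambiguous plots are removed entirely rather than arbitrarily kept.
print(df[~df['plot'].duplicated(keep=False)])               # only row Z
```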
(base.py, continued)

```python
# ========================================= Merging with the main dataset =================================

st.subheader('🧠 Merging the Dataset for the Plot and Genres columns')
st.text('Joined on the Title and Year columns')

# movies_tag.to_csv('chek_movies_tag.csv')
# movies_merge.to_csv('chek_movies_merge.csv')

movies_tag = pd.merge(movies_tag, movies_merge, how='left', left_on=['clean_title', 'year'], right_on=['title', 'year'])
movies_tag = movies_tag.reset_index(drop=True)

# movies_tag.to_csv('chek_movies_tag_after_join.csv')

# ========================================= Converting to meaningful tags =================================

st.subheader("Cleaning and Merging Movie Tags & Genres")
st.markdown("""
We are performing the following preprocessing steps on the dataset:

- **Standardizing genres and tags**
    - Convert all entries in the 'genres_x', 'genres_y', and 'tag' columns to lowercase.
    - Ensure only valid string items are kept and handle empty or missing lists.
""")

movies_tag['genres_x'] = movies_tag['genres_x'].apply(
    lambda x: [item.lower() for item in x if isinstance(item, str)] if isinstance(x, list) and x else []
)
movies_tag['tag'] = movies_tag['tag'].apply(
    lambda x: [item.lower() for item in x if isinstance(item, str)] if isinstance(x, list) and x else []
)

st.markdown("""
- **Converting string representations to lists for 'genres_y'**
    - Some entries might be stored as strings, so we safely convert them to Python lists.
    - Lowercase all valid genre entries.
""")
def to_list_conv(x):
    # Redefines the earlier helper: here the input is a stringified list,
    # so it is parsed safely with ast.literal_eval.
    if pd.isna(x):
        return []
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return []

movies_tag['genres_y'] = movies_tag['genres_y'].apply(to_list_conv)
movies_tag['genres_y'] = movies_tag['genres_y'].apply(
    lambda x: [item.lower() for item in x if isinstance(item, str)] if isinstance(x, list) and x else []
)

st.markdown("""
- **Combining all genres into a single column**
    - Merge 'genres_x' and 'genres_y' into a unified 'genres' column.
    - Remove duplicate genres per movie.
    - Drop the original 'genres_x' and 'genres_y' columns.
""")
movies_tag['genres'] = movies_tag.apply(lambda row: list(set(row['genres_x'] + row['genres_y'])), axis=1)
movies_tag['tag'] = movies_tag.apply(lambda row: list(set(row['tag'])), axis=1)
movies_tag = movies_tag.drop(columns=['genres_x', 'genres_y'])

st.subheader("Preprocessing Movie Plots")
st.markdown("""
- Clean the 'plot' text by:
    1. Lowercasing all words
    2. Removing punctuation
    3. Tokenizing into words
    4. Removing stopwords
    5. Lemmatizing words
""")
lemmatizer = WordNetLemmatizer()
def preprocess_sentence(sentence):
    if not isinstance(sentence, str):
        return []
    stop_words = set(stopwords.words('english'))
    sentence = re.sub(r'[^\w\s]', '', sentence)
    sentence = sentence.lower()
    tokens = word_tokenize(sentence)
    tokens = [word for word in tokens if word not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

movies_tag['plot'] = movies_tag['plot'].apply(preprocess_sentence)
```
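A quick standalone run of the same plot-cleaning pipeline on a made-up sentence (the NLTK resources downloaded earlier are required):

```python
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Toy run of the plot-cleaning steps above on a made-up sentence.
sentence = "Two imprisoned men bond over the years, finding solace."
sentence = re.sub(r'[^\w\s]', '', sentence).lower()   # strip punctuation, lowercase
stop_words = set(stopwords.words('english'))
tokens = [w for w in word_tokenize(sentence) if w not in stop_words]
print([WordNetLemmatizer().lemmatize(w) for w in tokens])
# e.g. ['two', 'imprisoned', 'man', 'bond', 'year', 'finding', 'solace']
```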
(base.py, continued)

```python
st.subheader("Cleaning Columns and Removing Duplicates")
st.markdown("""
- Drop unnecessary columns from the merged dataset.
- Rename 'clean_title' to 'title'.
- Reorder columns for clarity: 'movieId', 'title', 'year', 'tag', 'genres', 'plot'.
""")
movies_tag = movies_tag.drop(columns=['title_x', 'imdbId', 'title_y', 'bracket_title', 'imp_title'])
movies_tag.rename(columns={'clean_title': 'title'}, inplace=True)
# movies_tag.drop_duplicates(subset='title', keep='first', inplace=True)

# movies_tag = movies_tag.sort_values(by=['title', 'year'], ascending=[True, False])
# movies_tag = movies_tag.drop_duplicates(subset='title', keep='first')

movies_tag = movies_tag[['movieId', 'title', 'year', 'tag', 'genres', 'plot']]

st.markdown('---')

# ============================== Displaying final movies dataset =====================================
item_to_remove = '(no genres listed)'
movies_tag['genres'] = movies_tag['genres'].apply(lambda x: [item for item in x if item != item_to_remove])


st.subheader("🎬 Movies Dataset After Merging and Cleaning")
st.dataframe(movies_tag)

st.markdown(f"**Total unique movies after filtering:** {movies_tag['movieId'].nunique()}")


# ================================================ Removing duplicates =============================================================


def check_empty_lists(row):
    return isinstance(row['tag'], list) and len(row['tag']) == 0 and \
           isinstance(row['genres'], list) and len(row['genres']) == 0 and \
           isinstance(row['plot'], list) and len(row['plot']) == 0

# Step 1: Count rows where all three columns ('tag', 'genres', 'plot') are empty lists
empty_rows_count = movies_tag[movies_tag.apply(check_empty_lists, axis=1)].shape[0]
st.markdown(f"**Number of rows where 'tag', 'genres', and 'plot' are empty lists:** {empty_rows_count}")

# Step 2: Drop rows where all three columns are empty lists
movies_tag_cleaned = movies_tag[~movies_tag.apply(check_empty_lists, axis=1)]


#================ Extracting the ratings dataset =============================

movies_tag_cleaned = movies_tag_cleaned.drop_duplicates(subset=['title', 'year'], keep='first')

# ratings = ratings[ratings['movieId'].isin(movies_tag_cleaned['movieId'])]
# ratings = ratings.drop_duplicates()
# ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='last')

# ratings.to_csv(loc, index=False)

# Correct way to save without creating Unnamed columns

# del(ratings)

st.subheader('Dropping the rows with empty Tag, Genres and Plot')
# movies_tag_cleaned.to_csv('movies_final.csv', index=False)

st.markdown(f"**Total unique movies after filtering:** {movies_tag_cleaned['movieId'].nunique()}")

st.markdown('---')

st.subheader("🎬 Final Movies Dataset")
st.dataframe(movies_tag_cleaned)
```
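The empty-row filter used in base.py above can be condensed; a simplified sketch that relies on list truthiness instead of the explicit isinstance/len checks (this assumes all three cells actually hold lists, which the earlier cleaning guarantees):

```python
import pandas as pd

# Simplified toy version of check_empty_lists: keep a row if *any* of the
# three list columns has content (made-up rows).
df = pd.DataFrame({
    'tag':    [[], ['funny']],
    'genres': [[], ['comedy']],
    'plot':   [[], ['laugh']],
})
all_empty = df.apply(lambda r: not (r['tag'] or r['genres'] or r['plot']), axis=1)
print(df[~all_empty])  # keeps only the second row
```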
collaborative_item.py
ADDED

```python
import pandas as pd
import numpy as np
import streamlit as st
from pathlib import Path
import sys
import os
import time
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import normalize
import joblib

# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("🎬 Movie Recommender System Item-Based")

# ====================== Paths ======================
sys.path.append(os.path.dirname(__file__))
data_dir = Path(__file__).parent / 'data'
cache_dir = data_dir / 'cache'
cache_dir.mkdir(exist_ok=True)

movies_path = data_dir / 'movies.csv'
ratings_path = data_dir / 'ratings.csv'

# ====================== Load Data ======================
@st.cache_data
def load_data():
    movies = pd.read_csv(movies_path)
    ratings = st.session_state['ratings_df']
    ratings = ratings.drop(columns=['timestamp'], errors='ignore')
    ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
    ratings['userId'] = ratings['userId'].astype(int)
    ratings['movieId'] = ratings['movieId'].astype(int)
    # Keep only movies present in movies.csv
    ratings = ratings[ratings['movieId'].isin(movies['movieId'].unique().tolist())]
    return movies, ratings

movies, ratings = load_data()
st.text(f"Initial ratings shape: {ratings.shape}")

# ====================== Data Filtering ======================

user_counts = ratings['userId'].value_counts()
active_users = user_counts[user_counts >= 450].index
ratings_filtered = ratings[ratings['userId'].isin(active_users)]

movie_counts = ratings_filtered['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 450].index
ratings_filtered = ratings_filtered[ratings_filtered['movieId'].isin(popular_movies)]
ratings = ratings_filtered.copy()
del ratings_filtered

st.text(f"Filtered ratings shape: {ratings.shape}")

# =================== Summary =================================
st.markdown("#### 🎯 Dataset Filtering Summary")
st.markdown("""
We filtered the dataset in **two steps** to focus on active users and popular movies:

1. **Active Users**
   - Kept users who have rated **at least 450 movies**.
   - Ensures we focus on users who are actively engaged.

2. **Popular Movies**
   - Kept movies that have **at least 450 ratings**.
   - Ensures we focus on movies with enough feedback to be meaningful.

**Result:**
The final dataset contains ratings from active users on popular movies, improving the quality of recommendations and decreasing the size of the dataset.
""")
```
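The two-step activity filter can be seen end to end on a toy frame (threshold lowered from 450 to 2 for illustration):

```python
import pandas as pd

# Toy version of the two-step filter above, with the threshold lowered to 2.
r = pd.DataFrame({'userId':  [1, 1, 2, 3, 3, 3],
                  'movieId': [10, 11, 10, 10, 11, 12]})

user_counts = r['userId'].value_counts()
r = r[r['userId'].isin(user_counts[user_counts >= 2].index)]    # user 2 dropped

movie_counts = r['movieId'].value_counts()
print(r[r['movieId'].isin(movie_counts[movie_counts >= 2].index)])
# movie 12 dropped: rated only once among the remaining active users
```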
(collaborative_item.py, continued)

```python
# ====================== Index Mapping ======================
ratings['user_idx'] = ratings['userId'].astype('category').cat.codes
ratings['movie_idx'] = ratings['movieId'].astype('category').cat.codes

num_users = ratings['user_idx'].nunique()
num_movies = ratings['movie_idx'].nunique()

user_idx_to_id = dict(enumerate(ratings['userId'].astype('category').cat.categories))
movie_idx_to_id = dict(enumerate(ratings['movieId'].astype('category').cat.categories))

# ====================== Sparse Matrix ======================
user_item_matrix = coo_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
    shape=(num_users, num_movies)
).tocsr()
```
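A toy version of this matrix construction and of the latent-space similarity it feeds later in the file (3 users, 4 movies, made-up ratings):

```python
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.preprocessing import normalize

# Toy user-item matrix: 3 users x 4 movies, built exactly like the one above.
user_idx  = np.array([0, 0, 1, 1, 2, 2])
movie_idx = np.array([0, 1, 0, 2, 1, 3])
rating    = np.array([5.0, 3.0, 4.0, 2.0, 3.5, 4.5])
ui = coo_matrix((rating, (user_idx, movie_idx)), shape=(3, 4)).tocsr()

# With unit-normalised rows, a plain dot product is cosine similarity; the
# recommendation function below applies the same trick to the latent factors U.
rows = normalize(ui.toarray())
print(rows @ rows[0])  # similarity of every user to user 0
```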
(collaborative_item.py, continued)

```python
# ====================== Incremental SVD Caching ======================

SVD_CACHE_PATH = cache_dir / "svd_incremental_cache.joblib"

def compute_incremental_svd(matrix: csr_matrix, n_components=100, batch_size=1000):
    if SVD_CACHE_PATH.exists():
        # st.text("✅ Loading cached SVD decomposition...")
        U, Sigma, VT = joblib.load(SVD_CACHE_PATH)
        return U, Sigma, VT

    st.text("⚙️ Computing Incremental SVD... (may take a few minutes)")
    ipca = IncrementalPCA(n_components=n_components)
    n_samples = matrix.shape[0]

    # Fit in chunks
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        ipca.partial_fit(batch)
        st.text(f"Processed rows {start_idx} to {end_idx}")

    # Transform full data
    U = []
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        U.append(ipca.transform(batch))
    U = np.vstack(U)

    VT = ipca.components_
    Sigma = np.ones(ipca.n_components_)

    joblib.dump((U, Sigma, VT), SVD_CACHE_PATH)
    st.text("✅ SVD computation cached successfully.")
    return U, Sigma, VT

U, Sigma, VT = compute_incremental_svd(user_item_matrix, n_components=100, batch_size=100000)
U_normalized = normalize(U)

# ====================== Recommendation Function ======================
def recommend_for_user_svd(user_id, top_n=5, n_similar_users=30):
    start = time.time()
    if user_id not in ratings['userId'].values:
        return "User not found.", 0

    user_idx = ratings.loc[ratings['userId'] == user_id, 'user_idx'].iloc[0]

    # Cosine similarity in latent space
    user_vector = U_normalized[user_idx]
    similarities = np.dot(U_normalized, user_vector)

    # Top similar users
    similar_user_indices = similarities.argsort()[::-1][1:n_similar_users+1]
    similar_scores = similarities[similar_user_indices]

    # Movies already rated
    user_rated_movie_indices = user_item_matrix.getrow(user_idx).nonzero()[1]
    user_rated_movie_set = set(user_rated_movie_indices)

    # Aggregate weighted ratings
    scores = {}
    for sim_user_idx, sim_score in zip(similar_user_indices, similar_scores):
        sim_user_ratings = user_item_matrix.getrow(sim_user_idx)
        movie_indices = sim_user_ratings.nonzero()[1]
        sim_ratings = sim_user_ratings.data
        for m_idx, rating in zip(movie_indices, sim_ratings):
            if m_idx not in user_rated_movie_set:
                scores[m_idx] = scores.get(m_idx, 0) + sim_score * rating

    # Sort and recommend
    recommended_movie_indices = sorted(scores, key=scores.get, reverse=True)[:top_n]
    recommended_movie_ids = [movie_idx_to_id[idx] for idx in recommended_movie_indices]

    end = time.time()
    recommendations = movies[movies['movieId'].isin(recommended_movie_ids)][['title', 'genres']]
    return recommendations, end - start

# ====================== Streamlit UI ======================

user_id_input = st.number_input(
    "Enter User ID",
    min_value=int(ratings['user_idx'].min()),
    max_value=int(ratings['user_idx'].max()),
    step=1
)

user_id_input = ratings[ratings['user_idx'] == user_id_input]['userId'].unique()[0]


duration = 0
if st.button("🎥 Recommend Movies"):
    col1, col2 = st.columns(2)
    with col1:
        st.text('User Activity')
        current_user = ratings[ratings['userId'] == user_id_input]
        merge_results = pd.merge(current_user, movies, how='left', on='movieId')
        merge_results = merge_results[['title', 'rating']]
        st.dataframe(merge_results)
    with col2:
        st.text('Recommended Movies')
        recs, duration = recommend_for_user_svd(user_id_input, top_n=5)
        st.dataframe(recs['title'])
        st.text(f"Computed in {duration:.2f} seconds.")


# ====================== Explanation of CF ======================
# st.markdown("""
# ## 🤝 Collaborative Filtering Types

# **User-Based CF**:
# - Focus on users similar to you
# - Recommend movies liked by similar users

# **Item-Based CF**:
# - Focus on movies similar to ones you liked
# - Recommend similar movies

# Example:
# - You liked *Inception* and *Matrix*
# - User-Based: "People like you also liked *Avengers*"
# - Item-Based: "Since you liked *Inception*, try *Interstellar*"
# """)
```
collaborative_user.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
import time
|
| 8 |
+
from scipy.sparse import coo_matrix, csr_matrix
|
| 9 |
+
from sklearn.decomposition import TruncatedSVD, IncrementalPCA
|
| 10 |
+
from sklearn.preprocessing import normalize
|
| 11 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 12 |
+
import joblib
|
| 13 |
+
|
| 14 |
+
# ====================== Streamlit Setup ======================
|
| 15 |
+
st.set_page_config(layout="wide")
|
| 16 |
+
st.title("🎬 Movie Recommender System User-Based")
|
| 17 |
+
|
| 18 |
+
# ====================== Paths ======================
|
| 19 |
+
sys.path.append(os.path.dirname(__file__))
|
| 20 |
+
data_dir = Path(__file__).parent / 'data'
|
| 21 |
+
cache_dir = data_dir / 'cache'
|
| 22 |
+
cache_dir.mkdir(exist_ok=True)
|
| 23 |
+
|
| 24 |
+
movies_path = 'movies_final.csv'
|
| 25 |
+
ratings_path = data_dir / 'ratings.csv'
|
| 26 |
+
|
| 27 |
+
# ====================== Load Data ======================
|
| 28 |
+
|
| 29 |
+
# used to **cache the output of a function that loads or processes data** so that it doesn’t get recomputed every time the app reruns.
|
| 30 |
+
# This is especially useful when loading large datasets or performing expensive computations.Since our data is huge
|
| 31 |
+
|
| 32 |
+
@st.cache_data
|
| 33 |
+
def load_data():
|
| 34 |
+
movies = pd.read_csv(movies_path)
|
| 35 |
+
ratings = st.session_state['ratings_df']
|
| 36 |
+
# ratings = ratings.drop(columns=['timestamp'], errors='ignore')
|
| 37 |
+
ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
|
| 38 |
+
ratings['userId'] = ratings['userId'].astype(int)
|
| 39 |
+
ratings['movieId'] = ratings['movieId'].astype(int)
|
| 40 |
+
return movies, ratings
|
| 41 |
+
|
| 42 |
+
movies, ratings = load_data()
|
| 43 |
+
|
| 44 |
+
ratings = ratings[ratings['movieId'].isin(movies['movieId'].unique().tolist())]
|
| 45 |
+
|
| 46 |
+
st.text(f"Initial ratings shape: {ratings.shape}")
|
| 47 |
+
|
| 48 |
+
# ====================== Data Filtering ======================
|
| 49 |
+
user_counts = ratings['userId'].value_counts()
|
| 50 |
+
active_users = user_counts[user_counts >= 450].index
|
| 51 |
+
ratings_filtered = ratings[ratings['userId'].isin(active_users)]
|
| 52 |
+
|
| 53 |
+
movie_counts = ratings_filtered['movieId'].value_counts()
|
| 54 |
+
popular_movies = movie_counts[movie_counts >= 450].index
|
| 55 |
+
ratings_filtered = ratings_filtered[ratings_filtered['movieId'].isin(popular_movies)]
|
| 56 |
+
ratings = ratings_filtered.copy()
|
| 57 |
+
del ratings_filtered
|
| 58 |
+
|
| 59 |
+
st.text(f"Filtered ratings shape: {ratings.shape}")
|
| 60 |
+
|
| 61 |
+
st.markdown("#### 🎯 Dataset Filtering Summary")
|
| 62 |
+
st.markdown("""
|
| 63 |
+
We filtered the dataset in **two steps** to focus on active users and popular movies:
|
| 64 |
+
|
| 65 |
+
1. **Active Users**
|
| 66 |
+
- Kept users who have rated **at least 450 movies**.
|
| 67 |
+
- Ensures we focus on users who are actively engaged.
|
| 68 |
+
|
| 69 |
+
2. **Popular Movies**
|
| 70 |
+
- Kept movies that have **at least 450 ratings**.
|
| 71 |
+
- Ensures we focus on movies with enough feedback to be meaningful.
|
| 72 |
+
|
| 73 |
+
**Result:**
|
| 74 |
+
The final dataset contains ratings from active users on popular movies, improving the quality of recommendations and decreasing the size of the dataset.
|
| 75 |
+
""")

formatted_num = "{:,}".format(ratings.shape[0])
print(formatted_num)

# Optional: Show numbers dynamically
st.markdown(f"**📊 Ratings after filtering:** {formatted_num} ratings from {ratings['userId'].nunique()} users on {ratings['movieId'].nunique()} movies.")


# ====================== Index Mapping ======================
# For each unique userId, assign a unique integer code starting from 0.
ratings['user_idx'] = ratings['userId'].astype('category').cat.codes
ratings['movie_idx'] = ratings['movieId'].astype('category').cat.codes

num_users = ratings['user_idx'].nunique()
num_movies = ratings['movie_idx'].nunique()

# Map indices back to original IDs, e.g. {0: 100, 1: 101, 2: 105}
user_idx_to_id = dict(enumerate(ratings['userId'].astype('category').cat.categories))
movie_idx_to_id = dict(enumerate(ratings['movieId'].astype('category').cat.categories))

# ====================== Sparse Matrix ======================

# Rows → users
# Columns → movies
# Values → ratings
# Most entries are 0 (or missing) because not all users rate all movies.

user_item_matrix = coo_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
    shape=(num_users, num_movies)
).tocsr()

# CSR format allows fast operations for matrix factorization or nearest-neighbor search.
# The full matrix would have 50 million cells, but maybe only 1 million ratings exist.
# A dense matrix stores all 50 million entries → huge memory usage; a sparse matrix
# stores only the non-zero ratings → huge memory savings.
# There are different sparse formats, e.g., COO, CSR, CSC.
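
# Illustrative sketch of the memory argument above, with toy numbers (assumed,
# not this app's data): COO stores only (row, col, value) triplets for the
# observed ratings, and .tocsr() converts to CSR for fast row slicing.
_demo = coo_matrix(
    (np.array([5.0, 3.5]),                       # two observed ratings
     (np.array([0, 1]), np.array([2, 0]))),      # their (user_idx, movie_idx) positions
    shape=(10_000, 5_000)                        # 50 million cells in total
).tocsr()
assert _demo.nnz == 2                            # only 2 values are actually stored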

# ====================== Incremental SVD Caching ======================
SVD_CACHE_PATH = cache_dir / "svd_incremental_cache.joblib"

def compute_incremental_svd(matrix: csr_matrix, n_components=100, batch_size=1000):
    """
    Memory-efficient incremental SVD using IncrementalPCA.
    Works even when the matrix cannot fit entirely in memory.
    """
    if SVD_CACHE_PATH.exists():
        # st.text("✅ Loading cached SVD decomposition...")
        U, Sigma, VT = joblib.load(SVD_CACHE_PATH)
        return U, Sigma, VT

    st.text("⚙️ Computing Incremental SVD... (this may take several minutes)")
    ipca = IncrementalPCA(n_components=n_components)
    n_samples = matrix.shape[0]

    # Fit in chunks
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        ipca.partial_fit(batch)
        st.text(f"Processed rows {start_idx} to {end_idx}")

    # Transform full data
    U = []
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        U.append(ipca.transform(batch))
    U = np.vstack(U)

    VT = ipca.components_
    # Placeholder: downstream code only uses U, so the singular values are set to ones.
    Sigma = np.ones(ipca.n_components_)

    joblib.dump((U, Sigma, VT), SVD_CACHE_PATH)
    st.text("✅ SVD computation cached successfully.")
    return U, Sigma, VT

U, Sigma, VT = compute_incremental_svd(user_item_matrix, n_components=100, batch_size=100000)
U_normalized = normalize(U)

# ====================== Recommendation Function ======================
def recommend_for_user_svd(user_id, top_n=5, n_similar_users=30):
    start = time.time()
    if user_id not in ratings['userId'].values:
        return "User not found.", 0

    user_idx = ratings.loc[ratings['userId'] == user_id, 'user_idx'].iloc[0]

    # Cosine similarity in latent space
    user_vector = U_normalized[user_idx]
    similarities = np.dot(U_normalized, user_vector)

    # Select top similar users
    similar_user_indices = similarities.argsort()[::-1][1:n_similar_users+1]
    similar_scores = similarities[similar_user_indices]

    user_rated_movie_indices = user_item_matrix.getrow(user_idx).nonzero()[1]
    user_rated_movie_set = set(user_rated_movie_indices)

    # Aggregate weighted ratings
    scores = {}
    for sim_user_idx, sim_score in zip(similar_user_indices, similar_scores):
        sim_user_ratings = user_item_matrix.getrow(sim_user_idx)
        movie_indices = sim_user_ratings.nonzero()[1]
        sim_ratings = sim_user_ratings.data
        for m_idx, rating in zip(movie_indices, sim_ratings):
            if m_idx not in user_rated_movie_set:
                scores[m_idx] = scores.get(m_idx, 0) + sim_score * rating

    # Sort and recommend
    recommended_movie_indices = sorted(scores, key=scores.get, reverse=True)[:top_n]
    recommended_movie_ids = [movie_idx_to_id[idx] for idx in recommended_movie_indices]

    end = time.time()
    recommendations = movies[movies['movieId'].isin(recommended_movie_ids)][['title']]
    return recommendations, end - start


# 1. Works in latent space → captures hidden user preferences.
# 2. Uses top similar users → collaborative filtering logic.
# 3. Avoids recommending already rated movies.
# 4. Weighted aggregation ensures that more similar users influence recommendations more.
# 5. Returns top N movies along with computation time.

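# Illustrative usage sketch of the function above (the user id 42 is
# hypothetical; any value present in ratings['userId'] behaves the same):
#
#     recs, seconds = recommend_for_user_svd(user_id=42, top_n=5, n_similar_users=30)
#
# `recs` is a DataFrame of recommended titles, `seconds` the wall-clock time taken.
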
# ====================== Streamlit UI ===========================================

# st.dataframe(ratings.head(1000))
# st.text('unique')
# st.text(ratings['user_idx'].nunique())
# st.text(ratings['userId'].nunique())

# The widget collects a 0-based user index, which is mapped back to the
# original userId before recommending.
user_idx_input = st.number_input(
    "Enter User Index",
    min_value=int(ratings['user_idx'].min()),
    max_value=int(ratings['user_idx'].max()),
    step=1
)

user_id_input = ratings[ratings['user_idx'] == user_idx_input]['userId'].unique()[0]

# st.dataframe(ratings)

# st.text(user_id_input)


if st.button("🎥 Recommend Movies"):

    col1, col2 = st.columns(2)
    with col1:
        st.text('User Activity')
        current_user = ratings[ratings['userId'] == user_id_input]
        merge_results = pd.merge(current_user, movies, how='left', on='movieId')
        merge_results = merge_results[['title', 'rating']]
        st.dataframe(merge_results)
    with col2:
        st.text('Recommended Movies')
        recs, duration = recommend_for_user_svd(user_id_input, top_n=5)
        st.dataframe(recs)
        st.text(f"Computed in {duration:.2f} seconds.")
content_based.py
ADDED
@@ -0,0 +1,157 @@
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import streamlit as st
from gensim.models import Word2Vec

# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("Content Based Filtering")
st.markdown('---')

# ======================== Load Data ==========================
movies = pd.read_csv('movies_final.csv')
movies = movies.drop(columns=['movieId'])

# ======================== Helper Function ====================
def combine_tokens(row, cols):
    combined = []
    for col in cols:
        val = row[col]
        if isinstance(val, list):
            combined.extend(val)
        elif isinstance(val, str):
            combined.append(val)
    return list(set(combined))

# ======================== BOW ================================
def recommend_bow(title, year, top_n=5):
    if title not in movies['title'].values:
        return "Movie not found."

    # Get index of the queried movie
    idx = movies[(movies['title'] == title) & (movies['year'] == year)].index[0]

    # Compute cosine similarity scores
    query_vector = bow_matrix[idx]
    sim_scores = cosine_similarity(query_vector, bow_matrix).flatten()

    # Get indices of similar movies (excluding the query itself)
    similar_indices = sim_scores.argsort()[::-1][1:top_n + 1]

    # Create a DataFrame for top recommendations with similarity scores
    recommendations = pd.DataFrame({
        'title': movies.iloc[similar_indices]['title'].values,
        'year': movies.iloc[similar_indices]['year'].values,
        'similarity_score': sim_scores[similar_indices]
    })

    # Sort for clarity (highest similarity first)
    recommendations = recommendations.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)

    return recommendations


# ======================== TF-IDF ==============================
def recommend_tfidf(title, year, top_n=5):
    if title not in movies['title'].values:
        return "Movie not found."

    idx = movies[(movies['title'] == title) & (movies['year'] == year)].index[0]
    query_vector = tfidf_matrix[idx]
    sim_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get the indices of similar movies (excluding the movie itself)
    similar_indices = sim_scores.argsort()[::-1][1:]

    # Ensure we don't return more than `top_n` recommendations
    similar_indices = similar_indices[:top_n]

    # Get the movie titles
    top_movies = movies.iloc[similar_indices]['title'].tolist()

    return top_movies

# ======================== WORD2VEC ============================
def build_word2vec_vectors():
    tokenized_corpus = [text.split() for text in movies['tokens']]

    # Train Word2Vec model
    w2v_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

    # Function to compute the average word vector for each movie
    def get_avg_vector(tokens):
        vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
        if len(vectors) == 0:
            return np.zeros(w2v_model.vector_size)
        return np.mean(vectors, axis=0)

    # Compute average vector for each movie
    movies['w2v_vector'] = [get_avg_vector(text.split()) for text in movies['tokens']]
    w2v_matrix = np.vstack(movies['w2v_vector'].values)
    return w2v_model, w2v_matrix

def recommend_w2v(title, year, w2v_matrix, top_n=5):
    if title not in movies['title'].values:
        return "Movie not found."

    idx = movies[(movies['title'] == title) & (movies['year'] == year)].index[0]
    query_vec = w2v_matrix[idx].reshape(1, -1)
    sims = cosine_similarity(query_vec, w2v_matrix).flatten()

    # Get the indices of similar movies (excluding the movie itself)
    similar_indices = sims.argsort()[::-1][1:]

    # Ensure we don't return more than `top_n` recommendations
    similar_indices = similar_indices[:top_n]

    # Get the movie titles
    top_movies = movies.iloc[similar_indices]['title'].tolist()

    return top_movies

# ======================== UI ========================
st.header('🎬 Choose an Algorithm and Movie for Content-Based Filtering')

select_algo = st.selectbox('Select the algorithm', ['Bag of Words', 'TF-IDF', 'Word2Vec'])
selected_movie = st.selectbox('Select a movie', movies['title'].unique())
selected_year = st.selectbox('Select the year', movies[movies['title'] == selected_movie]['year'])

all_columns = movies.columns.tolist()
selected_cols = st.multiselect("Select columns to combine for tokens", all_columns, default=['genres', 'tag', 'plot'])

# Combine selected columns into tokens
movies['tokens'] = movies.apply(lambda row: combine_tokens(row, selected_cols), axis=1)
movies['tokens'] = movies['tokens'].apply(lambda x: ' '.join(x))

if st.button('Recommend'):
    # Display the selected movie and its columns
    st.dataframe(movies[(movies['title'] == selected_movie) & (movies['year'] == selected_year)][['title'] + selected_cols])

    # Recommendation based on the selected algorithm
    if select_algo == 'Bag of Words':
        with st.spinner("Building Bag of Words model..."):
            vectorizer = CountVectorizer()
            bow_matrix = vectorizer.fit_transform(movies['tokens'])
            output = recommend_bow(selected_movie, selected_year, top_n=5)
            st.success("Bag of Words model ready ✅")

        output_display = pd.merge(output, movies[['title', 'year'] + selected_cols], on=['title', 'year'], how='left').reset_index(drop=True)
        st.subheader("🎯 Top Recommendations (Bag of Words)")
        st.dataframe(output_display[['title', 'year', 'similarity_score'] + selected_cols])

    elif select_algo == 'TF-IDF':
        with st.spinner("Building TF-IDF model..."):
            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(movies['tokens'])
            output = recommend_tfidf(selected_movie, selected_year, top_n=5)
            st.success("TF-IDF model ready ✅")
        st.dataframe(movies[movies['title'].isin(output)][['title'] + selected_cols].reset_index(drop=True))

    elif select_algo == 'Word2Vec':
        with st.spinner("Training Word2Vec model..."):
            w2v_model, w2v_matrix = build_word2vec_vectors()
            output = recommend_w2v(selected_movie, selected_year, w2v_matrix, top_n=5)
            st.success("Word2Vec model ready ✅")
        st.dataframe(movies[movies['title'].isin(output)][['title'] + selected_cols].reset_index(drop=True))
data/cache/svd_incremental_cache.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:58176c8e3545d4ade2701a7fdeca9d1495b864fa2a77bbc5db4272dfacb01eda
size 13812355
data/grouped_tags.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e9269b5d520d4063ae57c3027ce0402dc755927ac11db46ea835c2321f6f071e
size 31065818
data/links.csv
ADDED
The diff for this file is too large to render.
data/merger.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3d566674daf655758a3c7d01e1ec31ff6a63064d8f1b6fa67f693e2badd8347e
size 21817265
data/movies.csv
ADDED
The diff for this file is too large to render.
data/ratings.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e0ab9d46f7ba3cdb7a00602f608491bcae6d479f1c5649d291d07eccb0289870
size 529039466
data/tags.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4e4b043d9088ce7bdad9f4ba06901330f3c4adec6e448da83d9a5d32fc756d55
size 25847889
item_based.py
ADDED
@@ -0,0 +1,49 @@
from sentence_transformers import SentenceTransformer
import sys
import os
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import time


# sys.path.append(os.path.dirname(__file__))
# data_dir = Path(__file__).parent.parent / 'data'

# # links dataset
# movies = data_dir / 'movies_final.csv'


movies = pd.read_csv('movies_final.csv')
st.text(movies.shape)
st.dataframe(movies.head())

start = time.time()

model = SentenceTransformer('all-MiniLM-L6-v2')
movies['bert_vector'] = model.encode(movies['tokens'].tolist(), show_progress_bar=True).tolist()

def recommend_bert(title, top_n=5):
    if title not in movies['title'].values:
        return "Movie not found."

    idx = movies[movies['title'] == title].index[0]
    query_vector = np.array(movies.loc[idx, 'bert_vector']).reshape(1, -1)
    all_vectors = np.vstack(movies['bert_vector'].values)

    similarities = cosine_similarity(query_vector, all_vectors)[0]
    sim_scores = list(enumerate(similarities))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Skip the first result (it’s the movie itself)
    recommended = [movies.iloc[i]['title'] for i, score in sim_scores[1:top_n+1]]
    return recommended

out = recommend_bert('Schindler\'s List')
st.text(out)

end = time.time()

st.text(end - start)
main.py
ADDED
@@ -0,0 +1,16 @@
import streamlit as st

# Define pages (by file or function)
page1 = st.Page("about_me.py", title="About Me", icon="👤")
page2 = st.Page("base.py", title="Home Page", icon="🏠")
page3 = st.Page("popularity.py", title="Popularity Based Filtering", icon="🔥")
page4 = st.Page("content_based.py", title="Content Based Filtering", icon="🎬")
page5 = st.Page("collaborative_user.py", title="Collaborative User based", icon="👥")
page6 = st.Page("collaborative_item.py", title="Collaborative Item based", icon="🛍️")


# Create navigation
pg = st.navigation([page1, page2, page3, page4, page5, page6])

# Run navigation
pg.run()
movies_final.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:77a03ffbfcc887eef7e34b750ec446233b09b2e70471eae138bbb094c0c09cda
size 24885151
nltk_data/corpora/stopwords.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:174365e185ab7b343a4ca41ac9d1b44f03c928a226eb720afb027882865e07c0
size 823
nltk_data/corpora/stopwords/english
ADDED
@@ -0,0 +1,198 @@
a
about
above
after
again
against
ain
all
am
an
and
any
are
aren
aren't
as
at
be
because
been
before
being
below
between
both
but
by
can
couldn
couldn't
d
did
didn
didn't
do
does
doesn
doesn't
doing
don
don't
down
during
each
few
for
from
further
had
hadn
hadn't
has
hasn
hasn't
have
haven
haven't
having
he
he'd
he'll
her
here
hers
herself
he's
him
himself
his
how
i
i'd
if
i'll
i'm
in
into
is
isn
isn't
it
it'd
it'll
it's
its
itself
i've
just
ll
m
ma
me
mightn
mightn't
more
most
mustn
mustn't
my
myself
needn
needn't
no
nor
not
now
o
of
off
on
once
only
or
other
our
ours
ourselves
out
over
own
re
s
same
shan
shan't
she
she'd
she'll
she's
should
shouldn
shouldn't
should've
so
some
such
t
than
that
that'll
the
their
theirs
them
themselves
then
there
these
they
they'd
they'll
they're
they've
this
those
through
to
too
under
until
up
ve
very
was
wasn
wasn't
we
we'd
we'll
we're
were
weren
weren't
we've
what
when
where
which
while
who
whom
why
will
with
won
won't
wouldn
wouldn't
y
you
you'd
you'll
your
you're
yours
yourself
yourselves
you've
nltk_data/corpora/wordnet.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cbda5ea6eef7f36a97a43d4a75f85e07fccbb4f23657d27b4ccbc93e2646ab59
size 10775600
nltk_data/tokenizers/punkt.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec
size 13905355
nltk_data/tokenizers/punkt/.DS_Store
ADDED
Binary file (6.15 kB).
nltk_data/tokenizers/punkt/PY3/README
ADDED
@@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)

Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.

For information about how to use these models, please confer the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation

There are pretrained tokenizers for the following languages:

File                Language     Source                              Contents                       Size of training corpus (in tokens)   Model contributed by
=======================================================================================================================================================================
czech.pickle        Czech        Multilingual Corpus 1 (ECI)         Lidove Noviny                  ~345,000                              Jan Strunk / Tibor Kiss
                                                                     Literarni Noviny
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
danish.pickle       Danish       Avisdata CD-Rom Ver. 1.1. 1995      Berlingske Tidende             ~550,000                              Jan Strunk / Tibor Kiss
                                 (Berlingske Avisdata, Copenhagen)   Weekend Avisen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle        Dutch        Multilingual Corpus 1 (ECI)         De Limburger                   ~340,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
english.pickle      English      Penn Treebank (LDC)                 Wall Street Journal            ~469,000                              Jan Strunk / Tibor Kiss
                    (American)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle     Estonian     University of Tartu, Estonia        Eesti Ekspress                 ~359,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle      Finnish      Finnish Parole Corpus, Finnish      Books and major national       ~364,000                              Jan Strunk / Tibor Kiss
                                 Text Bank (Suomen Kielen            newspapers
                                 Tekstipankki)
                                 Finnish Center for IT Science
                                 (CSC)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
french.pickle       French       Multilingual Corpus 1 (ECI)         Le Monde                       ~370,000                              Jan Strunk / Tibor Kiss
                    (European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
german.pickle       German       Neue Zürcher Zeitung AG             Neue Zürcher Zeitung           ~847,000                              Jan Strunk / Tibor Kiss
                    (Switzerland) CD-ROM
                    (Uses "ss"
                    instead of "ß")
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
greek.pickle        Greek        Efstathios Stamatatos               To Vima (TO BHMA)              ~227,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
italian.pickle      Italian      Multilingual Corpus 1 (ECI)         La Stampa, Il Mattino          ~312,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle    Norwegian    Centre for Humanities               Bergens Tidende                ~479,000                              Jan Strunk / Tibor Kiss
                    (Bokmål and  Information Technologies,
                    Nynorsk)     Bergen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
polish.pickle       Polish       Polish National Corpus              Literature, newspapers, etc.   ~1,000,000                            Krzysztof Langner
                                 (http://www.nkjp.pl/)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle   Portuguese   CETENFolha Corpus                   Folha de São Paulo             ~321,000                              Jan Strunk / Tibor Kiss
                    (Brazilian)  (Linguateca)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle      Slovene      TRACTOR                             Delo                           ~354,000                              Jan Strunk / Tibor Kiss
                                 Slovene Academy for Arts
                                 and Sciences
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle      Spanish      Multilingual Corpus 1 (ECI)         Sur                            ~353,000                              Jan Strunk / Tibor Kiss
                    (European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle      Swedish      Multilingual Corpus 1 (ECI)         Dagens Nyheter                 ~339,000                              Jan Strunk / Tibor Kiss
                                 (and some other texts)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle      Turkish      METU Turkish Corpus                 Milliyet                       ~333,000                              Jan Strunk / Tibor Kiss
                                 (Türkçe Derlem Projesi)
                                 University of Ankara
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------

The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.

Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.

---- Training Code ----

# import punkt
import nltk.tokenize.punkt

# Make a new Tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()

# Train tokenizer
tokenizer.train(text)

# Dump pickled tokenizer
import pickle
out = open("slovene.pickle","wb")
pickle.dump(tokenizer, out)
out.close()

---------
nltk_data/tokenizers/punkt/PY3/english.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
size 406697
nltk_data/tokenizers/punkt/README
ADDED
@@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)

Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.

For information about how to use these models, please confer the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation

There are pretrained tokenizers for the following languages:

File                Language     Source                              Contents                       Size of training corpus (in tokens)   Model contributed by
=======================================================================================================================================================================
czech.pickle        Czech        Multilingual Corpus 1 (ECI)         Lidove Noviny                  ~345,000                              Jan Strunk / Tibor Kiss
                                                                     Literarni Noviny
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
danish.pickle       Danish       Avisdata CD-Rom Ver. 1.1. 1995      Berlingske Tidende             ~550,000                              Jan Strunk / Tibor Kiss
                                 (Berlingske Avisdata, Copenhagen)   Weekend Avisen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle        Dutch        Multilingual Corpus 1 (ECI)         De Limburger                   ~340,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
english.pickle      English      Penn Treebank (LDC)                 Wall Street Journal            ~469,000                              Jan Strunk / Tibor Kiss
                    (American)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle     Estonian     University of Tartu, Estonia        Eesti Ekspress                 ~359,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle      Finnish      Finnish Parole Corpus, Finnish      Books and major national       ~364,000                              Jan Strunk / Tibor Kiss
                                 Text Bank (Suomen Kielen            newspapers
                                 Tekstipankki)
                                 Finnish Center for IT Science
                                 (CSC)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
french.pickle       French       Multilingual Corpus 1 (ECI)         Le Monde                       ~370,000                              Jan Strunk / Tibor Kiss
                    (European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
german.pickle       German       Neue Zürcher Zeitung AG             Neue Zürcher Zeitung           ~847,000                              Jan Strunk / Tibor Kiss
                    (Switzerland) CD-ROM
                    (Uses "ss"
                    instead of "ß")
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
greek.pickle        Greek        Efstathios Stamatatos               To Vima (TO BHMA)              ~227,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
italian.pickle      Italian      Multilingual Corpus 1 (ECI)         La Stampa, Il Mattino          ~312,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle    Norwegian    Centre for Humanities               Bergens Tidende                ~479,000                              Jan Strunk / Tibor Kiss
                    (Bokmål and  Information Technologies,
                    Nynorsk)     Bergen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
polish.pickle       Polish       Polish National Corpus              Literature, newspapers, etc.   ~1,000,000                            Krzysztof Langner
                                 (http://www.nkjp.pl/)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle   Portuguese   CETENFolha Corpus                   Folha de São Paulo             ~321,000                              Jan Strunk / Tibor Kiss
                    (Brazilian)  (Linguateca)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle      Slovene      TRACTOR                             Delo                           ~354,000                              Jan Strunk / Tibor Kiss
                                 Slovene Academy for Arts
                                 and Sciences
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle      Spanish      Multilingual Corpus 1 (ECI)         Sur                            ~353,000                              Jan Strunk / Tibor Kiss
                    (European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle      Swedish      Multilingual Corpus 1 (ECI)         Dagens Nyheter                 ~339,000                              Jan Strunk / Tibor Kiss
                                 (and some other texts)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle      Turkish      METU Turkish Corpus                 Milliyet                       ~333,000                              Jan Strunk / Tibor Kiss
                                 (Türkçe Derlem Projesi)
                                 University of Ankara
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------

The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.

Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.

---- Training Code ----

# import punkt
import nltk.tokenize.punkt

# Make a new Tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()

# Train tokenizer
tokenizer.train(text)

# Dump pickled tokenizer
import pickle
out = open("slovene.pickle","wb")
pickle.dump(tokenizer, out)
out.close()

---------
nltk_data/tokenizers/punkt/english.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dda37972ae88998a6fd3e3ec002697a6bd362b32d050fda7d7ca5276873092aa
size 433305
popularity.py
ADDED
@@ -0,0 +1,129 @@
import pandas as pd
import numpy as np
import streamlit as st
from pathlib import Path
import sys
import os
import time
import matplotlib.pyplot as plt


# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("Popularity Based Filtering")
st.markdown('---')

# ====================== Data Extraction ======================

sys.path.append(os.path.dirname(__file__))
data_dir = Path(__file__).parent / 'data'

movies = pd.read_csv('movies_final.csv')
movies = movies[['movieId', 'title']]

# ratings dataset
# ratings = data_dir / 'ratings.csv'  # unused: ratings are loaded into session state by the home page
ratings = st.session_state['ratings_df']
# ratings = ratings.drop(columns=['timestamp'])
ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')


st.subheader('👀 Glance of the dataset')
col1, col2 = st.columns(2)
with col1:
    st.markdown("Ratings Dataset")
    st.dataframe(ratings.head())
with col2:
    st.markdown("Movies Dataset")
    st.dataframe(movies.head())
st.markdown('---')

# ============================= Most Rated Movies ====================================================
col1, col2 = st.columns(2)

most_rated = ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
movie_ids = most_rated_final['movieId'].tolist()

avg_ratings = ratings[ratings['movieId'].isin(movie_ids)]
avg_ratings = avg_ratings.groupby('movieId')['rating'].mean().reset_index().rename(columns={'rating': 'Avg_Rating'})
most_rated_final = pd.merge(most_rated_final, avg_ratings, how='left', on='movieId')

with col1:
    st.subheader('📈 Top 5 Most Rated Movies')
    st.dataframe(most_rated_final[['title', 'No_of_Ratings', 'Avg_Rating']])

# ============================= Least Rated Movies ====================================================
least_rated = ratings.groupby('movieId').agg({'rating': 'mean', 'userId': 'count'}).rename(
    columns={'rating': 'Avg_Rating', 'userId': 'No_of_Ratings'}
)
least_rated = least_rated.sort_values(['No_of_Ratings', 'Avg_Rating']).head().reset_index()
least_rated_final = pd.merge(least_rated, movies, how='left', on='movieId')

with col2:
    st.subheader('📉 Top 5 Least Rated Movies')
    st.dataframe(least_rated_final[['title', 'No_of_Ratings', 'Avg_Rating']])

# ============================= Most 5⭐ and 0.5⭐ Rated Movies ==========================================
st.markdown('---')

col1, col2 = st.columns(2)

filter_ratings = ratings[ratings['rating'] == 5]
most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')

filter_ratings = ratings[ratings['rating'] == 0.5]
most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
least_most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')


with col1:
    st.subheader('Most Rated Movies with 5⭐')
    st.dataframe(most_rated_final[['title', 'No_of_Ratings']])
    st.subheader('Most Rated Movies with 0.5⭐')
    st.dataframe(least_most_rated_final[['title', 'No_of_Ratings']])

# ============================= Ratings Distribution Graph ============================================

with col2:
    st.subheader('📊 Ratings Distribution')
    graph = ratings.groupby('rating').count()['userId']
    fig, ax = plt.subplots()
    ax.bar(graph.index, graph.values)
    ax.set_xlabel('Rating')
    ax.set_ylabel('Number of Ratings')
    ax.set_title('Number of Ratings by Rating Value')
    st.pyplot(fig)

st.markdown('---')

# ============================= User Selected Ratings ===================================================
st.subheader('🎯 Movies Filtered by Selected Rating')
ratings_selected = st.selectbox(
    'Select the Rating',
    pd.Series(ratings['rating'].unique()).sort_values(ascending=False).tolist()
)
filter_ratings = ratings[ratings['rating'] == ratings_selected]

col1, col2 = st.columns(2)

with col1:
    st.markdown('**Most Rated for Selected Rating**')
    most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
    most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
    most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
    st.dataframe(most_rated_final[['title', 'No_of_Ratings']])
with col2:
    st.markdown('**Least Rated for Selected Rating**')
    most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values().head()
    most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
    most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
    st.dataframe(most_rated_final[['title', 'No_of_Ratings']])
requirements.txt
ADDED
@@ -0,0 +1,10 @@
joblib==1.4.2
matplotlib==3.10.7
nltk==3.9.2
pandas==2.3.3
scikit_learn==1.7.2
sentence_transformers==5.1.1
streamlit==1.50.0
gensim==4.3.3
numpy==1.23.5
scipy==1.13.0