Initial Commit

- .gitattributes +5 -0
- about_me.py +162 -0
- base.py +603 -0
- collaborative_item.py +211 -0
- collaborative_user.py +235 -0
- content_based.py +157 -0
- data/cache/svd_incremental_cache.joblib +3 -0
- data/grouped_tags.csv +3 -0
- data/links.csv +0 -0
- data/merger.csv +3 -0
- data/movies.csv +0 -0
- data/ratings.csv +3 -0
- data/tags.csv +3 -0
- item_based.py +49 -0
- main.py +16 -0
- movies_final.csv +3 -0
- nltk_data/corpora/stopwords.zip +3 -0
- nltk_data/corpora/stopwords/english +198 -0
- nltk_data/corpora/wordnet.zip +3 -0
- nltk_data/tokenizers/punkt.zip +3 -0
- nltk_data/tokenizers/punkt/.DS_Store +0 -0
- nltk_data/tokenizers/punkt/PY3/README +98 -0
- nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
- nltk_data/tokenizers/punkt/README +98 -0
- nltk_data/tokenizers/punkt/english.pickle +3 -0
- popularity.py +129 -0
- requirements.txt +10 -0
.gitattributes
CHANGED

```diff
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/grouped_tags.csv filter=lfs diff=lfs merge=lfs -text
+data/merger.csv filter=lfs diff=lfs merge=lfs -text
+data/ratings.csv filter=lfs diff=lfs merge=lfs -text
+data/tags.csv filter=lfs diff=lfs merge=lfs -text
+movies_final.csv filter=lfs diff=lfs merge=lfs -text
```
about_me.py
ADDED

```python
import streamlit as st

#===============================
import pandas as pd
from pathlib import Path
import os
import sys

# Define the data directory outside the function
data_dir = Path(__file__).parent / 'data'

# 1. Use st.cache_data to run this function only once
@st.cache_data
def load_ratings_data():
    chunks = []
    # Nullable integers ('Int32') let rows with missing ids survive the read
    dtype_ratings = {'userId': 'Int32', 'movieId': 'Int32', 'rating': 'float32'}

    # Add a spinner to show the user the file is loading
    with st.spinner('Loading ratings data (please bear with me)...'):
        for chunk in pd.read_csv(
            data_dir / 'ratings.csv',
            dtype=dtype_ratings,
            chunksize=1_000_000
        ):
            # Drop rows where 'userId' or 'movieId' is NaN
            chunk = chunk.dropna(subset=['userId', 'movieId'])
            chunks.append(chunk)

        # Concatenate all chunks into the final dataframe
        ratings = pd.concat(chunks, ignore_index=True)
    return ratings

# 2. Call the function to load and cache the data.
# This DataFrame is now available globally or can be passed to functions.
ratings_df = load_ratings_data()
st.session_state['ratings_df'] = ratings_df  # Use session_state to share it easily

#==============================
```
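Every other page reads this cached table back out of `st.session_state`, so a minimal sketch of the retrieval side may help; the existence check and warning below are illustrative assumptions, not part of the committed app:

```python
import streamlit as st

# A minimal sketch: later pages reuse the cached DataFrame instead of
# re-reading ratings.csv. The guard is a hypothetical fallback for when
# the entry page has not populated session_state yet.
if 'ratings_df' in st.session_state:
    ratings = st.session_state['ratings_df']
    st.text(f"Loaded {len(ratings):,} ratings from session_state")
else:
    st.warning("Open the About page first so the ratings data gets cached.")
```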
(about_me.py, continued)

```python
# Basic page styling
st.markdown(
    """
    <style>
    .stApp {
        background-color: white;
        color: #116A91;
    }
    .title {
        text-align: center;
        color: #116A91;
        font-size: 36px;
        margin-bottom: 10px;
    }
    .subtitle {
        text-align: center;
        color: #444444;
        font-size: 18px;
        margin-bottom: 30px;
    }
    .section {
        background-color: #f9f9f9;
        padding: 20px;
        margin-bottom: 20px;
        border-radius: 6px;
    }
    .header {
        color: #116A91;
        font-size: 20px;
        margin-bottom: 10px;
    }
    .content {
        color: #333333;
        font-size: 16px;
        line-height: 1.6;
    }
    </style>
    """, unsafe_allow_html=True
)

# Page Title and Subtitle
st.markdown("<div class='title'>Movie Recommender System</div>", unsafe_allow_html=True)
st.markdown("<div class='subtitle'>Discover movies you'll love using different recommendation techniques 🎬</div>", unsafe_allow_html=True)

# Creator Section
st.markdown("""
<div class='section'>
  <div class='header'>👤 About the Creator</div>
  <div class='content'>
    <strong>SJ</strong> is the creator of this Movie Recommender System.
    <br><br>
    Currently working as a <strong>Product Manager</strong>, SJ specializes in leading cross-functional teams to build and launch user-centric digital products. With a strong foundation in both business strategy and data, SJ is passionate about solving real-world problems through thoughtful product design.
    <br><br>
    Prior to becoming a Product Manager, SJ held roles as:
    <ul>
      <li><strong>Data Analyst</strong> – Experienced in SQL, Excel, Tableau, Python, Consumer Bureau data, and Microfinance analytics</li>
      <li><strong>Data Engineer</strong> – Worked with IBM DataStage, Informatica, Ataccama, and Collibra</li>
    </ul>
    This diverse background is what inspired SJ to build tools like this — combining the power of data with simple, intuitive products.
  </div>
</div>
""", unsafe_allow_html=True)

# Purpose Section
st.markdown("""
<div class='section'>
  <div class='header'>📌 Purpose of This App</div>
  <div class='content'>
    This Movie Recommender System helps users discover movies they might enjoy based on different recommendation techniques:
    <ul>
      <li>🔥 Popularity Based Filtering – Recommend trending movies</li>
      <li>🎬 Content-Based Filtering – Recommend movies similar to your favorites</li>
      <li>👥 Collaborative User-Based – Recommend movies liked by users similar to you</li>
      <li>🛍️ Collaborative Item-Based – Recommend movies similar to ones you've rated highly</li>
    </ul>
    The goal is to provide a simple, interactive way to explore movie recommendations tailored to your preferences.
  </div>
</div>
""", unsafe_allow_html=True)

# Technology Section
st.markdown("""
<div class='section'>
  <div class='header'>🛠️ Technology Used</div>
  <div class='content'>
    The application is developed using <strong>Streamlit</strong>, a lightweight and efficient Python framework for building data apps.
    <br><br>
    Key Python libraries used include <strong>Pandas</strong>, <strong>NumPy</strong>, <strong>Scikit-Learn</strong>, and <strong>Scipy</strong> for data processing and recommendation algorithms.
  </div>
</div>
""", unsafe_allow_html=True)

# Features Section
st.markdown("""
<div class='section'>
  <div class='header'>🔍 What You Can Do with It</div>
  <div class='content'>
    With this app, users can:
    <ul>
      <li>Explore movie recommendations based on popularity, content similarity, or collaborative filtering</li>
      <li>View detailed recommendations for a specific user</li>
      <li>Filter movies based on ratings and user activity</li>
      <li>Understand hidden patterns in user preferences using latent features</li>
    </ul>
    The system helps both casual viewers and movie enthusiasts find movies they are likely to enjoy.
  </div>
</div>
""", unsafe_allow_html=True)

# Footer / Call-to-action
st.markdown("""
<div class='section'>
  <div class='header'>🎯 Ready to Discover Movies?</div>
  <div class='content'>
    Choose a recommendation technique from the sidebar and start exploring movies tailored to your tastes.
    <br><br>
    Enjoy discovering new favorites!
  </div>
</div>
""", unsafe_allow_html=True)
```
base.py
ADDED

```python
import pandas as pd
import numpy as np
import streamlit as st
import sys
import os
import time
from pathlib import Path
import io
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)

def download_nltk_resources():
    resources = ['wordnet', 'stopwords', 'punkt']
    for res in resources:
        try:
            nltk.data.find(res)
        except LookupError:
            try:
                nltk.download(res, download_dir=nltk_data_dir)
            except Exception as e:
                st.error(f"Error downloading {res}: {e}")
```
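One caveat worth noting: `nltk.data.find()` expects a category-qualified path such as `'corpora/stopwords'`, so probing with the bare names above falls through to the download branch on every run. A variant sketch with qualified paths (an adjustment for illustration, not the committed code):

```python
import nltk

# Hypothetical variant of download_nltk_resources: look up each resource by
# its category-qualified path so an already-installed copy is actually found.
RESOURCE_PATHS = {
    'wordnet': 'corpora/wordnet',
    'stopwords': 'corpora/stopwords',
    'punkt': 'tokenizers/punkt',
}

def ensure_nltk_resources(download_dir="nltk_data"):
    for name, path in RESOURCE_PATHS.items():
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(name, download_dir=download_dir)
```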
(base.py, continued)

```python
# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("Data Preprocessing")
st.markdown('---')

# ====================== Importing the datasets ===========================
sys.path.append(os.path.dirname(__file__))
data_dir = Path(__file__).parent / 'data'

# links dataset
links = data_dir / 'links.csv'
links = pd.read_csv(links)

# tags dataset
tags = data_dir / 'tags.csv'
tags = pd.read_csv(tags)

# movies dataset
movies = data_dir / 'movies.csv'
movies = pd.read_csv(movies)

# ratings dataset
# chunks = []
# dtype_ratings = {'userId': 'Int32', 'movieId': 'Int32', 'rating': 'float32'}

# for chunk in pd.read_csv(data_dir / 'ratings.csv', dtype=dtype_ratings, chunksize=1_000_000):
#     chunk = chunk.dropna(subset=['userId', 'movieId'])
#     chunks.append(chunk)

# # Concatenate all chunks into the final dataframe
# ratings = pd.concat(chunks, ignore_index=True)

ratings = st.session_state['ratings_df']

# ratings = ratings[ratings['movieId'].isin(movies['movieId'])]
# ratings = ratings.drop(columns=['timestamp'], errors='ignore')
# loc = data_dir / 'ratings.csv'
# ratings.to_csv(loc, index=False)

#============================ Descriptions of the datasets ===========================
st.subheader("ℹ️ MovieLens Datasets")

st.markdown("""
The **MovieLens dataset** is a benchmark dataset widely used for building and testing movie recommendation systems.
Below is a brief overview of the four main data files included:
""")

# --- Row 1: Movies and Tags ---
row1_col1, row1_col2 = st.columns(2)

with row1_col1:
    st.subheader("🎬 movies.csv")
    st.markdown("""
Contains metadata about each movie.
**Columns:**
- 'movieId': Unique identifier for each movie
- 'title': Movie title (usually includes release year)
- 'genres': Pipe-separated list of genres (e.g., *Action|Comedy|Drama*)
""")

with row1_col2:
    st.subheader("🏷️ tags.csv")
    st.markdown("""
Contains user-generated movie tags.
**Columns:**
- 'userId': User who applied the tag
- 'movieId': Tagged movie
- 'tag': User-generated keyword or phrase
- 'timestamp': When the tag was added (UNIX time)
""")

# --- Row 2: Ratings and Links ---
row2_col1, row2_col2 = st.columns(2)

with row2_col1:
    st.subheader("⭐ ratings.csv")
    st.markdown("""
Contains user ratings for movies.
**Columns:**
- 'userId': Unique identifier for each user
- 'movieId': Rated movie
- 'rating': Rating given (typically from 0.5 to 5.0)
- 'timestamp': When the rating was recorded (UNIX time)
""")
    st.markdown('*Please note that the "ratings" file is quite large, and I have already performed some preprocessing on it. In this section, I will only be demonstrating the steps involved in the preprocessing process.*')

with row2_col2:
    st.subheader("🔗 links.csv")
    st.markdown("""
Maps MovieLens movies to external movie databases.
**Columns:**
- 'movieId': Unique identifier for each movie
- 'imdbId': IMDb movie ID (for cross-referencing)
- 'tmdbId': TMDb movie ID (for fetching posters or metadata)
""")

st.markdown('---')

# ========================= Displaying the dataset ===================================
st.subheader('📊 Sample Datasets Preview')
col1, col2 = st.columns(2)
with col1:
    st.subheader('Movies dataset')
    st.text(f"Shape : {movies.shape}")
    st.dataframe(movies.head())
with col2:
    st.subheader('Tags dataset')
    st.text(f"Shape : {tags.shape}")
    st.dataframe(tags.head())
col1, col2 = st.columns(2)
with col1:
    st.subheader('Ratings dataset')
    st.text(f"Shape : {ratings.shape}")
    st.dataframe(ratings.head())
with col2:
    st.subheader('Links dataset')
    st.text(f"Shape : {links.shape}")
    st.dataframe(links.head())

st.markdown('---')

# ====================================================== Preprocessing ================================================================

# ============================= Dropping columns that are not required =============================

st.subheader('📉 Dropping the columns that are not required')
st.text("Dropped 'timestamp' and 'tmdbId' columns from the Ratings and Links datasets")

links = links.drop(columns=['tmdbId'])

saved_ratings_movie_unique = ratings['movieId'].nunique()
saved_ratings_duplicates = ratings[ratings.duplicated()].shape[0]

st.text('After Removal')
col1, col2 = st.columns(2)
with col1:
    st.subheader('Ratings dataset')
    st.text(f"Shape : {ratings.shape}")
    st.dataframe(ratings.head())
with col2:
    st.subheader('Links dataset')
    st.text(f"Shape : {links.shape}")
    st.dataframe(links.head())
st.markdown('---')

# ============================= Information about the dataset =============================

st.subheader('ℹ️ Dataset Information')
col1, col2 = st.columns(2)
with col1:
    st.text('Movies')
    movies_info = pd.DataFrame({
        "Column Name": movies.columns,
        "Non-Null Count": movies.notnull().sum().values,
        "Data Type": movies.dtypes.values
    })
    st.dataframe(movies_info)

    st.text('Ratings')
    ratings_info = pd.DataFrame({
        "Column Name": ratings.columns,
        "Non-Null Count": ratings.notnull().sum().values,
        "Data Type": ratings.dtypes.values
    })
    st.dataframe(ratings_info)

with col2:
    st.text('Tags')
    tags_info = pd.DataFrame({
        "Column Name": tags.columns,
        "Non-Null Count": tags.notnull().sum().values,
        "Data Type": tags.dtypes.values
    })
    st.dataframe(tags_info)

    st.text('Links')
    links_info = pd.DataFrame({
        "Column Name": links.columns,
        "Non-Null Count": links.notnull().sum().values,
        "Data Type": links.dtypes.values
    })
    st.dataframe(links_info)

st.markdown('---')

del(ratings)

# ============================= Checking for duplicates =============================
st.subheader('🧐 Checking for Duplicates')
col1, col2, col3, col4 = st.columns(4)
with col1:
    st.text('Movies')
    st.text(f"# Duplicates found : {movies[movies.duplicated()].shape[0]}")
with col2:
    st.text('Ratings')
    st.text(f"# Duplicates found : {saved_ratings_duplicates}")
with col3:
    st.text('Tags')
    st.text(f"# Duplicates found : {tags[tags.duplicated()].shape[0]}")
    st.table(tags[tags.duplicated()])
with col4:
    st.text('Links')
    st.text(f"# Duplicates found : {links[links.duplicated()].shape[0]}")

st.markdown("*Removing duplicate rows detected in the dataset.*")

st.markdown("---")

# ============================= Unique counts =============================

st.subheader('🔍 Checking for consistency across Dataframes')
st.text(f"Unique movies in Movies Dataset : {movies['movieId'].nunique()}")
st.text(f"Unique movies in Tags Dataset : {tags['movieId'].nunique()}")
st.text(f"Unique movies in Ratings Dataset : {saved_ratings_movie_unique}")
st.text(f"Unique movies in Links Dataset : {links['movieId'].nunique()}")
st.text(f"Unique imdbId in Links Dataset : {links['imdbId'].nunique()}")

st.markdown("*The Tags dataset has tags missing for most of the movies*")
st.markdown("---")

# ============================= Grouping the tags data ====================================

st.subheader('Merging Tags and Links Datasets')
st.markdown("""
- Grouping the Tags dataset based on 'movieId' and merging with the Movies dataset
- Merging the Links dataset with the Movies dataset based on 'imdbId'
""")

# grouping tags data
grouped_tags = tags.groupby('movieId')['tag'].agg(list).reset_index()

del(tags)

# Merging the datasets
movies_tag = pd.merge(movies, grouped_tags, how='left', on='movieId')
movies_tag = pd.merge(movies_tag, links, how='left', on='movieId')

# movies_tag.to_csv('tags_check.csv')

st.dataframe(movies_tag.head())

del(links)
del(movies)

st.markdown('---')
```
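To make the groupby-and-merge step concrete, a toy demo with made-up rows (not the real MovieLens data):

```python
import pandas as pd

# Toy demo of the tag-grouping and left-merge steps above (made-up rows).
tags = pd.DataFrame({'movieId': [1, 1, 2], 'tag': ['funny', 'classic', 'dark']})
movies = pd.DataFrame({'movieId': [1, 2, 3], 'title': ['A', 'B', 'C']})

grouped_tags = tags.groupby('movieId')['tag'].agg(list).reset_index()
print(pd.merge(movies, grouped_tags, how='left', on='movieId'))
# movieId 1 -> [funny, classic]; movieId 2 -> [dark]; movieId 3 -> NaN (no tags)
```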
(base.py, continued)

```python
# ============================= Extracting Year and Title =============================

# st.text('Extracting the Year from the Title of the movies')

import re
def extract_year_and_clean_title(title):
    # Use regex to find all years in parentheses and extract the last one
    years = re.findall(r'\((\d{4})\)', title)

    if years:
        # The last year is the one we want
        year = int(years[-1])
        # Remove the last year (and parentheses) from the title
        cleaned_title = title.replace(f"({years[-1]})", "").strip()
        # Optionally, remove any extra parentheses and clean up
        cleaned_title = re.sub(r'\(.*\)', '', cleaned_title).strip()
        return year, cleaned_title
    return 0, title  # Return 0 as the year if no year is found


# Apply the function to the 'title' column to populate the 'year' and cleaned 'title' columns
movies_tag[['year', 'title']] = movies_tag['title'].apply(lambda x: pd.Series(extract_year_and_clean_title(x)))


# ============================= Removing the data after the parenthesis, trimming the title =============================

def bracket_title(title):
    sp_title = title.split(' (')
    return sp_title[0]

movies_tag['bracket_title'] = movies_tag['title'].apply(bracket_title)


def imp_title(title):
    sp_title = title.split(',')
    return len(sp_title)

movies_tag['imp_title'] = movies_tag['bracket_title'].apply(imp_title)


def rearrange_last_word(text):
    # Split by comma, preserving the commas
    parts = text.split(', ')

    # If there is more than one part, rearrange
    if len(parts) > 1:
        last_word = parts[-1]        # Get the last part after the last comma
        remaining_parts = parts[:-1]  # Get the rest of the parts
        # Move the last part to the front and join the rest with commas
        rearranged_text = f'{last_word} ' + ', '.join(remaining_parts)
    else:
        rearranged_text = text  # If there's only one part, return as is

    return rearranged_text


movies_tag['clean_title'] = movies_tag['bracket_title'].apply(rearrange_last_word)


# st.text(movies_tag['year'].unique())

movies_tag['year'] = movies_tag['year'].astype(int)

movies_tag = movies_tag[movies_tag['imp_title'].isin([1, 2])]

movies_tag['genres'] = movies_tag['genres'].str.replace('|', ',')
import ast
def to_list_conv(row):
    return row.split(',')


movies_tag['genres'] = movies_tag['genres'].apply(to_list_conv)

st.subheader('✅ After cleaning & merging the datasets')
st.markdown("""
- Extracting the Year from the Title of the movies
- Converting the Year column to integer data type
- Replacing non-integer values in the Year column with 0
- Converting the Genres column to a list after replacing the '|' separator
""")

st.dataframe(movies_tag[['movieId', 'clean_title', 'year', 'genres', 'tag']].head(100))

st.markdown("""- 📌 Number of movies with missing tags: {0}""".format(movies_tag['tag'].isna().sum()))


st.markdown('---')
```
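A worked example of the title-cleaning helpers above, run on two hypothetical MovieLens-style titles (self-contained copies of the relevant logic):

```python
import re

def extract_year_and_clean_title(title):
    # Same logic as above: the last '(YYYY)' is the year; any remaining
    # parenthesised alternative title is dropped.
    years = re.findall(r'\((\d{4})\)', title)
    if years:
        cleaned = title.replace(f"({years[-1]})", "").strip()
        return int(years[-1]), re.sub(r'\(.*\)', '', cleaned).strip()
    return 0, title

def rearrange_last_word(text):
    # 'Shawshank Redemption, The' -> 'The Shawshank Redemption'
    parts = text.split(', ')
    return f'{parts[-1]} ' + ', '.join(parts[:-1]) if len(parts) > 1 else text

for t in ["Shawshank Redemption, The (1994)",
          "City of Lost Children, The (Cité des enfants perdus, La) (1995)"]:
    year, cleaned = extract_year_and_clean_title(t)
    print(year, '->', rearrange_last_word(cleaned))
# 1994 -> The Shawshank Redemption
# 1995 -> The City of Lost Children
```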
(base.py, continued)

```python
# ============================================ Plot Dataset extraction =========================================

st.subheader("🎬 Extracting Movie Details from IMDb")
st.text("Extracting the Plot, Year & Genres of the movies from the 'imdbinfo' package with the help of the 'imdbId' column in the Links dataset")

movies_merge = data_dir / 'merger.csv'
movies_merge = pd.read_csv(movies_merge)
st.dataframe(movies_merge[['title', 'year', 'plot', 'genres']])

st.text(movies_merge.shape)

# ============================================ Plot Dataset Check =========================================

st.markdown('- **Checking for NULL values in the dataset (a NULL means no match was found during extraction)**')
col1, col2 = st.columns(2)
with col1:
    st.text('# Empty fields')
    st.dataframe(movies_merge[['title', 'year', 'plot', 'genres']].isna().sum())
with col2:
    st.text('After removing the rows with an empty title - no match found')
    movies_merge = movies_merge.dropna(subset=['title'])
    st.dataframe(movies_merge[['title', 'year', 'plot', 'genres']].isna().sum())


st.markdown("- **As in the Movies dataset, converting the Year column to integer and replacing NULLs with 0**")
# st.text(movies_merge['year'].unique())
movies_merge['year'] = movies_merge['year'].fillna(0)
movies_merge['year'] = movies_merge['year'].astype(int)

movies_merge['title'] = movies_merge['title'].str.strip()


# - Null IMDb Ids (no match found) : {movies_merge['imdbId'].isna().sum()}
st.markdown(f"""
- **Checking for duplicates in the extracted file, keeping the NULLs aside**
    - Duplicate Title : {movies_merge['title'].dropna().duplicated().sum()}
    - Duplicate Title + Year : {movies_merge.dropna(subset=['title', 'year']).duplicated(['title', 'year']).sum()}
    - Duplicate Title + Year + Plot : {movies_merge.dropna(subset=['title', 'year', 'plot']).duplicated(['title', 'year', 'plot']).sum()}
    - Duplicate Plot : {movies_merge['plot'].dropna().duplicated().sum()}
""")


st.markdown('- **Checking why there are duplicates present in the title column**')
st.dataframe(movies_merge[movies_merge[['title', 'year']].duplicated(keep=False)].sort_values('title').reset_index(drop=True))
st.text('Titles are duplicated either because of a difference in year or because of multiple descriptions for the plot field')

st.markdown("- **Removing the duplicates based on the Title and Year columns**")


movies_merge = movies_merge.drop_duplicates(subset=['title', 'year'], keep='first')

st.markdown(f"""
- **Checking for duplicates in the extracted file, keeping the NULLs aside**
    - Duplicate Title : {movies_merge['title'].dropna().duplicated().sum()}
    - Duplicate Title + Year : {movies_merge.dropna(subset=['title', 'year']).duplicated(['title', 'year']).sum()}
    - Duplicate Title + Year + Plot : {movies_merge.dropna(subset=['title', 'year', 'plot']).duplicated(['title', 'year', 'plot']).sum()}
    - Duplicate Plot : {movies_merge['plot'].dropna().duplicated().sum()}
""")

st.markdown('- **Checking why there are duplicates present in the plot column apart from NULL values**')
st.dataframe(movies_merge[movies_merge['plot'].notna() & movies_merge['plot'].duplicated(keep=False)].sort_values('plot').reset_index(drop=True))


# note: NaN plots compare equal in duplicated(), so rows with missing plots are dropped here too
movies_merge = movies_merge[~movies_merge['plot'].duplicated(keep=False)]
st.markdown("- **Removing the duplicates based on the Plot column, as we don't know which movie the plot is correct for**")
# movies_merge.drop_duplicates(subset='plot', keep='first', inplace=True)


st.markdown(f"""
- **Checking for duplicates in the extracted file, keeping the NULLs aside**
    - Duplicate Title : {movies_merge['title'].dropna().duplicated().sum()}
    - Duplicate Title + Year : {movies_merge.dropna(subset=['title', 'year']).duplicated(['title', 'year']).sum()}
    - Duplicate Title + Year + Plot : {movies_merge.dropna(subset=['title', 'year', 'plot']).duplicated(['title', 'year', 'plot']).sum()}
    - Duplicate Plot : {movies_merge['plot'].dropna().duplicated().sum()}
""")




st.markdown('---')
```
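The difference between `keep='first'` and `keep=False` above is easy to miss; a toy illustration with made-up rows:

```python
import pandas as pd

# Toy illustration of the two de-duplication strategies above (made-up rows).
df = pd.DataFrame({'title': ['X', 'X', 'Y', 'Z'],
                   'plot':  ['p1', 'p1', 'p1', 'p2']})

# keep='first' retains one representative per duplicated title.
print(df.drop_duplicates(subset=['title'], keep='first'))   # rows X, Y, Z

# ~duplicated(keep=False) drops *every* row whose plot repeats, which is why
# movies with ambiguous plots are removed entirely rather than arbitrarily kept.
print(df[~df['plot'].duplicated(keep=False)])               # only row Z
```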
(base.py, continued)

```python
# ========================================= Merging with the main dataset =================================

st.subheader('🧠 Merging the Dataset for the Plot and Genres columns')
st.text('Joined on the Title and Year columns')

# movies_tag.to_csv('chek_movies_tag.csv')
# movies_merge.to_csv('chek_movies_merge.csv')

movies_tag = pd.merge(movies_tag, movies_merge, how='left', left_on=['clean_title', 'year'], right_on=['title', 'year'])
movies_tag = movies_tag.reset_index(drop=True)

# movies_tag.to_csv('chek_movies_tag_after_join.csv')

# ========================================= Converting to meaningful tags =================================

st.subheader("Cleaning and Merging Movie Tags & Genres")
st.markdown("""
We are performing the following preprocessing steps on the dataset:

- **Standardizing genres and tags**
    - Convert all entries in the 'genres_x', 'genres_y', and 'tag' columns to lowercase.
    - Ensure only valid string items are kept and handle empty or missing lists.
""")

movies_tag['genres_x'] = movies_tag['genres_x'].apply(
    lambda x: [item.lower() for item in x if isinstance(item, str)] if isinstance(x, list) and x else []
)
movies_tag['tag'] = movies_tag['tag'].apply(
    lambda x: [item.lower() for item in x if isinstance(item, str)] if isinstance(x, list) and x else []
)

st.markdown("""
- **Converting string representations to lists for 'genres_y'**
    - Some entries might be stored as strings, so we safely convert them to Python lists.
    - Lowercase all valid genre entries.
""")
def to_list_conv(x):
    # Redefines the earlier helper: here the input is a stringified list,
    # so it is parsed safely with ast.literal_eval.
    if pd.isna(x):
        return []
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return []

movies_tag['genres_y'] = movies_tag['genres_y'].apply(to_list_conv)
movies_tag['genres_y'] = movies_tag['genres_y'].apply(
    lambda x: [item.lower() for item in x if isinstance(item, str)] if isinstance(x, list) and x else []
)

st.markdown("""
- **Combining all genres into a single column**
    - Merge 'genres_x' and 'genres_y' into a unified 'genres' column.
    - Remove duplicate genres per movie.
    - Drop the original 'genres_x' and 'genres_y' columns.
""")
movies_tag['genres'] = movies_tag.apply(lambda row: list(set(row['genres_x'] + row['genres_y'])), axis=1)
movies_tag['tag'] = movies_tag.apply(lambda row: list(set(row['tag'])), axis=1)
movies_tag = movies_tag.drop(columns=['genres_x', 'genres_y'])

st.subheader("Preprocessing Movie Plots")
st.markdown("""
- Clean the 'plot' text by:
    1. Lowercasing all words
    2. Removing punctuation
    3. Tokenizing into words
    4. Removing stopwords
    5. Lemmatizing words
""")
lemmatizer = WordNetLemmatizer()
def preprocess_sentence(sentence):
    if not isinstance(sentence, str):
        return []
    stop_words = set(stopwords.words('english'))
    sentence = re.sub(r'[^\w\s]', '', sentence)
    sentence = sentence.lower()
    tokens = word_tokenize(sentence)
    tokens = [word for word in tokens if word not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

movies_tag['plot'] = movies_tag['plot'].apply(preprocess_sentence)
```
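A quick standalone run of the same plot-cleaning pipeline on a made-up sentence (the NLTK resources downloaded earlier are required):

```python
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Toy run of the plot-cleaning steps above on a made-up sentence.
sentence = "Two imprisoned men bond over the years, finding solace."
sentence = re.sub(r'[^\w\s]', '', sentence).lower()   # strip punctuation, lowercase
stop_words = set(stopwords.words('english'))
tokens = [w for w in word_tokenize(sentence) if w not in stop_words]
print([WordNetLemmatizer().lemmatize(w) for w in tokens])
# e.g. ['two', 'imprisoned', 'man', 'bond', 'year', 'finding', 'solace']
```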
(base.py, continued)

```python
st.subheader("Cleaning Columns and Removing Duplicates")
st.markdown("""
- Drop unnecessary columns from the merged dataset.
- Rename 'clean_title' to 'title'.
- Reorder columns for clarity: 'movieId', 'title', 'year', 'tag', 'genres', 'plot'.
""")
movies_tag = movies_tag.drop(columns=['title_x', 'imdbId', 'title_y', 'bracket_title', 'imp_title'])
movies_tag.rename(columns={'clean_title': 'title'}, inplace=True)
# movies_tag.drop_duplicates(subset='title', keep='first', inplace=True)

# movies_tag = movies_tag.sort_values(by=['title', 'year'], ascending=[True, False])
# movies_tag = movies_tag.drop_duplicates(subset='title', keep='first')

movies_tag = movies_tag[['movieId', 'title', 'year', 'tag', 'genres', 'plot']]

st.markdown('---')

# ============================== Displaying final movies dataset =====================================
item_to_remove = '(no genres listed)'
movies_tag['genres'] = movies_tag['genres'].apply(lambda x: [item for item in x if item != item_to_remove])


st.subheader("🎬 Movies Dataset After Merging and Cleaning")
st.dataframe(movies_tag)

st.markdown(f"**Total unique movies after filtering:** {movies_tag['movieId'].nunique()}")


# ================================================ Removing duplicates =============================================================


def check_empty_lists(row):
    return isinstance(row['tag'], list) and len(row['tag']) == 0 and \
           isinstance(row['genres'], list) and len(row['genres']) == 0 and \
           isinstance(row['plot'], list) and len(row['plot']) == 0

# Step 1: Count rows where all three columns ('tag', 'genres', 'plot') are empty lists
empty_rows_count = movies_tag[movies_tag.apply(check_empty_lists, axis=1)].shape[0]
st.markdown(f"**Number of rows where 'tag', 'genres', and 'plot' are empty lists:** {empty_rows_count}")

# Step 2: Drop rows where all three columns are empty lists
movies_tag_cleaned = movies_tag[~movies_tag.apply(check_empty_lists, axis=1)]


#================ Extracting the ratings dataset =============================

movies_tag_cleaned = movies_tag_cleaned.drop_duplicates(subset=['title', 'year'], keep='first')

# ratings = ratings[ratings['movieId'].isin(movies_tag_cleaned['movieId'])]
# ratings = ratings.drop_duplicates()
# ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='last')

# ratings.to_csv(loc, index=False)

# Correct way to save without creating Unnamed columns

# del(ratings)

st.subheader('Dropping the rows with empty Tag, Genres and Plot')
# movies_tag_cleaned.to_csv('movies_final.csv', index=False)

st.markdown(f"**Total unique movies after filtering:** {movies_tag_cleaned['movieId'].nunique()}")

st.markdown('---')

st.subheader("🎬 Final Movies Dataset")
st.dataframe(movies_tag_cleaned)
```
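The empty-row filter used in base.py above can be condensed; a simplified sketch that relies on list truthiness instead of the explicit isinstance/len checks (this assumes all three cells actually hold lists, which the earlier cleaning guarantees):

```python
import pandas as pd

# Simplified toy version of check_empty_lists: keep a row if *any* of the
# three list columns has content (made-up rows).
df = pd.DataFrame({
    'tag':    [[], ['funny']],
    'genres': [[], ['comedy']],
    'plot':   [[], ['laugh']],
})
all_empty = df.apply(lambda r: not (r['tag'] or r['genres'] or r['plot']), axis=1)
print(df[~all_empty])  # keeps only the second row
```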
collaborative_item.py
ADDED

```python
import pandas as pd
import numpy as np
import streamlit as st
from pathlib import Path
import sys
import os
import time
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import normalize
import joblib

# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("🎬 Movie Recommender System Item-Based")

# ====================== Paths ======================
sys.path.append(os.path.dirname(__file__))
data_dir = Path(__file__).parent / 'data'
cache_dir = data_dir / 'cache'
cache_dir.mkdir(exist_ok=True)

movies_path = data_dir / 'movies.csv'
ratings_path = data_dir / 'ratings.csv'

# ====================== Load Data ======================
@st.cache_data
def load_data():
    movies = pd.read_csv(movies_path)
    ratings = st.session_state['ratings_df']
    ratings = ratings.drop(columns=['timestamp'], errors='ignore')
    ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
    ratings['userId'] = ratings['userId'].astype(int)
    ratings['movieId'] = ratings['movieId'].astype(int)
    # Keep only movies present in movies.csv
    ratings = ratings[ratings['movieId'].isin(movies['movieId'].unique().tolist())]
    return movies, ratings

movies, ratings = load_data()
st.text(f"Initial ratings shape: {ratings.shape}")

# ====================== Data Filtering ======================

user_counts = ratings['userId'].value_counts()
active_users = user_counts[user_counts >= 450].index
ratings_filtered = ratings[ratings['userId'].isin(active_users)]

movie_counts = ratings_filtered['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 450].index
ratings_filtered = ratings_filtered[ratings_filtered['movieId'].isin(popular_movies)]
ratings = ratings_filtered.copy()
del ratings_filtered

st.text(f"Filtered ratings shape: {ratings.shape}")

# =================== Summary =================================
st.markdown("#### 🎯 Dataset Filtering Summary")
st.markdown("""
We filtered the dataset in **two steps** to focus on active users and popular movies:

1. **Active Users**
   - Kept users who have rated **at least 450 movies**.
   - Ensures we focus on users who are actively engaged.

2. **Popular Movies**
   - Kept movies that have **at least 450 ratings**.
   - Ensures we focus on movies with enough feedback to be meaningful.

**Result:**
The final dataset contains ratings from active users on popular movies, improving the quality of recommendations and decreasing the size of the dataset.
""")
```
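The two-step activity filter can be seen end to end on a toy frame (threshold lowered from 450 to 2 for illustration):

```python
import pandas as pd

# Toy version of the two-step filter above, with the threshold lowered to 2.
r = pd.DataFrame({'userId':  [1, 1, 2, 3, 3, 3],
                  'movieId': [10, 11, 10, 10, 11, 12]})

user_counts = r['userId'].value_counts()
r = r[r['userId'].isin(user_counts[user_counts >= 2].index)]    # user 2 dropped

movie_counts = r['movieId'].value_counts()
print(r[r['movieId'].isin(movie_counts[movie_counts >= 2].index)])
# movie 12 dropped: rated only once among the remaining active users
```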
(collaborative_item.py, continued)

```python
# ====================== Index Mapping ======================
ratings['user_idx'] = ratings['userId'].astype('category').cat.codes
ratings['movie_idx'] = ratings['movieId'].astype('category').cat.codes

num_users = ratings['user_idx'].nunique()
num_movies = ratings['movie_idx'].nunique()

user_idx_to_id = dict(enumerate(ratings['userId'].astype('category').cat.categories))
movie_idx_to_id = dict(enumerate(ratings['movieId'].astype('category').cat.categories))

# ====================== Sparse Matrix ======================
user_item_matrix = coo_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
    shape=(num_users, num_movies)
).tocsr()
```
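A toy version of this matrix construction and of the latent-space similarity it feeds later in the file (3 users, 4 movies, made-up ratings):

```python
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.preprocessing import normalize

# Toy user-item matrix: 3 users x 4 movies, built exactly like the one above.
user_idx  = np.array([0, 0, 1, 1, 2, 2])
movie_idx = np.array([0, 1, 0, 2, 1, 3])
rating    = np.array([5.0, 3.0, 4.0, 2.0, 3.5, 4.5])
ui = coo_matrix((rating, (user_idx, movie_idx)), shape=(3, 4)).tocsr()

# With unit-normalised rows, a plain dot product is cosine similarity; the
# recommendation function below applies the same trick to the latent factors U.
rows = normalize(ui.toarray())
print(rows @ rows[0])  # similarity of every user to user 0
```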
(collaborative_item.py, continued)

```python
# ====================== Incremental SVD Caching ======================

SVD_CACHE_PATH = cache_dir / "svd_incremental_cache.joblib"

def compute_incremental_svd(matrix: csr_matrix, n_components=100, batch_size=1000):
    if SVD_CACHE_PATH.exists():
        # st.text("✅ Loading cached SVD decomposition...")
        U, Sigma, VT = joblib.load(SVD_CACHE_PATH)
        return U, Sigma, VT

    st.text("⚙️ Computing Incremental SVD... (may take a few minutes)")
    ipca = IncrementalPCA(n_components=n_components)
    n_samples = matrix.shape[0]

    # Fit in chunks
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        ipca.partial_fit(batch)
        st.text(f"Processed rows {start_idx} to {end_idx}")

    # Transform full data
    U = []
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        U.append(ipca.transform(batch))
    U = np.vstack(U)

    VT = ipca.components_
    Sigma = np.ones(ipca.n_components_)

    joblib.dump((U, Sigma, VT), SVD_CACHE_PATH)
    st.text("✅ SVD computation cached successfully.")
    return U, Sigma, VT

U, Sigma, VT = compute_incremental_svd(user_item_matrix, n_components=100, batch_size=100000)
U_normalized = normalize(U)

# ====================== Recommendation Function ======================
def recommend_for_user_svd(user_id, top_n=5, n_similar_users=30):
    start = time.time()
    if user_id not in ratings['userId'].values:
        return "User not found.", 0

    user_idx = ratings.loc[ratings['userId'] == user_id, 'user_idx'].iloc[0]

    # Cosine similarity in latent space
    user_vector = U_normalized[user_idx]
    similarities = np.dot(U_normalized, user_vector)

    # Top similar users
    similar_user_indices = similarities.argsort()[::-1][1:n_similar_users+1]
    similar_scores = similarities[similar_user_indices]

    # Movies already rated
    user_rated_movie_indices = user_item_matrix.getrow(user_idx).nonzero()[1]
    user_rated_movie_set = set(user_rated_movie_indices)

    # Aggregate weighted ratings
    scores = {}
    for sim_user_idx, sim_score in zip(similar_user_indices, similar_scores):
        sim_user_ratings = user_item_matrix.getrow(sim_user_idx)
        movie_indices = sim_user_ratings.nonzero()[1]
        sim_ratings = sim_user_ratings.data
        for m_idx, rating in zip(movie_indices, sim_ratings):
            if m_idx not in user_rated_movie_set:
                scores[m_idx] = scores.get(m_idx, 0) + sim_score * rating

    # Sort and recommend
    recommended_movie_indices = sorted(scores, key=scores.get, reverse=True)[:top_n]
    recommended_movie_ids = [movie_idx_to_id[idx] for idx in recommended_movie_indices]

    end = time.time()
    recommendations = movies[movies['movieId'].isin(recommended_movie_ids)][['title', 'genres']]
    return recommendations, end - start

# ====================== Streamlit UI ======================

user_id_input = st.number_input(
    "Enter User ID",
    min_value=int(ratings['user_idx'].min()),
    max_value=int(ratings['user_idx'].max()),
    step=1
)

user_id_input = ratings[ratings['user_idx'] == user_id_input]['userId'].unique()[0]


duration = 0
if st.button("🎥 Recommend Movies"):
    col1, col2 = st.columns(2)
    with col1:
        st.text('User Activity')
        current_user = ratings[ratings['userId'] == user_id_input]
        merge_results = pd.merge(current_user, movies, how='left', on='movieId')
        merge_results = merge_results[['title', 'rating']]
        st.dataframe(merge_results)
    with col2:
        st.text('Recommended Movies')
        recs, duration = recommend_for_user_svd(user_id_input, top_n=5)
        st.dataframe(recs['title'])
        st.text(f"Computed in {duration:.2f} seconds.")


# ====================== Explanation of CF ======================
# st.markdown("""
# ## 🤝 Collaborative Filtering Types

# **User-Based CF**:
# - Focus on users similar to you
# - Recommend movies liked by similar users

# **Item-Based CF**:
# - Focus on movies similar to ones you liked
# - Recommend similar movies

# Example:
# - You liked *Inception* and *Matrix*
# - User-Based: "People like you also liked *Avengers*"
# - Item-Based: "Since you liked *Inception*, try *Interstellar*"
# """)
```
collaborative_user.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
import time
|
| 8 |
+
from scipy.sparse import coo_matrix, csr_matrix
|
| 9 |
+
from sklearn.decomposition import TruncatedSVD, IncrementalPCA
|
| 10 |
+
from sklearn.preprocessing import normalize
|
| 11 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 12 |
+
import joblib
|
| 13 |
+
|
| 14 |
+
# ====================== Streamlit Setup ======================
|
| 15 |
+
st.set_page_config(layout="wide")
|
| 16 |
+
st.title("🎬 Movie Recommender System User-Based")
|
| 17 |
+
|
| 18 |
+
# ====================== Paths ======================
|
| 19 |
+
sys.path.append(os.path.dirname(__file__))
|
| 20 |
+
data_dir = Path(__file__).parent / 'data'
|
| 21 |
+
cache_dir = data_dir / 'cache'
|
| 22 |
+
cache_dir.mkdir(exist_ok=True)
|
| 23 |
+
|
| 24 |
+
movies_path = 'movies_final.csv'
|
| 25 |
+
ratings_path = data_dir / 'ratings.csv'
|
| 26 |
+
|
| 27 |
+
# ====================== Load Data ======================
|
| 28 |
+
|
| 29 |
+
# used to **cache the output of a function that loads or processes data** so that it doesn’t get recomputed every time the app reruns.
|
| 30 |
+
# This is especially useful when loading large datasets or performing expensive computations.Since our data is huge
|
| 31 |
+
|
| 32 |
+
@st.cache_data
|
| 33 |
+
def load_data():
|
| 34 |
+
movies = pd.read_csv(movies_path)
|
| 35 |
+
ratings = st.session_state['ratings_df']
|
| 36 |
+
# ratings = ratings.drop(columns=['timestamp'], errors='ignore')
|
| 37 |
+
ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')
|
| 38 |
+
ratings['userId'] = ratings['userId'].astype(int)
|
| 39 |
+
ratings['movieId'] = ratings['movieId'].astype(int)
|
| 40 |
+
return movies, ratings
|
| 41 |
+
|
| 42 |
+
movies, ratings = load_data()
|
| 43 |
+
|
| 44 |
+
ratings = ratings[ratings['movieId'].isin(movies['movieId'].unique().tolist())]
|
| 45 |
+
|
| 46 |
+
st.text(f"Initial ratings shape: {ratings.shape}")
|
| 47 |
+
|
| 48 |
+
# ====================== Data Filtering ======================
|
| 49 |
+
user_counts = ratings['userId'].value_counts()
|
| 50 |
+
active_users = user_counts[user_counts >= 450].index
|
| 51 |
+
ratings_filtered = ratings[ratings['userId'].isin(active_users)]
|
| 52 |
+
|
| 53 |
+
movie_counts = ratings_filtered['movieId'].value_counts()
|
| 54 |
+
popular_movies = movie_counts[movie_counts >= 450].index
|
| 55 |
+
ratings_filtered = ratings_filtered[ratings_filtered['movieId'].isin(popular_movies)]
|
| 56 |
+
ratings = ratings_filtered.copy()
|
| 57 |
+
del ratings_filtered
|
| 58 |
+
|
| 59 |
+
st.text(f"Filtered ratings shape: {ratings.shape}")
|
| 60 |
+
|
| 61 |
+
st.markdown("#### 🎯 Dataset Filtering Summary")
|
| 62 |
+
st.markdown("""
|
| 63 |
+
We filtered the dataset in **two steps** to focus on active users and popular movies:
|
| 64 |
+
|
| 65 |
+
1. **Active Users**
|
| 66 |
+
- Kept users who have rated **at least 450 movies**.
|
| 67 |
+
- Ensures we focus on users who are actively engaged.
|
| 68 |
+
|
| 69 |
+
2. **Popular Movies**
|
| 70 |
+
- Kept movies that have **at least 450 ratings**.
|
| 71 |
+
- Ensures we focus on movies with enough feedback to be meaningful.
|
| 72 |
+
|
| 73 |
+
**Result:**
|
| 74 |
+
The final dataset contains ratings from active users on popular movies, improving the quality of recommendations and decreasing the size of the dataset.
|
| 75 |
+
""")

formatted_num = "{:,}".format(ratings.shape[0])
print(formatted_num)

# Optional: Show numbers dynamically
st.markdown(f"**📊 Ratings after filtering:** {formatted_num} ratings from {ratings['userId'].nunique()} users on {ratings['movieId'].nunique()} movies.")


# ====================== Index Mapping ======================
# For each unique userId, assign a unique integer code starting from 0.
ratings['user_idx'] = ratings['userId'].astype('category').cat.codes
ratings['movie_idx'] = ratings['movieId'].astype('category').cat.codes

num_users = ratings['user_idx'].nunique()
num_movies = ratings['movie_idx'].nunique()

# Map indices back to original IDs, e.g. {0: 100, 1: 101, 2: 105}
user_idx_to_id = dict(enumerate(ratings['userId'].astype('category').cat.categories))
movie_idx_to_id = dict(enumerate(ratings['movieId'].astype('category').cat.categories))

# ====================== Sparse Matrix ======================

# Rows → users
# Columns → movies
# Values → ratings
# Most entries are 0 (or missing) because not all users rate all movies.

user_item_matrix = coo_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
    shape=(num_users, num_movies)
).tocsr()

# CSR format allows fast operations for matrix factorization or nearest-neighbor search.
# The full matrix would have 50 million cells, but maybe only 1 million ratings exist.
# A dense matrix stores all 50 million entries → huge memory usage; a sparse matrix
# stores only the non-zero ratings → huge memory savings.
# There are different sparse formats, e.g., COO, CSR, CSC.
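
# Illustrative sketch of the memory argument above, with toy numbers (assumed,
# not this app's data): COO stores only (row, col, value) triplets for the
# observed ratings, and .tocsr() converts to CSR for fast row slicing.
_demo = coo_matrix(
    (np.array([5.0, 3.5]),                       # two observed ratings
     (np.array([0, 1]), np.array([2, 0]))),      # their (user_idx, movie_idx) positions
    shape=(10_000, 5_000)                        # 50 million cells in total
).tocsr()
assert _demo.nnz == 2                            # only 2 values are actually stored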

# ====================== Incremental SVD Caching ======================
SVD_CACHE_PATH = cache_dir / "svd_incremental_cache.joblib"

def compute_incremental_svd(matrix: csr_matrix, n_components=100, batch_size=1000):
    """
    Memory-efficient incremental SVD using IncrementalPCA.
    Works even when the matrix cannot fit entirely in memory.
    """
    if SVD_CACHE_PATH.exists():
        # st.text("✅ Loading cached SVD decomposition...")
        U, Sigma, VT = joblib.load(SVD_CACHE_PATH)
        return U, Sigma, VT

    st.text("⚙️ Computing Incremental SVD... (this may take several minutes)")
    ipca = IncrementalPCA(n_components=n_components)
    n_samples = matrix.shape[0]

    # Fit in chunks
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        ipca.partial_fit(batch)
        st.text(f"Processed rows {start_idx} to {end_idx}")

    # Transform full data
    U = []
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = matrix[start_idx:end_idx].toarray()
        U.append(ipca.transform(batch))
    U = np.vstack(U)

    VT = ipca.components_
    # Placeholder: downstream code only uses U, so the singular values are set to ones.
    Sigma = np.ones(ipca.n_components_)

    joblib.dump((U, Sigma, VT), SVD_CACHE_PATH)
    st.text("✅ SVD computation cached successfully.")
    return U, Sigma, VT

U, Sigma, VT = compute_incremental_svd(user_item_matrix, n_components=100, batch_size=100000)
U_normalized = normalize(U)

# ====================== Recommendation Function ======================
def recommend_for_user_svd(user_id, top_n=5, n_similar_users=30):
    start = time.time()
    if user_id not in ratings['userId'].values:
        return "User not found.", 0

    user_idx = ratings.loc[ratings['userId'] == user_id, 'user_idx'].iloc[0]

    # Cosine similarity in latent space
    user_vector = U_normalized[user_idx]
    similarities = np.dot(U_normalized, user_vector)

    # Select top similar users
    similar_user_indices = similarities.argsort()[::-1][1:n_similar_users+1]
    similar_scores = similarities[similar_user_indices]

    user_rated_movie_indices = user_item_matrix.getrow(user_idx).nonzero()[1]
    user_rated_movie_set = set(user_rated_movie_indices)

    # Aggregate weighted ratings
    scores = {}
    for sim_user_idx, sim_score in zip(similar_user_indices, similar_scores):
        sim_user_ratings = user_item_matrix.getrow(sim_user_idx)
        movie_indices = sim_user_ratings.nonzero()[1]
        sim_ratings = sim_user_ratings.data
        for m_idx, rating in zip(movie_indices, sim_ratings):
            if m_idx not in user_rated_movie_set:
                scores[m_idx] = scores.get(m_idx, 0) + sim_score * rating

    # Sort and recommend
    recommended_movie_indices = sorted(scores, key=scores.get, reverse=True)[:top_n]
    recommended_movie_ids = [movie_idx_to_id[idx] for idx in recommended_movie_indices]

    end = time.time()
    recommendations = movies[movies['movieId'].isin(recommended_movie_ids)][['title']]
    return recommendations, end - start


# 1. Works in latent space → captures hidden user preferences.
# 2. Uses top similar users → collaborative filtering logic.
# 3. Avoids recommending already rated movies.
# 4. Weighted aggregation ensures that more similar users influence recommendations more.
# 5. Returns top N movies along with computation time.

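# Illustrative usage sketch of the function above (the user id 42 is
# hypothetical; any value present in ratings['userId'] behaves the same):
#
#     recs, seconds = recommend_for_user_svd(user_id=42, top_n=5, n_similar_users=30)
#
# `recs` is a DataFrame of recommended titles, `seconds` the wall-clock time taken.
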
# ====================== Streamlit UI ===========================================

# st.dataframe(ratings.head(1000))
# st.text('unique')
# st.text(ratings['user_idx'].nunique())
# st.text(ratings['userId'].nunique())

# The widget collects a 0-based user index, which is mapped back to the
# original userId before recommending.
user_idx_input = st.number_input(
    "Enter User Index",
    min_value=int(ratings['user_idx'].min()),
    max_value=int(ratings['user_idx'].max()),
    step=1
)

user_id_input = ratings[ratings['user_idx'] == user_idx_input]['userId'].unique()[0]

# st.dataframe(ratings)

# st.text(user_id_input)


if st.button("🎥 Recommend Movies"):

    col1, col2 = st.columns(2)
    with col1:
        st.text('User Activity')
        current_user = ratings[ratings['userId'] == user_id_input]
        merge_results = pd.merge(current_user, movies, how='left', on='movieId')
        merge_results = merge_results[['title', 'rating']]
        st.dataframe(merge_results)
    with col2:
        st.text('Recommended Movies')
        recs, duration = recommend_for_user_svd(user_id_input, top_n=5)
        st.dataframe(recs)
        st.text(f"Computed in {duration:.2f} seconds.")
content_based.py
ADDED
@@ -0,0 +1,157 @@
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import streamlit as st
from gensim.models import Word2Vec

# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("Content Based Filtering")
st.markdown('---')

# ======================== Load Data ==========================
movies = pd.read_csv('movies_final.csv')
movies = movies.drop(columns=['movieId'])

# ======================== Helper Function ====================
def combine_tokens(row, cols):
    combined = []
    for col in cols:
        val = row[col]
        if isinstance(val, list):
            combined.extend(val)
        elif isinstance(val, str):
            combined.append(val)
    return list(set(combined))

# ======================== BOW ================================
def recommend_bow(title, year, top_n=5):
    if title not in movies['title'].values:
        return "Movie not found."

    # Get index of the queried movie
    idx = movies[(movies['title'] == title) & (movies['year'] == year)].index[0]

    # Compute cosine similarity scores
    query_vector = bow_matrix[idx]
    sim_scores = cosine_similarity(query_vector, bow_matrix).flatten()

    # Get indices of similar movies (excluding the query itself)
    similar_indices = sim_scores.argsort()[::-1][1:top_n + 1]

    # Create a DataFrame for top recommendations with similarity scores
    recommendations = pd.DataFrame({
        'title': movies.iloc[similar_indices]['title'].values,
        'year': movies.iloc[similar_indices]['year'].values,
        'similarity_score': sim_scores[similar_indices]
    })

    # Sort for clarity (highest similarity first)
    recommendations = recommendations.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)

    return recommendations


# ======================== TF-IDF ==============================
def recommend_tfidf(title, year, top_n=5):
    if title not in movies['title'].values:
        return "Movie not found."

    idx = movies[(movies['title'] == title) & (movies['year'] == year)].index[0]
    query_vector = tfidf_matrix[idx]
    sim_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get the indices of similar movies (excluding the movie itself)
    similar_indices = sim_scores.argsort()[::-1][1:]

    # Ensure we don't return more than `top_n` recommendations
    similar_indices = similar_indices[:top_n]

    # Get the movie titles
    top_movies = movies.iloc[similar_indices]['title'].tolist()

    return top_movies

# ======================== WORD2VEC ============================
def build_word2vec_vectors():
    tokenized_corpus = [text.split() for text in movies['tokens']]

    # Train Word2Vec model
    w2v_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

    # Function to compute the average word vector for each movie
    def get_avg_vector(tokens):
        vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
        if len(vectors) == 0:
            return np.zeros(w2v_model.vector_size)
        return np.mean(vectors, axis=0)

    # Compute average vector for each movie
    movies['w2v_vector'] = [get_avg_vector(text.split()) for text in movies['tokens']]
    w2v_matrix = np.vstack(movies['w2v_vector'].values)
    return w2v_model, w2v_matrix

def recommend_w2v(title, year, w2v_matrix, top_n=5):
    if title not in movies['title'].values:
        return "Movie not found."

    idx = movies[(movies['title'] == title) & (movies['year'] == year)].index[0]
    query_vec = w2v_matrix[idx].reshape(1, -1)
    sims = cosine_similarity(query_vec, w2v_matrix).flatten()

    # Get the indices of similar movies (excluding the movie itself)
    similar_indices = sims.argsort()[::-1][1:]

    # Ensure we don't return more than `top_n` recommendations
    similar_indices = similar_indices[:top_n]

    # Get the movie titles
    top_movies = movies.iloc[similar_indices]['title'].tolist()

    return top_movies

# ======================== UI ========================
st.header('🎬 Choose an Algorithm and Movie for Content-Based Filtering')

select_algo = st.selectbox('Select the algorithm', ['Bag of Words', 'TF-IDF', 'Word2Vec'])
selected_movie = st.selectbox('Select a movie', movies['title'].unique())
selected_year = st.selectbox('Select the year', movies[movies['title'] == selected_movie]['year'])

all_columns = movies.columns.tolist()
selected_cols = st.multiselect("Select columns to combine for tokens", all_columns, default=['genres', 'tag', 'plot'])

# Combine selected columns into tokens
movies['tokens'] = movies.apply(lambda row: combine_tokens(row, selected_cols), axis=1)
movies['tokens'] = movies['tokens'].apply(lambda x: ' '.join(x))

if st.button('Recommend'):
    # Display the selected movie and its columns
    st.dataframe(movies[(movies['title'] == selected_movie) & (movies['year'] == selected_year)][['title'] + selected_cols])

    # Recommendation based on the selected algorithm
    if select_algo == 'Bag of Words':
        with st.spinner("Building Bag of Words model..."):
            vectorizer = CountVectorizer()
            bow_matrix = vectorizer.fit_transform(movies['tokens'])
            output = recommend_bow(selected_movie, selected_year, top_n=5)
            st.success("Bag of Words model ready ✅")

        output_display = pd.merge(output, movies[['title', 'year'] + selected_cols], on=['title', 'year'], how='left').reset_index(drop=True)
        st.subheader("🎯 Top Recommendations (Bag of Words)")
        st.dataframe(output_display[['title', 'year', 'similarity_score'] + selected_cols])

    elif select_algo == 'TF-IDF':
        with st.spinner("Building TF-IDF model..."):
            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(movies['tokens'])
            output = recommend_tfidf(selected_movie, selected_year, top_n=5)
            st.success("TF-IDF model ready ✅")
        st.dataframe(movies[movies['title'].isin(output)][['title'] + selected_cols].reset_index(drop=True))

    elif select_algo == 'Word2Vec':
        with st.spinner("Training Word2Vec model..."):
            w2v_model, w2v_matrix = build_word2vec_vectors()
            output = recommend_w2v(selected_movie, selected_year, w2v_matrix, top_n=5)
            st.success("Word2Vec model ready ✅")
        st.dataframe(movies[movies['title'].isin(output)][['title'] + selected_cols].reset_index(drop=True))
data/cache/svd_incremental_cache.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:58176c8e3545d4ade2701a7fdeca9d1495b864fa2a77bbc5db4272dfacb01eda
size 13812355
data/grouped_tags.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e9269b5d520d4063ae57c3027ce0402dc755927ac11db46ea835c2321f6f071e
size 31065818
data/links.csv
ADDED
The diff for this file is too large to render.
data/merger.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3d566674daf655758a3c7d01e1ec31ff6a63064d8f1b6fa67f693e2badd8347e
size 21817265
data/movies.csv
ADDED
The diff for this file is too large to render.
data/ratings.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e0ab9d46f7ba3cdb7a00602f608491bcae6d479f1c5649d291d07eccb0289870
size 529039466
data/tags.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4e4b043d9088ce7bdad9f4ba06901330f3c4adec6e448da83d9a5d32fc756d55
size 25847889
item_based.py
ADDED
@@ -0,0 +1,49 @@
from sentence_transformers import SentenceTransformer
import sys
import os
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import time


# sys.path.append(os.path.dirname(__file__))
# data_dir = Path(__file__).parent.parent / 'data'

# # links dataset
# movies = data_dir / 'movies_final.csv'


movies = pd.read_csv('movies_final.csv')
st.text(movies.shape)
st.dataframe(movies.head())

start = time.time()

model = SentenceTransformer('all-MiniLM-L6-v2')
movies['bert_vector'] = model.encode(movies['tokens'].tolist(), show_progress_bar=True).tolist()

def recommend_bert(title, top_n=5):
    if title not in movies['title'].values:
        return "Movie not found."

    idx = movies[movies['title'] == title].index[0]
    query_vector = np.array(movies.loc[idx, 'bert_vector']).reshape(1, -1)
    all_vectors = np.vstack(movies['bert_vector'].values)

    similarities = cosine_similarity(query_vector, all_vectors)[0]
    sim_scores = list(enumerate(similarities))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Skip the first result (it’s the movie itself)
    recommended = [movies.iloc[i]['title'] for i, score in sim_scores[1:top_n+1]]
    return recommended

out = recommend_bert('Schindler\'s List')
st.text(out)

end = time.time()

st.text(end - start)
main.py
ADDED
@@ -0,0 +1,16 @@
import streamlit as st

# Define pages (by file or function)
page1 = st.Page("about_me.py", title="About Me", icon="👤")
page2 = st.Page("base.py", title="Home Page", icon="🏠")
page3 = st.Page("popularity.py", title="Popularity Based Filtering", icon="🔥")
page4 = st.Page("content_based.py", title="Content Based Filtering", icon="🎬")
page5 = st.Page("collaborative_user.py", title="Collaborative User based", icon="👥")
page6 = st.Page("collaborative_item.py", title="Collaborative Item based", icon="🛍️")


# Create navigation
pg = st.navigation([page1, page2, page3, page4, page5, page6])

# Run navigation
pg.run()
movies_final.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:77a03ffbfcc887eef7e34b750ec446233b09b2e70471eae138bbb094c0c09cda
size 24885151
nltk_data/corpora/stopwords.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:174365e185ab7b343a4ca41ac9d1b44f03c928a226eb720afb027882865e07c0
size 823
nltk_data/corpora/stopwords/english
ADDED
@@ -0,0 +1,198 @@
a
about
above
after
again
against
ain
all
am
an
and
any
are
aren
aren't
as
at
be
because
been
before
being
below
between
both
but
by
can
couldn
couldn't
d
did
didn
didn't
do
does
doesn
doesn't
doing
don
don't
down
during
each
few
for
from
further
had
hadn
hadn't
has
hasn
hasn't
have
haven
haven't
having
he
he'd
he'll
her
here
hers
herself
he's
him
himself
his
how
i
i'd
if
i'll
i'm
in
into
is
isn
isn't
it
it'd
it'll
it's
its
itself
i've
just
ll
m
ma
me
mightn
mightn't
more
most
mustn
mustn't
my
myself
needn
needn't
no
nor
not
now
o
of
off
on
once
only
or
other
our
ours
ourselves
out
over
own
re
s
same
shan
shan't
she
she'd
she'll
she's
should
shouldn
shouldn't
should've
so
some
such
t
than
that
that'll
the
their
theirs
them
themselves
then
there
these
they
they'd
they'll
they're
they've
this
those
through
to
too
under
until
up
ve
very
was
wasn
wasn't
we
we'd
we'll
we're
were
weren
weren't
we've
what
when
where
which
while
who
whom
why
will
with
won
won't
wouldn
wouldn't
y
you
you'd
you'll
your
you're
yours
yourself
yourselves
you've
nltk_data/corpora/wordnet.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cbda5ea6eef7f36a97a43d4a75f85e07fccbb4f23657d27b4ccbc93e2646ab59
size 10775600
nltk_data/tokenizers/punkt.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec
size 13905355
nltk_data/tokenizers/punkt/.DS_Store
ADDED
Binary file (6.15 kB).
nltk_data/tokenizers/punkt/PY3/README
ADDED
@@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)

Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.

For information about how to use these models, please confer the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation

There are pretrained tokenizers for the following languages:

File                Language     Source                              Contents                       Size of training corpus (in tokens)   Model contributed by
=======================================================================================================================================================================
czech.pickle        Czech        Multilingual Corpus 1 (ECI)         Lidove Noviny                  ~345,000                              Jan Strunk / Tibor Kiss
                                                                     Literarni Noviny
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
danish.pickle       Danish       Avisdata CD-Rom Ver. 1.1. 1995      Berlingske Tidende             ~550,000                              Jan Strunk / Tibor Kiss
                                 (Berlingske Avisdata, Copenhagen)   Weekend Avisen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle        Dutch        Multilingual Corpus 1 (ECI)         De Limburger                   ~340,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
english.pickle      English      Penn Treebank (LDC)                 Wall Street Journal            ~469,000                              Jan Strunk / Tibor Kiss
                    (American)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle     Estonian     University of Tartu, Estonia        Eesti Ekspress                 ~359,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle      Finnish      Finnish Parole Corpus, Finnish      Books and major national       ~364,000                              Jan Strunk / Tibor Kiss
                                 Text Bank (Suomen Kielen            newspapers
                                 Tekstipankki)
                                 Finnish Center for IT Science
                                 (CSC)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
french.pickle       French       Multilingual Corpus 1 (ECI)         Le Monde                       ~370,000                              Jan Strunk / Tibor Kiss
                    (European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
german.pickle       German       Neue Zürcher Zeitung AG             Neue Zürcher Zeitung           ~847,000                              Jan Strunk / Tibor Kiss
                    (Switzerland) CD-ROM
                    (Uses "ss"
                    instead of "ß")
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
greek.pickle        Greek        Efstathios Stamatatos               To Vima (TO BHMA)              ~227,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
italian.pickle      Italian      Multilingual Corpus 1 (ECI)         La Stampa, Il Mattino          ~312,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle    Norwegian    Centre for Humanities               Bergens Tidende                ~479,000                              Jan Strunk / Tibor Kiss
                    (Bokmål and  Information Technologies,
                    Nynorsk)     Bergen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
polish.pickle       Polish       Polish National Corpus              Literature, newspapers, etc.   ~1,000,000                            Krzysztof Langner
                                 (http://www.nkjp.pl/)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle   Portuguese   CETENFolha Corpus                   Folha de São Paulo             ~321,000                              Jan Strunk / Tibor Kiss
                    (Brazilian)  (Linguateca)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle      Slovene      TRACTOR                             Delo                           ~354,000                              Jan Strunk / Tibor Kiss
                                 Slovene Academy for Arts
                                 and Sciences
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle      Spanish      Multilingual Corpus 1 (ECI)         Sur                            ~353,000                              Jan Strunk / Tibor Kiss
                    (European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle      Swedish      Multilingual Corpus 1 (ECI)         Dagens Nyheter                 ~339,000                              Jan Strunk / Tibor Kiss
                                 (and some other texts)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle      Turkish      METU Turkish Corpus                 Milliyet                       ~333,000                              Jan Strunk / Tibor Kiss
                                 (Türkçe Derlem Projesi)
                                 University of Ankara
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------

The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.

Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.

---- Training Code ----

# import punkt
import nltk.tokenize.punkt

# Make a new Tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()

# Train tokenizer
tokenizer.train(text)

# Dump pickled tokenizer
import pickle
out = open("slovene.pickle","wb")
pickle.dump(tokenizer, out)
out.close()

---------
nltk_data/tokenizers/punkt/PY3/english.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
size 406697
nltk_data/tokenizers/punkt/README
ADDED
@@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)

Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.

For information about how to use these models, please confer the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation

There are pretrained tokenizers for the following languages:

File                Language     Source                              Contents                       Size of training corpus (in tokens)   Model contributed by
=======================================================================================================================================================================
czech.pickle        Czech        Multilingual Corpus 1 (ECI)         Lidove Noviny                  ~345,000                              Jan Strunk / Tibor Kiss
                                                                     Literarni Noviny
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
danish.pickle       Danish       Avisdata CD-Rom Ver. 1.1. 1995      Berlingske Tidende             ~550,000                              Jan Strunk / Tibor Kiss
                                 (Berlingske Avisdata, Copenhagen)   Weekend Avisen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle        Dutch        Multilingual Corpus 1 (ECI)         De Limburger                   ~340,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
english.pickle      English      Penn Treebank (LDC)                 Wall Street Journal            ~469,000                              Jan Strunk / Tibor Kiss
                    (American)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle     Estonian     University of Tartu, Estonia        Eesti Ekspress                 ~359,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle      Finnish      Finnish Parole Corpus, Finnish      Books and major national       ~364,000                              Jan Strunk / Tibor Kiss
                                 Text Bank (Suomen Kielen            newspapers
                                 Tekstipankki)
                                 Finnish Center for IT Science
                                 (CSC)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
french.pickle       French       Multilingual Corpus 1 (ECI)         Le Monde                       ~370,000                              Jan Strunk / Tibor Kiss
                    (European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
german.pickle       German       Neue Zürcher Zeitung AG             Neue Zürcher Zeitung           ~847,000                              Jan Strunk / Tibor Kiss
                    (Switzerland) CD-ROM
                    (Uses "ss"
                    instead of "ß")
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
greek.pickle        Greek        Efstathios Stamatatos               To Vima (TO BHMA)              ~227,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
italian.pickle      Italian      Multilingual Corpus 1 (ECI)         La Stampa, Il Mattino          ~312,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle    Norwegian    Centre for Humanities               Bergens Tidende                ~479,000                              Jan Strunk / Tibor Kiss
                    (Bokmål and  Information Technologies,
                    Nynorsk)     Bergen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
polish.pickle       Polish       Polish National Corpus              Literature, newspapers, etc.   ~1,000,000                            Krzysztof Langner
                                 (http://www.nkjp.pl/)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle   Portuguese   CETENFolha Corpus                   Folha de São Paulo             ~321,000                              Jan Strunk / Tibor Kiss
                    (Brazilian)  (Linguateca)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle      Slovene      TRACTOR                             Delo                           ~354,000                              Jan Strunk / Tibor Kiss
                                 Slovene Academy for Arts
                                 and Sciences
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle      Spanish      Multilingual Corpus 1 (ECI)         Sur                            ~353,000                              Jan Strunk / Tibor Kiss
                    (European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle      Swedish      Multilingual Corpus 1 (ECI)         Dagens Nyheter                 ~339,000                              Jan Strunk / Tibor Kiss
                                 (and some other texts)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle      Turkish      METU Turkish Corpus                 Milliyet                       ~333,000                              Jan Strunk / Tibor Kiss
                                 (Türkçe Derlem Projesi)
                                 University of Ankara
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------

The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.

Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.

---- Training Code ----

# import punkt
import nltk.tokenize.punkt

# Make a new Tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()

# Train tokenizer
tokenizer.train(text)

# Dump pickled tokenizer
import pickle
out = open("slovene.pickle","wb")
pickle.dump(tokenizer, out)
out.close()

---------
nltk_data/tokenizers/punkt/english.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dda37972ae88998a6fd3e3ec002697a6bd362b32d050fda7d7ca5276873092aa
size 433305
popularity.py
ADDED
@@ -0,0 +1,129 @@
import pandas as pd
import numpy as np
import streamlit as st
from pathlib import Path
import sys
import os
import time
import matplotlib.pyplot as plt


# ====================== Streamlit Setup ======================
st.set_page_config(layout="wide")
st.title("Popularity Based Filtering")
st.markdown('---')

# ====================== Data Extraction ======================

sys.path.append(os.path.dirname(__file__))
data_dir = Path(__file__).parent / 'data'

movies = pd.read_csv('movies_final.csv')
movies = movies[['movieId', 'title']]

# ratings dataset
# ratings = data_dir / 'ratings.csv'  # unused: ratings are loaded into session state by the home page
ratings = st.session_state['ratings_df']
# ratings = ratings.drop(columns=['timestamp'])
ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='first')


st.subheader('👀 Glance of the dataset')
col1, col2 = st.columns(2)
with col1:
    st.markdown("Ratings Dataset")
    st.dataframe(ratings.head())
with col2:
    st.markdown("Movies Dataset")
    st.dataframe(movies.head())
st.markdown('---')

# ============================= Most Rated Movies ====================================================
col1, col2 = st.columns(2)

most_rated = ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
movie_ids = most_rated_final['movieId'].tolist()

avg_ratings = ratings[ratings['movieId'].isin(movie_ids)]
avg_ratings = avg_ratings.groupby('movieId')['rating'].mean().reset_index().rename(columns={'rating': 'Avg_Rating'})
most_rated_final = pd.merge(most_rated_final, avg_ratings, how='left', on='movieId')

with col1:
    st.subheader('📈 Top 5 Most Rated Movies')
    st.dataframe(most_rated_final[['title', 'No_of_Ratings', 'Avg_Rating']])

# ============================= Least Rated Movies ====================================================
least_rated = ratings.groupby('movieId').agg({'rating': 'mean', 'userId': 'count'}).rename(
    columns={'rating': 'Avg_Rating', 'userId': 'No_of_Ratings'}
)
least_rated = least_rated.sort_values(['No_of_Ratings', 'Avg_Rating']).head().reset_index()
least_rated_final = pd.merge(least_rated, movies, how='left', on='movieId')

with col2:
    st.subheader('📉 Top 5 Least Rated Movies')
    st.dataframe(least_rated_final[['title', 'No_of_Ratings', 'Avg_Rating']])

# ============================= Most 5⭐ and 0.5⭐ Rated Movies ==========================================
st.markdown('---')

col1, col2 = st.columns(2)

filter_ratings = ratings[ratings['rating'] == 5]
most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')

filter_ratings = ratings[ratings['rating'] == 0.5]
most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
least_most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')


with col1:
    st.subheader('Most Rated Movies with 5⭐')
    st.dataframe(most_rated_final[['title', 'No_of_Ratings']])
    st.subheader('Most Rated Movies with 0.5⭐')
    st.dataframe(least_most_rated_final[['title', 'No_of_Ratings']])

# ============================= Ratings Distribution Graph ============================================

with col2:
    st.subheader('📊 Ratings Distribution')
    graph = ratings.groupby('rating').count()['userId']
    fig, ax = plt.subplots()
    ax.bar(graph.index, graph.values)
    ax.set_xlabel('Rating')
    ax.set_ylabel('Number of Ratings')
    ax.set_title('Number of Ratings by Rating Value')
    st.pyplot(fig)

st.markdown('---')

# ============================= User Selected Ratings ===================================================
st.subheader('🎯 Movies Filtered by Selected Rating')
ratings_selected = st.selectbox(
    'Select the Rating',
    pd.Series(ratings['rating'].unique()).sort_values(ascending=False).tolist()
)
filter_ratings = ratings[ratings['rating'] == ratings_selected]

col1, col2 = st.columns(2)

with col1:
    st.markdown('**Most Rated for Selected Rating**')
    most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values(ascending=False).head()
    most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
    most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
    st.dataframe(most_rated_final[['title', 'No_of_Ratings']])
with col2:
    st.markdown('**Least Rated for Selected Rating**')
    most_rated = filter_ratings.groupby('movieId').count()['userId'].sort_values().head()
    most_rated = most_rated.reset_index().rename(columns={'userId': 'No_of_Ratings'})
    most_rated_final = pd.merge(most_rated, movies, how='left', on='movieId')
    st.dataframe(most_rated_final[['title', 'No_of_Ratings']])
requirements.txt
ADDED
@@ -0,0 +1,10 @@
joblib==1.4.2
matplotlib==3.10.7
nltk==3.9.2
pandas==2.3.3
scikit_learn==1.7.2
sentence_transformers==5.1.1
streamlit==1.50.0
gensim==4.3.3
numpy==1.23.5
scipy==1.13.0