Viper51 committed on
Commit
2d5b093
·
verified ·
1 Parent(s): f0c96fb

Initial commit

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. app.py +95 -0
  3. requirements.txt +5 -0
  4. tmdb_5000_credits.csv +3 -0
  5. tmdb_5000_movies.csv +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tmdb_5000_credits.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import ast
4
+ from sklearn.feature_extraction.text import CountVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from nltk.stem.porter import PorterStemmer
7
+ import gradio as gr
8
+ import nltk
9
+ nltk.download('punkt')
10
+
11
# Load data
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# NOTE(review): joining on 'title' pairs each movie with every credits row that
# shares its title, so repeated titles produce extra rows -- confirm whether an
# ID-based join is wanted instead.
movies = movies.merge(credits, on='title')
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop incomplete rows WITHOUT mutating a slice in place, then renumber the
# index so that row labels equal row positions. Later code looks a movie up by
# its index label and uses that number as a row of the similarity matrix; the
# gaps left by dropna() would otherwise point at the wrong movie (or past the
# end of the matrix).
movies = movies.dropna().reset_index(drop=True)
17
+
18
+
19
+ # Process genres, keywords
20
def convert(obj):
    """Parse a stringified list of dicts and return their space-free names.

    Args:
        obj: String holding a Python literal list of dicts, each with a
            'name' key.

    Returns:
        A list with every entry's 'name', spaces removed, in input order.
    """
    entries = ast.literal_eval(obj)
    names = []
    for entry in entries:
        # Collapse multi-word names into a single token.
        names.append(entry['name'].replace(" ", ""))
    return names
22
+
23
+
24
# Replace the raw genre and keyword strings with lists of space-free names.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)
26
+
27
+
28
+ # Top 3 cast
29
def convert3(obj, limit=3):
    """Return the space-free names of the first ``limit`` entries.

    Args:
        obj: String holding a Python literal list of dicts, each with a
            'name' key.
        limit: How many leading entries to keep. Defaults to 3, which is the
            original behaviour; ``None`` keeps every entry. Applied as a
            slice bound (``entries[:limit]``).

    Returns:
        A list of at most ``limit`` names with spaces removed, in input order.
    """
    entries = ast.literal_eval(obj)
    return [entry['name'].replace(" ", "") for entry in entries[:limit]]
31
+
32
+
33
# Keep only the leading cast entries that convert3 selects for each movie.
leading_cast = movies['cast'].apply(convert3)
movies['cast'] = leading_cast
34
+
35
+
36
+ # Director
37
def fetch_director(obj):
    """Return the first director's space-free name as a one-element list.

    Args:
        obj: String holding a Python literal list of dicts, each with 'job'
            and 'name' keys.

    Returns:
        ``[name]`` for the first entry whose 'job' is 'Director', or ``[]``
        when no entry has that job.
    """
    crew_members = ast.literal_eval(obj)
    # Lazily scan so that only entries up to the first director are inspected.
    director = next(
        (member for member in crew_members if member['job'] == 'Director'),
        None,
    )
    if director is None:
        return []
    return [director['name'].replace(" ", "")]
42
+
43
+
44
movies['crew'] = movies['crew'].apply(fetch_director)

# Overview processing: split the free-text overview into a list of words so it
# can be concatenated with the other list-valued columns.
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# Create tags: one combined token list per movie.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Take an explicit copy: new_df gets its 'tags' column reassigned below, and
# assigning into a bare column slice of `movies` is chained assignment
# (SettingWithCopyWarning, with no guarantee of which object is modified).
new_df = movies[['movie_id', 'title', 'tags']].copy()
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x).lower())
53
+
54
# Stemming
ps = PorterStemmer()


def stem(text):
    """Stem every whitespace-separated word of ``text`` with ``ps``.

    Args:
        text: The string to process.

    Returns:
        The stemmed words joined back together with single spaces.
    """
    stemmed_words = map(ps.stem, text.split())
    return " ".join(stemmed_words)
60
+
61
+
62
new_df['tags'] = new_df['tags'].apply(stem)

# Vectorization: count-based features over the tag text, capped at 5000
# vocabulary entries, with English stop words removed.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(new_df['tags'])
vectors = tag_counts.toarray()

# Similarity: pairwise cosine similarity between every pair of movie vectors.
similarity = cosine_similarity(vectors)
70
+
71
+
72
+ # Recommendation function
73
def recommend(movie):
    """Return the titles of the five movies most similar to ``movie``.

    The title match is case-insensitive and ignores surrounding whitespace.

    Args:
        movie: Title typed by the user. ``None`` or an empty string is
            treated as "not found".

    Returns:
        A list of up to five titles ordered from most to least similar, or a
        one-element list holding the "not found" message when no title
        matches.
    """
    query = (movie or "").strip().lower()
    titles = new_df['title'].str.lower().tolist()
    try:
        # Use the ROW POSITION, not the DataFrame index label: `similarity`
        # is a plain positional matrix, and labels stop matching positions
        # as soon as rows have been dropped from the frame.
        position = titles.index(query)
    except ValueError:
        return ["Movie not found in database :( "]

    scores = similarity[position]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    # Exclude the query row explicitly instead of assuming it is ranked first;
    # another row with an identical score could otherwise push it down and
    # make the movie recommend itself.
    neighbours = [idx for idx, _ in ranked if idx != position][:5]
    return [new_df['title'].iloc[idx] for idx in neighbours]
82
+
83
+
84
+ # Gradio interface
85
def recommend_interface(movie_name):
    """Gradio callback: forward the entered title to ``recommend``.

    Args:
        movie_name: Text taken from the input textbox.

    Returns:
        Whatever ``recommend`` returns for that title.
    """
    recommendations = recommend(movie_name)
    return recommendations
87
+
88
+
89
# Build the UI pieces separately, then wire them into a single interface.
query_box = gr.Textbox(lines=1, placeholder="Enter a movie name...")
results_list = gr.List(label="Top 5 Recommendations")

demo = gr.Interface(
    fn=recommend_interface,
    inputs=query_box,
    outputs=results_list,
    title="Movie Recommender",
)

# Only start the web app when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ numpy
4
+ nltk
5
+ scikit-learn
tmdb_5000_credits.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d0050599ff88d40366c4841204b1489862bca346bfa46c20b05a65d14508435
3
+ size 40044293
tmdb_5000_movies.csv ADDED
The diff for this file is too large to render. See raw diff