rijdev committed on
Commit
ecf1596
·
verified ·
1 Parent(s): 82fe775
Files changed (1) hide show
  1. app.py +20 -31
app.py CHANGED
@@ -2,45 +2,34 @@ import gradio as gr
2
  import pandas as pd
3
  from datasets import load_dataset
4
 
5
- # 1) Load movie metadata and ratings from GroupLens URLs
6
- movies = load_dataset(
7
- "csv",
8
- data_files="https://files.grouplens.org/datasets/movielens/ml-latest-small/movies.csv",
9
- split="train",
10
- )
11
- ratings = load_dataset(
12
- "csv",
13
- data_files="https://files.grouplens.org/datasets/movielens/ml-latest-small/ratings.csv",
14
- split="train",
15
- )
16
 
17
  # 2) Convert to pandas
18
- movies_df = movies.to_pandas()
19
- ratings_df = ratings.to_pandas()
20
-
21
- # 3) Infer a release year per movie by taking the earliest rating timestamp
22
- # (assuming users start rating soon after release)
23
- ratings_df["year"] = pd.to_datetime(ratings_df["timestamp"], unit="s").dt.year
24
- first_year = ratings_df.groupby("movieId")["year"].min().reset_index()
25
-
26
- # 4) Merge metadata + inferred year, drop duplicates
27
- metadata = (
28
- movies_df.merge(first_year, on="movieId", how="inner")
29
- .rename(columns={"year": "release_year"})
30
- .drop_duplicates(subset=["movieId"])
31
  )
32
 
 
 
 
33
  def recommend_by_genre_year(genre: str, year: int, top_k: int = 5) -> str:
34
- # filter by genre substring (case-insensitive) and release_year ≥ year
35
  mask_genre = metadata["genres"].str.lower().str.contains(genre.lower())
36
- mask_year = metadata["release_year"] >= year
37
- candidates = metadata[mask_genre & mask_year]
38
 
 
39
  if candidates.empty:
40
- return f"No {genre.title()} movies found from {year} onward."
41
 
42
  picks = candidates.sample(n=min(top_k, len(candidates)))
43
- return "\n".join(f"• {row.title} ({int(row.release_year)})"
44
  for _, row in picks.iterrows())
45
 
46
  iface = gr.Interface(
@@ -53,8 +42,8 @@ iface = gr.Interface(
53
  outputs="text",
54
  title="🎬 Genre & Year-Based Movie Recommender",
55
  description="""
56
- Pulls MovieLens “ml-latest-small metadata & ratings live from GroupLens
57
- to filter by genre and release year (inferred). No local files needed.
58
  """,
59
  )
60
 
 
2
  import pandas as pd
3
  from datasets import load_dataset
4
 
5
# 1) Load the MovieLens metadata ("small" configuration, train split) from the Hub.
ds = load_dataset("bstds/movielens", "small", split="train")

# 2) Convert to pandas
df = ds.to_pandas()

# 3) Extract release year from title, e.g. "Movie Title (1999)" → 1999.
#    - The greedy ".*" prefix makes the capture bind to the LAST parenthesised
#      four-digit group, so an earlier "(1984)" inside a title is not mistaken
#      for the year (assumes the year comes last, as in the example above —
#      TODO confirm against the dataset).
#    - expand=False returns a Series rather than a one-column DataFrame.
#    - to_numeric(..., errors="coerce") turns the captured strings into numbers
#      (NaN where no year was found) before the cast to nullable Int64; casting
#      strings to Int64 directly is not reliable across pandas versions.
year_text = df["title"].str.extract(r".*\((\d{4})\)", expand=False)
df["release_year"] = pd.to_numeric(year_text, errors="coerce").astype(pd.Int64Dtype())

# 4) Deduplicate metadata (rows identical in title, genres and release year)
metadata = df[["title", "genres", "release_year"]].drop_duplicates()
20
+
21
  def recommend_by_genre_year(genre: str, year: int, top_k: int = 5) -> str:
22
+ # filter genre (case-insensitive substring)
23
  mask_genre = metadata["genres"].str.lower().str.contains(genre.lower())
24
+ # filter release year ≥ input year (drop rows missing year)
25
+ mask_year = metadata["release_year"].fillna(0) >= year
26
 
27
+ candidates = metadata[mask_genre & mask_year]
28
  if candidates.empty:
29
+ return f"No '{genre.title()}' movies found from {year} onward."
30
 
31
  picks = candidates.sample(n=min(top_k, len(candidates)))
32
+ return "\n".join(f"• {row.title} ({int(row.release_year) if pd.notna(row.release_year) else 'Year N/A'})"
33
  for _, row in picks.iterrows())
34
 
35
  iface = gr.Interface(
 
42
  outputs="text",
43
  title="🎬 Genre & Year-Based Movie Recommender",
44
  description="""
45
+ Loads MovieLens metadata (small split) from the Hub, extracts release years from titles,
46
+ and filters by genre substring & year. No local files needed.
47
  """,
48
  )
49