File size: 1,748 Bytes
62f5f82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model; it encodes both the movie overviews
# below and the user queries at request time.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Load the movie dataset from its hosted CSV link.
url = "https://huggingface.co/datasets/Pablinho/movies-dataset/resolve/main/9000plus.csv"
print("Loading dataset...")
dataset = pd.read_csv(url)

# Verify that the required columns exist. This raises explicitly instead of
# using `assert`, because assertions are removed when Python runs with -O.
_missing_columns = [
    column for column in ("Title", "Overview") if column not in dataset.columns
]
if _missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {_missing_columns}")

# Drop rows whose Overview is missing or is not a string.
dataset = dataset.dropna(subset=["Overview"])
dataset = dataset[dataset["Overview"].apply(lambda x: isinstance(x, str))]

# Limit to the first MAX_MOVIES rows (presumably to keep start-up encoding
# time short). `.copy()` detaches the result from the filtered frame so the
# column assignment below writes to an independent DataFrame.
MAX_MOVIES = 500
dataset = dataset.head(MAX_MOVIES).copy()

print(f"Encoding {len(dataset)} movie descriptions...")
# Encode every overview in a single batched call rather than one call per row.
# Each row still stores its embedding as a plain list of floats.
_overview_embeddings = np.asarray(model.encode(dataset["Overview"].tolist()))
dataset["embeddings"] = pd.Series(
    _overview_embeddings.tolist(), index=dataset.index, dtype=object
)
print("Done encoding!")

def recommend_similar_movies(input_text, top_n=5):
    """Return the movies whose overviews best match a free-text description.

    The query is embedded with the module-level ``model`` and compared, by
    cosine similarity, against the embeddings stored in
    ``dataset["embeddings"]``.

    Args:
        input_text: Description of the kind of movie the user wants.
        top_n: Maximum number of recommendations to return.

    Returns:
        A string with one entry per movie (a title line followed by the
        overview), entries separated by a blank line, most similar first.
        A short prompt is returned when ``input_text`` is blank, and an
        empty string when ``top_n`` is not positive or there are no movies
        to compare against.
    """
    if not input_text or not input_text.strip():
        return "Please describe a movie to get recommendations."

    top_n = int(top_n)
    # A negative top_n would make the slice below drop rows from the end
    # instead of limiting the result; an empty catalogue would make
    # np.vstack raise.
    if top_n <= 0 or dataset.empty:
        return ""

    query_embedding = model.encode([input_text])
    movie_embeddings = np.vstack(dataset["embeddings"].to_numpy())
    similarities = cosine_similarity(query_embedding, movie_embeddings)[0]

    # argsort is ascending, so reverse it to rank the most similar first.
    top_indices = similarities.argsort()[::-1][:top_n]
    results = dataset.iloc[top_indices]

    # The clapper-board emoji is written as an escape sequence so the literal
    # does not depend on the encoding the source file is read with.
    return "\n\n".join(
        f"\U0001F3AC **{title}**\n{overview}"
        for title, overview in zip(results["Title"], results["Overview"])
    )

# Two-line text box in which the user describes the movie they want.
_query_box = gr.Textbox(placeholder="Describe a movie...", lines=2)

# Gradio UI: one text input, one plain-text output, backed by the recommender.
demo = gr.Interface(
    title="Movie Recommender",
    description="Get movie recommendations based on your description. Powered by sentence-transformers and cosine similarity.",
    fn=recommend_similar_movies,
    inputs=_query_box,
    outputs="text",
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()