croeasusking committed on
Commit
7839fd1
·
verified ·
1 Parent(s): 642f26f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -24
app.py CHANGED
@@ -4,48 +4,27 @@ from datetime import datetime
4
  from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
 
7
- # Load dataset (ensure the file is in the same directory)
8
  df = pd.read_csv("analytics_vidhya_articles.csv", parse_dates=["Date"])
9
 
10
-
11
  df['Date'] = pd.to_datetime(df['Date'])
12
- # Combine Title and Description for similarity search
13
  df["combined_text"] = df["Title"].astype(str) + " " + df["Description"].astype(str)
14
 
15
- # Load sentence transformer model
16
  model = SentenceTransformer("all-MiniLM-L6-v2")
17
 
18
  # Function to retrieve top-N records
19
  def retrieve_records(query, top_n):
20
- # Filter by date
21
- # filtered_df = df[df["Date"] >= pd.to_datetime(min_date)]
22
-
23
- # if filtered_df.empty or not query.strip():
24
- # return pd.DataFrame(columns=["Title", "Description", "Date", "Link"])
25
-
26
- # Compute embeddings
27
  text_embeddings = model.encode(df["combined_text"].tolist(), convert_to_tensor=False)
28
  query_embedding = model.encode([query], convert_to_tensor=False)
29
 
30
- # Compute cosine similarity
31
- # scores = cosine_similarity([query_embedding], text_embeddings)[0]
32
- # filtered_df = filtered_df.copy()
33
- # filtered_df["similarity"] = scores
34
-
35
- # # Return top-N results
36
- # top_results = filtered_df.sort_values(by="similarity", ascending=False).head(top_n)
37
- # return top_results[["Title", "Description", "Date", "Link"]]
38
-
39
  scores = cosine_similarity(query_embedding, text_embeddings).flatten()
40
- # filtered_df = filtered_df.copy()
41
- # filtered_df["similarity"] = scores
42
  df["similarity"] = scores
43
 
44
- # Return top-N results
45
  top_results = df.sort_values(by=['similarity', 'Date'], ascending=[False, False]).head(top_n)
46
  return top_results[["Title", "Description", "Date", "Link", 'similarity']]
47
 
48
-
49
 
50
  # Gradio interface
51
  demo = gr.Interface(
 
4
  from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
 
7
+ # Load dataset
8
  df = pd.read_csv("analytics_vidhya_articles.csv", parse_dates=["Date"])
9
 
10
+ # Preprocessing
11
  df['Date'] = pd.to_datetime(df['Date'])
 
12
  df["combined_text"] = df["Title"].astype(str) + " " + df["Description"].astype(str)
13
 
14
+ # Load model
15
  model = SentenceTransformer("all-MiniLM-L6-v2")
16
 
17
  # Function to retrieve top-N records
18
  def retrieve_records(query, top_n):
 
 
 
 
 
 
 
19
  text_embeddings = model.encode(df["combined_text"].tolist(), convert_to_tensor=False)
20
  query_embedding = model.encode([query], convert_to_tensor=False)
21
 
 
 
 
 
 
 
 
 
 
22
  scores = cosine_similarity(query_embedding, text_embeddings).flatten()
 
 
23
  df["similarity"] = scores
24
 
 
25
  top_results = df.sort_values(by=['similarity', 'Date'], ascending=[False, False]).head(top_n)
26
  return top_results[["Title", "Description", "Date", "Link", 'similarity']]
27
 
 
28
 
29
  # Gradio interface
30
  demo = gr.Interface(