croeasusking committed on
Commit
0f5de25
·
verified ·
1 Parent(s): 8a8348b

Update original.py

Browse files
Files changed (1) hide show
  1. original.py +51 -30
original.py CHANGED
@@ -3,6 +3,8 @@ import gradio as gr
3
  from datetime import datetime
4
  from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
 
 
6
 
7
  # Load dataset
8
  df = pd.read_csv("analyticsvidhyacomplete.csv", parse_dates=["Date"])
@@ -11,46 +13,51 @@ df = pd.read_csv("analyticsvidhyacomplete.csv", parse_dates=["Date"])
11
  df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True, errors='coerce')
12
  df["combined_text"] = df["Title"].astype(str) + " " + df["Description"].astype(str) + " " + df["Content"].astype(str)
13
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Load model
15
  model = SentenceTransformer("all-MiniLM-L6-v2")
16
 
17
  text_embeddings = model.encode(df["combined_text"].tolist(), convert_to_tensor=False)
18
 
19
 
20
- # Function to retrieve top-N records
21
- # def retrieve_records(query, top_n):
22
- # text_embeddings = model.encode(df["combined_text"].tolist(), convert_to_tensor=False)
23
- # query_embedding = model.encode([query], convert_to_tensor=False)
24
-
25
- # scores = cosine_similarity(query_embedding, text_embeddings).flatten()
26
- # df["similarity"] = scores
27
-
28
- # top_results = df.sort_values(by=['similarity', 'Date'], ascending=[False, False]).head(top_n)
29
- # return top_results[["Title", "Description", "Date", "Link", 'similarity']]
30
-
31
- # Gradio interface
32
- # iface = gr.Interface(
33
- # fn=retrieve_records,
34
- # inputs=[
35
- # gr.Textbox(label="Enter your query"),
36
- # # gr.Textbox(label="Minimum date (YYYY-MM-DD)", value=str(datetime.today().date())),
37
- # gr.Slider(5, 20,step=1, label="Top N results")
38
- # ],
39
- # outputs=gr.Dataframe(label="Top Similar Records"),
40
- # title="Top-N Article Retriever",
41
- # description="Search articles using Title and Description similarity, filtered by a minimum date."
42
- # )
43
 
44
 
 
 
 
 
 
 
 
45
 
46
- def retrieve_records(query, top_n):
47
- query_embedding = model.encode([query], convert_to_tensor=False)
 
 
 
 
 
 
48
 
 
49
  scores = cosine_similarity(query_embedding, text_embeddings).flatten()
50
  df["similarity"] = scores
 
 
 
 
51
 
52
- top_results = df.sort_values(by=['similarity', 'Date'], ascending=[False, False]).head(top_n)
53
-
54
  markdown_output = ""
55
  for _, row in top_results.iterrows():
56
  markdown_output += f"### [{row['Title']}]({row['Link']})\n"
@@ -60,17 +67,31 @@ def retrieve_records(query, top_n):
60
  return markdown_output
61
 
62
 
 
 
 
 
63
  iface = gr.Interface(
64
  fn=retrieve_records,
65
  inputs=[
66
- gr.Textbox(label="Enter your query"),
67
- gr.Slider(5, 15, step=5, label="Top N results")
68
  ],
69
  outputs=gr.Markdown(label="Top Similar Records"),
70
- title="Top-N Article Retriever with Clickable Links"
71
  )
72
 
73
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  if __name__ == "__main__":
76
  iface.launch()
 
3
  from datetime import datetime
4
  from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
+ import re
7
+
8
 
9
  # Load dataset
10
  df = pd.read_csv("analyticsvidhyacomplete.csv", parse_dates=["Date"])
 
13
  df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True, errors='coerce')
14
  df["combined_text"] = df["Title"].astype(str) + " " + df["Description"].astype(str) + " " + df["Content"].astype(str)
15
 
16
+ # Load query CSV with columns: Topic, Subtopic, TopN
17
+ query_df = pd.read_csv("query.csv")
18
+ query_df.dropna(subset=["Topic", "Subtopic", "TopN"], inplace=True)
19
+
20
+ # Build dropdown options
21
+ query_df["QueryOption"] = query_df.apply(
22
+ lambda row: f"{row['Topic']} - {row['Subtopic']} (TopN: {int(row['TopN'])})", axis=1
23
+ )
24
+
25
+ query_options = query_df["QueryOption"].tolist()
26
+
27
  # Load model
28
  model = SentenceTransformer("all-MiniLM-L6-v2")
29
 
30
  text_embeddings = model.encode(df["combined_text"].tolist(), convert_to_tensor=False)
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
+ def retrieve_records(selected_query):
36
+ # query_embedding = model.encode([query], convert_to_tensor=False)
37
+
38
+ # scores = cosine_similarity(query_embedding, text_embeddings).flatten()
39
+ # df["similarity"] = scores
40
+
41
+ # top_results = df.sort_values(by=['similarity', 'Date'], ascending=[False, False]).head(top_n)
42
 
43
+ # Extract Topic, Subtopic, and TopN from dropdown text
44
+ match = re.match(r"(.+?) - (.+?) \(TopN: (\d+)\)", selected_query)
45
+ if not match:
46
+ return "Invalid query format selected."
47
+
48
+ topic, subtopic, top_n = match.groups()
49
+ top_n = int(top_n)
50
+ full_query = f"{topic} {subtopic}"
51
 
52
+ query_embedding = model.encode([full_query], convert_to_tensor=False)
53
  scores = cosine_similarity(query_embedding, text_embeddings).flatten()
54
  df["similarity"] = scores
55
+
56
+ top_results = df.sort_values(by=["similarity", "Date"], ascending=[False, False]).head(top_n)
57
+
58
+
59
 
60
+ # Format markdown output
 
61
  markdown_output = ""
62
  for _, row in top_results.iterrows():
63
  markdown_output += f"### [{row['Title']}]({row['Link']})\n"
 
67
  return markdown_output
68
 
69
 
70
+
71
+
72
+
73
+
74
  iface = gr.Interface(
75
  fn=retrieve_records,
76
  inputs=[
77
+ gr.Dropdown(choices=query_options, label="Select a query"),
 
78
  ],
79
  outputs=gr.Markdown(label="Top Similar Records"),
80
+ title="Top-N Article Retriever"
81
  )
82
 
83
 
84
+ # iface = gr.Interface(
85
+ # fn=retrieve_records,
86
+ # inputs=[
87
+ # gr.Textbox(label="Enter your query"),
88
+ # gr.Slider(5, 15, step=5, label="Top N results")
89
+ # ],
90
+ # outputs=gr.Markdown(label="Top Similar Records"),
91
+ # title="Top-N Article Retriever with Clickable Links"
92
+ # )
93
+
94
+
95
 
96
  if __name__ == "__main__":
97
  iface.launch()