File size: 2,168 Bytes
91c172b
 
 
 
 
0f5de25
 
91c172b
7839fd1
a004bc9
91c172b
7839fd1
c8ce42a
76a7278
91c172b
1648d60
0f5de25
 
 
1648d60
0f5de25
 
 
 
 
 
7839fd1
91c172b
 
685c014
 
 
b4ff1f7
0f5de25
de5ad6a
0f5de25
 
 
 
 
 
 
685c014
0f5de25
b371eb9
1e8841b
0f5de25
 
 
 
1e8841b
0f5de25
685c014
b4ff1f7
685c014
 
 
 
 
 
b4ff1f7
0f5de25
 
b4ff1f7
685c014
91c172b
0f5de25
91c172b
685c014
0f5de25
91c172b
b4ff1f7
de5ad6a
3040f3c
34a3208
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
import gradio as gr
from datetime import datetime
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re


# Load the article dataset. "Date" is deliberately left unparsed here: letting
# read_csv parse it would apply pandas' default (non-day-first) inference
# before the explicit day-first conversion below gets a chance to run.
df = pd.read_csv("analyticsvidhyacomplete.csv")

# Preprocessing: parse dates day-first while tolerating mixed formats; values
# that cannot be parsed become NaT instead of raising.
df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True, errors='coerce')
# One text field per article (title + description + content) used for embedding.
df["combined_text"] = df["Title"].astype(str) + " " + df["Description"].astype(str)  + " " + df["Content"].astype(str)

# Load the predefined queries and drop rows missing any field needed to build
# a dropdown option.
query_df = pd.read_csv("query.csv")
query_df.dropna(subset=["Topic", "Subtopic", "TopN"], inplace=True)

# Dropdown options in the form "<Topic> - <Subtopic> (TopN: <n>)".
# A list comprehension is used instead of DataFrame.apply(axis=1) so that an
# empty query file yields an empty column rather than a shape error.
query_df["QueryOption"] = [
    f"{topic} - {subtopic} (TopN: {int(top_n)})"
    for topic, subtopic, top_n in zip(
        query_df["Topic"], query_df["Subtopic"], query_df["TopN"]
    )
]

query_options = query_df["QueryOption"].tolist()

# Load the sentence-embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every article once at import time so each query only has to embed
# the query text itself.
text_embeddings = model.encode(df["combined_text"].tolist(), convert_to_tensor=False)



def retrieve_records(selected_query):
    """Return the articles most similar to the selected query as Markdown.

    Args:
        selected_query: A dropdown option of the form
            ``"<Topic> - <Subtopic> (TopN: <n>)"``.

    Returns:
        A Markdown string with one section per matching article (linked
        title, date, description), ordered by descending similarity with
        the more recent date breaking ties. Returns the message
        ``"Invalid query format selected."`` when the input is missing or
        does not match the expected option format.
    """
    # Guard against a missing selection (e.g. None) so re.match does not raise.
    if not isinstance(selected_query, str):
        return "Invalid query format selected."

    match = re.match(r"(.+?) - (.+?) \(TopN: (\d+)\)", selected_query)
    if not match:
        return "Invalid query format selected."

    topic, subtopic, top_n = match.groups()
    top_n = int(top_n)
    full_query = f"{topic} {subtopic}"

    query_embedding = model.encode([full_query], convert_to_tensor=False)
    scores = cosine_similarity(query_embedding, text_embeddings).flatten()

    # Attach the scores to a per-call copy instead of writing a column into
    # the module-level frame, so overlapping requests cannot overwrite each
    # other's similarity values.
    scored = df.assign(similarity=scores)
    top_results = scored.sort_values(
        by=["similarity", "Date"], ascending=[False, False]
    ).head(top_n)

    # Format markdown output
    sections = []
    for _, row in top_results.iterrows():
        date = row['Date']
        # Dates are parsed with errors='coerce', so unparseable values are
        # NaT, and NaT.strftime raises; show a placeholder instead.
        date_text = "Unknown" if pd.isna(date) else date.strftime('%Y-%m-%d')
        sections.append(
            f"### [{row['Title']}]({row['Link']})\n"
            f"**Date**: {date_text}\n\n"
            f"{row['Description']}\n\n---\n"
        )

    return "".join(sections)




# UI components: a single dropdown of predefined queries in, rendered
# Markdown out.
query_dropdown = gr.Dropdown(choices=query_options, label="Select a query")
results_view = gr.Markdown(label="Top Similar Records")

# Wire the retrieval function to the components.
iface = gr.Interface(
    fn=retrieve_records,
    inputs=[query_dropdown],
    outputs=results_view,
    title="Top-N Article Retriever",
)


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()