Spaces:
Sleeping
Sleeping
File size: 2,168 Bytes
91c172b 0f5de25 91c172b 7839fd1 a004bc9 91c172b 7839fd1 c8ce42a 76a7278 91c172b 1648d60 0f5de25 1648d60 0f5de25 7839fd1 91c172b 685c014 b4ff1f7 0f5de25 de5ad6a 0f5de25 685c014 0f5de25 b371eb9 1e8841b 0f5de25 1e8841b 0f5de25 685c014 b4ff1f7 685c014 b4ff1f7 0f5de25 b4ff1f7 685c014 91c172b 0f5de25 91c172b 685c014 0f5de25 91c172b b4ff1f7 de5ad6a 3040f3c 34a3208 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import pandas as pd
import gradio as gr
from datetime import datetime
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
# --- Article corpus ---------------------------------------------------------
# "Date" is intentionally not handed to read_csv's parse_dates: that parse is
# not told about dayfirst, so any column it managed to convert would already
# be datetime and the explicit day-first parse below would have no effect.
df = pd.read_csv("analyticsvidhyacomplete.csv")

# Day-first parsing that tolerates several formats in the same column;
# values that cannot be parsed become NaT instead of raising.
df["Date"] = pd.to_datetime(df["Date"], format="mixed", dayfirst=True, errors="coerce")

# Text that gets embedded for similarity search. Missing cells are blanked
# first, because astype(str) alone would embed the literal word "nan".
df["combined_text"] = (
    df["Title"].fillna("").astype(str)
    + " "
    + df["Description"].fillna("").astype(str)
    + " "
    + df["Content"].fillna("").astype(str)
)

# --- Predefined queries -----------------------------------------------------
query_df = pd.read_csv("query.csv")
# A dropdown label needs all three fields, so incomplete rows are discarded.
query_df.dropna(subset=["Topic", "Subtopic", "TopN"], inplace=True)

# Dropdown labels of the form "<Topic> - <Subtopic> (TopN: <n>)".
# retrieve_records() parses this exact shape back out of the selection.
query_df["QueryOption"] = [
    f"{topic} - {subtopic} (TopN: {int(top_n)})"
    for topic, subtopic, top_n in zip(
        query_df["Topic"], query_df["Subtopic"], query_df["TopN"]
    )
]
query_options = query_df["QueryOption"].tolist()

# --- Embedding model --------------------------------------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
# Corpus embeddings are computed once at import time and reused by every query.
text_embeddings = model.encode(df["combined_text"].tolist(), convert_to_tensor=False)
# Shape of the dropdown labels: "<Topic> - <Subtopic> (TopN: <n>)".
_QUERY_OPTION_RE = re.compile(r"(.+?) - (.+?) \(TopN: (\d+)\)")


def retrieve_records(selected_query):
    """Return the top-N most similar articles for a dropdown selection.

    Args:
        selected_query: Dropdown label of the form
            "<Topic> - <Subtopic> (TopN: <n>)". A missing selection (None)
            is treated the same as a malformed label.

    Returns:
        A Markdown string with one section per article (linked title, date,
        description), ordered by descending similarity and then by
        descending date. Returns "Invalid query format selected." when the
        label cannot be parsed.
    """
    # Nothing selected arrives as a non-string; re.match would raise on it.
    if not isinstance(selected_query, str):
        return "Invalid query format selected."

    parsed = _QUERY_OPTION_RE.match(selected_query)
    if parsed is None:
        return "Invalid query format selected."

    topic, subtopic, top_n = parsed.groups()
    top_n = int(top_n)

    query_embedding = model.encode([f"{topic} {subtopic}"], convert_to_tensor=False)
    scores = cosine_similarity(query_embedding, text_embeddings).flatten()

    # Rank on a per-call frame holding only the columns needed for output.
    # Writing the scores into the shared module-level DataFrame would let
    # overlapping requests overwrite each other's similarity column.
    ranked = df[["Title", "Link", "Date", "Description"]].assign(similarity=scores)
    top_results = ranked.sort_values(
        by=["similarity", "Date"], ascending=[False, False]
    ).head(top_n)

    sections = []
    for _, row in top_results.iterrows():
        # Dates that failed to parse are NaT, and NaT does not support strftime.
        if pd.isna(row["Date"]):
            date_text = "Unknown"
        else:
            date_text = row["Date"].strftime("%Y-%m-%d")
        sections.append(
            f"### [{row['Title']}]({row['Link']})\n"
            f"**Date**: {date_text}\n\n"
            f"{row['Description']}\n\n---\n"
        )
    return "".join(sections)
# Gradio UI: one dropdown of predefined queries in, rendered Markdown out.
_query_selector = gr.Dropdown(choices=query_options, label="Select a query")
_results_view = gr.Markdown(label="Top Similar Records")

iface = gr.Interface(
    fn=retrieve_records,
    inputs=[_query_selector],
    outputs=_results_view,
    title="Top-N Article Retriever",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
|