Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import gradio as gr | |
| from datetime import datetime | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import re | |
# Load the article dataset.
# "Date" is deliberately NOT parsed by read_csv: its default inference is
# month-first, and once the column is already datetime the day-first
# conversion below would be a no-op, leaving ambiguous dates (e.g. 03/04/2023)
# interpreted the wrong way round.
df = pd.read_csv("analyticsvidhyacomplete.csv")

# Preprocessing
# Parse dates day-first, tolerating mixed formats; unparseable values become
# NaT instead of raising.
df["Date"] = pd.to_datetime(df["Date"], format="mixed", dayfirst=True, errors="coerce")

# Text that is embedded for each article. Missing cells are treated as empty
# strings so that NaN does not inject the literal word "nan" into the text.
df["combined_text"] = (
    df["Title"].fillna("").astype(str)
    + " "
    + df["Description"].fillna("").astype(str)
    + " "
    + df["Content"].fillna("").astype(str)
)
# Load the predefined queries; rows missing any of the required fields are
# dropped because a dropdown label cannot be built from them.
query_df = pd.read_csv("query.csv")
query_df.dropna(subset=["Topic", "Subtopic", "TopN"], inplace=True)

# Dropdown options, one label per query row in the form
# "<Topic> - <Subtopic> (TopN: <n>)".
# Built with a comprehension rather than DataFrame.apply(axis=1): on a frame
# with zero rows, apply returns a DataFrame instead of a Series and the column
# assignment below would raise at start-up.
query_options = [
    f"{topic} - {subtopic} (TopN: {int(top_n)})"
    for topic, subtopic, top_n in zip(
        query_df["Topic"], query_df["Subtopic"], query_df["TopN"]
    )
]
query_df["QueryOption"] = query_options
# Load the sentence-embedding model and encode every article's combined text
# once at start-up; retrieve_records then only has to encode the query itself.
model = SentenceTransformer("all-MiniLM-L6-v2")
text_embeddings = model.encode(
    list(df["combined_text"]),
    convert_to_tensor=False,
)
def retrieve_records(selected_query):
    """Return the top-N articles most similar to the selected query, as Markdown.

    Args:
        selected_query: A dropdown option of the form
            "<Topic> - <Subtopic> (TopN: <n>)". May be None when nothing has
            been selected in the UI.

    Returns:
        A Markdown string with one section per article (linked title, date,
        description), ordered by descending similarity and then by most
        recent date. Returns "Invalid query format selected." when the input
        is not a string in the expected format.
    """
    # The dropdown yields None until the user picks an option; re.match would
    # raise TypeError on it.
    if not isinstance(selected_query, str):
        return "Invalid query format selected."

    match = re.match(r"(.+?) - (.+?) \(TopN: (\d+)\)", selected_query)
    if not match:
        return "Invalid query format selected."

    topic, subtopic, top_n = match.groups()
    top_n = int(top_n)

    full_query = f"{topic} {subtopic}"
    query_embedding = model.encode([full_query], convert_to_tensor=False)
    scores = cosine_similarity(query_embedding, text_embeddings).flatten()

    # Rank on a per-call frame instead of writing a "similarity" column into
    # the shared module-level df, so concurrent requests cannot overwrite each
    # other's scores. Only the columns needed for the output are carried along.
    ranked = df[["Title", "Link", "Date", "Description"]].assign(similarity=scores)
    top_results = ranked.sort_values(
        by=["similarity", "Date"], ascending=[False, False]
    ).head(top_n)

    # Format markdown output
    sections = []
    for _, row in top_results.iterrows():
        date = row["Date"]
        # Dates that could not be parsed are NaT, which does not support strftime.
        date_text = date.strftime("%Y-%m-%d") if pd.notna(date) else "Unknown"
        sections.append(
            f"### [{row['Title']}]({row['Link']})\n"
            f"**Date**: {date_text}\n\n"
            f"{row['Description']}\n\n---\n"
        )
    return "".join(sections)
# Gradio UI: a single dropdown of predefined queries feeding retrieve_records,
# with the result rendered as Markdown.
_query_dropdown = gr.Dropdown(choices=query_options, label="Select a query")
_results_view = gr.Markdown(label="Top Similar Records")

iface = gr.Interface(
    fn=retrieve_records,
    inputs=[_query_dropdown],
    outputs=_results_view,
    title="Top-N Article Retriever",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()