# Spaces: Sleeping
| import pandas as pd | |
| import gradio as gr | |
| from datetime import datetime | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# Shared module state.
# `results_dict` maps a query label to its rendered Markdown and is rebuilt on
# every retrieval run.
results_dict = {}
# The corpus frame, its embeddings and the encoder all start unset; they are
# filled in on first use by load_resources().
df = text_embeddings = model = None
# Lazy load model and data
def load_resources():
    """Load the article corpus, the embedding model and the corpus embeddings.

    Each resource is created only while its module-level global is still
    ``None``, so calling this function repeatedly is cheap after the first
    successful call.

    Side effects:
        Rebinds the module globals ``df``, ``model`` and ``text_embeddings``.

    Raises:
        Whatever ``pd.read_csv`` raises when ``analyticsvidhyacomplete.csv``
        cannot be read, and ``KeyError`` when an expected column is missing.
    """
    global df, text_embeddings, model

    if df is None:
        # Dates are parsed in exactly one place. Asking read_csv to parse the
        # column as well could convert it first with its month-first default,
        # which would turn the day-first pass below into a no-op.
        articles = pd.read_csv("analyticsvidhyacomplete.csv")
        articles["Date"] = pd.to_datetime(
            articles["Date"], format="mixed", dayfirst=True, errors="coerce"
        )
        # fillna("") keeps missing cells from being embedded as the literal
        # text "nan".
        articles["combined_text"] = (
            articles["Title"].fillna("").astype(str) + " " +
            articles["Description"].fillna("").astype(str) + " " +
            articles["Content"].fillna("").astype(str)
        )
        # Publish the frame only once it is fully prepared, so a failure above
        # cannot leave a half-initialised global behind.
        df = articles

    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")

    if text_embeddings is None:
        text_embeddings = model.encode(
            df["combined_text"].tolist(), convert_to_tensor=False
        )
def format_markdown(top_results):
    """Render article rows as Markdown sections.

    Each row becomes a level-3 heading linking to the article, a date line and
    the description, followed by a horizontal rule.

    Args:
        top_results: DataFrame with ``Title``, ``Link``, ``Description`` and
            ``Date`` columns. ``Date`` values must support ``strftime`` or be
            missing.

    Returns:
        The concatenated Markdown for all rows, or an empty string when
        ``top_results`` has no rows.
    """
    sections = []
    for _, row in top_results.iterrows():
        if pd.notnull(row["Date"]):
            date_str = row["Date"].strftime("%Y-%m-%d")
        else:
            date_str = "N/A"
        # A missing description would otherwise be shown as the text "nan".
        desc = row["Description"] if pd.notnull(row["Description"]) else ""
        sections.append(
            f"### [{row['Title']}]({row['Link']})\n"
            f"**Date**: {date_str}\n\n"
            f"{desc}\n\n---\n"
        )
    return "".join(sections)
def process_query_csv(query_file):
    """Run every query row of an uploaded CSV against the article corpus.

    For each row a natural-language query is built from ``Topic``,
    ``Subtopic`` and ``TopN``, embedded, and compared with the corpus
    embeddings by cosine similarity. The best ``TopN`` articles (ties broken
    by newest ``Date``) are rendered to Markdown and stored in the module
    global ``results_dict`` under a label derived from the row. Rows that
    produce the same label overwrite each other.

    Args:
        query_file: Value delivered by the Gradio File input. Either an object
            exposing the file path as ``.name`` or a plain path string;
            ``None`` when nothing was uploaded.

    Returns:
        A pair ``(dropdown_update, markdown)``: a Gradio update that sets the
        dropdown's choices to the query labels and selects the first one, and
        the Markdown of that first result (or a "no results" message).
    """
    global results_dict
    results_dict = {}

    # Nothing uploaded: clear the dropdown instead of failing on `.name`.
    if query_file is None:
        return gr.update(choices=[], value=None), "### No results found."

    load_resources()  # Ensure model/data is loaded

    # Accept both a file-like wrapper with `.name` and a bare path string.
    csv_path = getattr(query_file, "name", query_file)
    query_df = pd.read_csv(csv_path)

    for _, row in query_df.iterrows():
        topic = row.get("Topic", "")
        subtopic = row.get("Subtopic", "")
        # A blank TopN cell is read as NaN, which int() cannot convert.
        raw_top_n = row.get("TopN", 5)
        top_n = 5 if pd.isna(raw_top_n) else int(raw_top_n)

        query = f"Top {top_n} articles about {subtopic} in {topic}"
        query_embedding = model.encode([query], convert_to_tensor=False)
        scores = cosine_similarity(query_embedding, text_embeddings).flatten()

        # assign() works on a copy, so the shared corpus frame is not mutated
        # on every query.
        top_results = (
            df.assign(similarity=scores)
            .sort_values(by=["similarity", "Date"], ascending=[False, False])
            .head(top_n)
        )

        label = f"{topic} - {subtopic} (Top {top_n})"
        results_dict[label] = format_markdown(top_results)

    keys = list(results_dict)
    if not keys:
        return gr.update(choices=[], value=None), "### No results found."

    # Returning a bare list to a Dropdown output sets its *value*; the choices
    # have to be sent as an explicit update.
    return gr.update(choices=keys, value=keys[0]), results_dict[keys[0]]
def display_result(selected_query):
    """Return the stored Markdown for the selected query label.

    Looks the label up in the module-level ``results_dict`` and falls back to
    a short "No results found." message when the label is not present.
    """
    if selected_query in results_dict:
        return results_dict[selected_query]
    return "No results found."
# Gradio UI: upload a query CSV, run the retrieval, then browse each query's
# results through the dropdown.
demo = gr.Blocks()
with demo:
    gr.Markdown("## π Batch Query Article Retriever with Clickable Links")

    # Components are created in display order.
    query_input = gr.File(label="Upload Query CSV (Topic, Subtopic, TopN)")
    run_btn = gr.Button("Run Retrieval")
    dropdown = gr.Dropdown(label="Select Query")
    output_md = gr.Markdown()

    # The button runs the whole batch; the dropdown switches which stored
    # result is shown.
    run_btn.click(
        fn=process_query_csv,
        inputs=query_input,
        outputs=[dropdown, output_md],
    )
    dropdown.change(
        fn=display_result,
        inputs=dropdown,
        outputs=output_md,
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()