"""Batch article retriever: rank articles against queries from an uploaded CSV.

The article corpus is read from ``analyticsvidhyacomplete.csv`` and embedded
with a SentenceTransformer model. Each row of the uploaded query CSV
(columns ``Topic``, ``Subtopic``, ``TopN``) is turned into a natural-language
query, and the most similar articles are rendered as markdown in a Gradio UI.
"""

from datetime import datetime

import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Global state, populated lazily by load_resources().
df = None
text_embeddings = None
model = None
# Maps a query label to its rendered markdown; rebuilt on every batch run.
results_dict = {}

# Number of articles returned when a query row has no usable TopN value.
_DEFAULT_TOP_N = 5
# Message shown in the markdown pane when a batch run yields nothing.
_NO_RESULTS_MD = "### No results found."
# Only these article columns are needed to rank and render results.
_RESULT_COLUMNS = ["Title", "Link", "Description", "Date"]


def load_resources():
    """Load the article corpus, the embedding model and the corpus embeddings.

    Each resource is created only once; later calls are no-ops for whatever
    is already loaded. Results are stored in the module-level ``df``,
    ``model`` and ``text_embeddings``.
    """
    global df, text_embeddings, model

    if df is None:
        # Dates are parsed only here (not via read_csv's parse_dates) so the
        # day-first rule below is what actually decides ambiguous dates.
        articles = pd.read_csv("analyticsvidhyacomplete.csv")
        articles["Date"] = pd.to_datetime(
            articles["Date"], format="mixed", dayfirst=True, errors="coerce"
        )
        articles["combined_text"] = (
            articles["Title"].astype(str)
            + " "
            + articles["Description"].astype(str)
            + " "
            + articles["Content"].astype(str)
        )
        # Publish the frame only once it is fully prepared, so a failure
        # above cannot leave a half-initialised global behind.
        df = articles

    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")

    if text_embeddings is None:
        text_embeddings = model.encode(
            df["combined_text"].tolist(), convert_to_tensor=False
        )


def format_markdown(top_results):
    """Render result rows as markdown sections with clickable titles.

    Args:
        top_results: DataFrame with ``Title``, ``Link``, ``Description`` and
            ``Date`` columns.

    Returns:
        One markdown string; each article is a linked heading, its date
        (``N/A`` when missing) and its description, followed by a rule.
    """
    sections = []
    for _, row in top_results.iterrows():
        if pd.notnull(row["Date"]):
            date_str = row["Date"].strftime("%Y-%m-%d")
        else:
            date_str = "N/A"
        sections.append(
            f"### [{row['Title']}]({row['Link']})\n"
            f"**Date**: {date_str}\n\n"
            f"{row['Description']}\n\n---\n"
        )
    return "".join(sections)


def _clean_text(value):
    """Return a CSV cell as stripped text, mapping missing cells to ''."""
    if pd.isna(value):
        return ""
    return str(value).strip()


def _coerce_top_n(value, default=_DEFAULT_TOP_N):
    """Return a CSV cell as a positive int, or ``default`` if it is unusable."""
    if pd.isna(value):
        return default
    try:
        # Going through float accepts values such as 5.0 or "5.0".
        top_n = int(float(value))
    except (TypeError, ValueError, OverflowError):
        return default
    # A zero or negative count would make head() return nothing or the wrong rows.
    return top_n if top_n > 0 else default


def process_query_csv(query_file):
    """Run every query in the uploaded CSV and cache the rendered results.

    Args:
        query_file: Value delivered by the ``gr.File`` input: an object with
            a ``name`` attribute holding the path, a plain path string, or
            ``None`` when nothing was uploaded.

    Returns:
        A 2-tuple for the ``[dropdown, output_md]`` outputs: a dropdown
        update carrying the query labels as choices (first one selected) and
        the markdown for the first query.
    """
    global results_dict
    results_dict = {}

    if query_file is None:
        return gr.update(choices=[], value=None), _NO_RESULTS_MD

    load_resources()  # Ensure model/data is loaded

    # Accept both file-like objects exposing .name and plain path strings.
    query_path = getattr(query_file, "name", query_file)
    query_df = pd.read_csv(query_path)

    for _, row in query_df.iterrows():
        topic = _clean_text(row.get("Topic", ""))
        subtopic = _clean_text(row.get("Subtopic", ""))
        if not topic and not subtopic:
            # Nothing to search for on this row.
            continue
        top_n = _coerce_top_n(row.get("TopN", _DEFAULT_TOP_N))

        query = f"Top {top_n} articles about {subtopic} in {topic}"
        query_embedding = model.encode([query], convert_to_tensor=False)
        scores = cosine_similarity(query_embedding, text_embeddings).flatten()

        # Rank on a per-query copy instead of writing scores into the shared
        # global frame; ties on similarity are broken by the newest date.
        ranked = df[_RESULT_COLUMNS].assign(similarity=scores)
        top_results = ranked.sort_values(
            by=["similarity", "Date"], ascending=[False, False]
        ).head(top_n)

        label = f"{topic} - {subtopic} (Top {top_n})"
        results_dict[label] = format_markdown(top_results)

    keys = list(results_dict)
    if not keys:
        return gr.update(choices=[], value=None), _NO_RESULTS_MD

    # The dropdown needs its *choices* replaced; returning a bare list would
    # only try to set its value and leave the choice list empty.
    return gr.update(choices=keys, value=keys[0]), results_dict[keys[0]]


def display_result(selected_query):
    """Return the cached markdown for the selected query label.

    Args:
        selected_query: Label currently selected in the dropdown.

    Returns:
        The stored markdown, or ``"No results found."`` for unknown labels.
    """
    return results_dict.get(selected_query, "No results found.")


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Batch Query Article Retriever with Clickable Links")

    query_input = gr.File(label="Upload Query CSV (Topic, Subtopic, TopN)")
    run_btn = gr.Button("Run Retrieval")
    dropdown = gr.Dropdown(label="Select Query")
    output_md = gr.Markdown()

    run_btn.click(
        fn=process_query_csv,
        inputs=query_input,
        outputs=[dropdown, output_md],
    )
    dropdown.change(fn=display_result, inputs=dropdown, outputs=output_md)


if __name__ == "__main__":
    demo.launch()