# imports
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Build the sentence-embedding model a single time at import; the matching function
# below reuses this shared instance. Swap MODEL_NAME for another checkpoint if preferred.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)
# Function to process the uploaded file and find top 5 matching notes
def find_matching_notes(uploaded_file, user_input):
    """Return the notes from an uploaded CSV that best match the user's text.

    Each row's 'Notes' and 'Section' cells are joined into one string, embedded
    together with ``user_input`` by the module-level ``model``, and ranked by
    cosine similarity to the query. Up to five rows are reported, best first.

    Args:
        uploaded_file: The uploaded CSV, given either as a path string or as an
            object whose ``.name`` attribute holds the path. ``None`` means
            nothing was uploaded.
        user_input: The free-text query to match against the notes.

    Returns:
        A formatted string listing the best-matching rows ('Notes', 'Source',
        'Section'), or a short human-readable message when the upload is
        missing or unreadable, required columns are absent, the file has no
        rows, or the query is blank.
    """
    if uploaded_file is None:
        return "Please upload a valid CSV file."

    # The upload may arrive as a bare path string or as a file-like wrapper
    # carrying the path in ``.name``; accept both instead of assuming the latter.
    csv_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
    try:
        df = pd.read_csv(csv_path)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError):
        # Not parseable as CSV: report it the same way as a missing upload
        # rather than letting the exception escape to the UI.
        return "Please upload a valid CSV file."

    # Ensure the necessary columns are present
    if not {'Source', 'Section', 'Notes'}.issubset(df.columns):
        return "The uploaded file must contain 'Source', 'Section', and 'Notes' columns."

    # A header-only file would hand cosine_similarity an empty matrix, which raises.
    if df.empty:
        return "The uploaded file does not contain any notes."

    # A blank query has nothing meaningful to be similar to.
    if not user_input or not user_input.strip():
        return "Please enter some text to match against the notes."

    # Blank cells become empty strings and every value is coerced to text so the
    # concatenation below cannot fail on numeric columns. Assigning new columns
    # avoids chained in-place ``fillna``, which newer pandas versions ignore.
    notes = df['Notes'].fillna('').astype(str)
    sections = df['Section'].fillna('').astype(str)
    df = df.assign(Notes=notes, Section=sections)

    # Embed all notes plus the query in a single batch; the query is the last row.
    # NumPy output keeps the embeddings directly usable by scikit-learn even when
    # the model itself runs on a GPU.
    combined_texts = (notes + ' ' + sections).tolist()
    embeddings = model.encode(combined_texts + [user_input], convert_to_numpy=True)
    note_embeddings = embeddings[:-1]
    query_embedding = embeddings[-1:]  # slice (not index) to keep the 2-D shape

    # Compute cosine similarity of the query against every note
    scores = cosine_similarity(query_embedding, note_embeddings)[0]

    # argsort is ascending: take the last five and reverse for best-first order
    top_indices = scores.argsort()[-5:][::-1]
    results = df.iloc[top_indices]

    # Prepare the results in a readable format
    formatted_results = [
        f"**Notes:** {note}\n"
        f"**Source:** {source}\n"
        f"**Section:** {section}\n"
        "-------------------------------------\n"
        for note, source, section in zip(
            results['Notes'], results['Source'], results['Section']
        )
    ]
    return "\n".join(formatted_results)
# Assemble the Gradio UI: a CSV upload and a free-text query go in,
# the formatted matches come back in a multi-line text box.
notes_upload = gr.File(label="Upload Research Notes (CSV)")
query_box = gr.Textbox(label="Enter your text here", placeholder="Type your content...")
results_box = gr.Textbox(
    label="Top 5 Matching Entries",
    lines=15,
    placeholder="Results will be displayed here...",
)

iface = gr.Interface(
    fn=find_matching_notes,
    inputs=[notes_upload, query_box],
    outputs=results_box,
    title="Research Notes Matcher",
    description="Upload a CSV file with 'Source', 'Section', and 'Notes'. Enter your text to find the top 5 matching notes. For documentation check the Readme file in the files section",
)

# Start serving the app
iface.launch()
|