File size: 2,841 Bytes
1c0b0c8
901da07
 
80bd751
901da07
 
b4007c4
80bd751
 
901da07
 
 
 
 
 
 
 
 
 
490594a
 
a80d717
 
 
490594a
901da07
 
 
80bd751
901da07
80bd751
901da07
 
80bd751
901da07
 
 
 
4973591
901da07
95fdd2d
4973591
 
 
 
 
 
 
 
95fdd2d
 
fb35b1f
901da07
 
 
 
 
 
 
 
 
4973591
901da07
c92f52a
901da07
 
 
a80d717
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# imports
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the Sentence Transformer model once, at import time, and keep it in a
# module-level variable so every call to find_matching_notes reuses the same
# instance instead of constructing a new one per call.
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose a different model if preferred

# Function to process the uploaded file and find top 5 matching notes
def find_matching_notes(uploaded_file, user_input):
    """Return the five notes in an uploaded CSV that best match ``user_input``.

    Each row's 'Notes' and 'Section' text is joined, embedded together with
    the query using the module-level sentence-transformer ``model``, and
    ranked by cosine similarity to the query embedding.

    Args:
        uploaded_file: The upload handed over by the Gradio File component.
            Either an object exposing the file path as ``.name`` or the path
            itself as a string; ``None`` when nothing was uploaded.
        user_input: Free text to match against the notes.

    Returns:
        A formatted string listing up to five matching rows (best match
        first), or a human-readable message when the upload or the query
        cannot be processed.
    """
    if uploaded_file is None:
        return "Please upload a valid CSV file."

    # Depending on the Gradio version / File `type` setting, the upload is
    # either an object carrying the path in `.name` or the path string itself.
    csv_path = getattr(uploaded_file, "name", uploaded_file)

    try:
        df = pd.read_csv(csv_path)
    except (
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
        UnicodeDecodeError,
        OSError,
    ) as exc:
        # Surface the problem in the output box instead of an opaque UI error.
        return f"Could not read the uploaded file as CSV: {exc}"

    # Ensure the necessary columns are present
    if not {'Source', 'Section', 'Notes'}.issubset(df.columns):
        return "The uploaded file must contain 'Source', 'Section', and 'Notes' columns."

    # A header-only file has nothing to rank; cosine_similarity would raise.
    if df.empty:
        return "The uploaded file contains no rows to match against."

    if user_input is None or not str(user_input).strip():
        return "Please enter some text to match against."

    # Assign the cleaned columns back rather than calling
    # `df[col].fillna(..., inplace=True)`: that chained in-place form does not
    # reliably update `df` under pandas Copy-on-Write. `astype(str)` keeps the
    # concatenation below from failing on numeric columns.
    for column in ('Notes', 'Section'):
        df[column] = df[column].fillna('').astype(str)

    # Combine 'Notes' and 'Section' for processing
    combined_texts = (df['Notes'] + ' ' + df['Section']).tolist()

    # Request NumPy output: scikit-learn cannot consume torch tensors that
    # live on a GPU, which is what `convert_to_tensor=True` may return.
    embeddings = model.encode(
        combined_texts + [str(user_input)], convert_to_numpy=True
    )

    # The last embedding is the query; slicing with [-1:] keeps it 2-D.
    similarities = cosine_similarity(embeddings[-1:], embeddings[:-1])[0]

    # Row positions of the five highest scores, best first.
    top_indices = similarities.argsort()[::-1][:5]

    top_rows = df.iloc[top_indices]
    formatted_results = [
        f"**Notes:** {notes}\n"
        f"**Source:** {source}\n"
        f"**Section:** {section}\n"
        "-------------------------------------\n"
        for notes, source, section in zip(
            top_rows['Notes'], top_rows['Source'], top_rows['Section']
        )
    ]

    return "\n".join(formatted_results)

# Build the Gradio UI: a CSV upload and a free-text query go in, and the
# formatted matches come back in a single multi-line text box.
notes_upload = gr.File(label="Upload Research Notes (CSV)")
query_box = gr.Textbox(label="Enter your text here", placeholder="Type your content...")
results_box = gr.Textbox(
    label="Top 5 Matching Entries",
    lines=15,
    placeholder="Results will be displayed here...",
)

iface = gr.Interface(
    fn=find_matching_notes,
    inputs=[notes_upload, query_box],
    outputs=results_box,
    title="Research Notes Matcher",
    description="Upload a CSV file with 'Source', 'Section', and 'Notes'. Enter your text to find the top 5 matching notes. For documentation check the Readme file in the files section",
)

# Start serving the app.
# NOTE(review): this runs whenever the module is executed or imported; consider
# an `if __name__ == "__main__":` guard if the module is ever imported elsewhere.
iface.launch()