Spaces:

fortuala
/

CitingLLM

Build error

File size: 2,841 Bytes

# imports
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Instantiate the embedding model a single time at import so that every
# request served by the app reuses the same loaded weights.
# Swap the checkpoint name below to try a different Sentence Transformer.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

# Find the five notes in an uploaded CSV that best match the user's text.
def find_matching_notes(uploaded_file, user_input):
    """Return the five CSV rows most similar to ``user_input`` as text.

    Each row's 'Notes' and 'Section' values are joined into one string,
    embedded with the module-level ``model`` together with the query, and
    ranked by cosine similarity against the query embedding.

    Args:
        uploaded_file: The upload handed over by Gradio. Either a path string
            or an object exposing the path as ``.name``; ``None`` when nothing
            was uploaded.
        user_input: The free text to match against the notes.

    Returns:
        A string with up to five formatted entries (best match first), or a
        short message explaining why no search could be performed (no file,
        unreadable file, missing columns, no rows, or blank query).
    """
    if uploaded_file is None:
        return "Please upload a valid CSV file."

    # Depending on the Gradio version/configuration the upload arrives either
    # as a plain path string or as a temp-file wrapper with the path in `.name`.
    if isinstance(uploaded_file, str):
        csv_source = uploaded_file
    else:
        csv_source = getattr(uploaded_file, "name", uploaded_file)

    try:
        df = pd.read_csv(csv_source)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError):
        # Empty, malformed or non-text uploads are reported, not raised.
        return "Please upload a valid CSV file."

    # Ensure the necessary columns are present
    if not {'Source', 'Section', 'Notes'}.issubset(df.columns):
        return "The uploaded file must contain 'Source', 'Section', and 'Notes' columns."

    # A header-only file has nothing to rank; cosine_similarity would raise
    # on an empty matrix.
    if df.empty:
        return "The uploaded file does not contain any notes."

    if user_input is None or not str(user_input).strip():
        return "Please enter some text to match against."

    # Replace missing cells with '' and force text, so concatenation cannot
    # fail on numeric columns and blanks are not shown as "nan". Assigning the
    # result (instead of chained `inplace=True`) also works under pandas
    # copy-on-write.
    notes = df['Notes'].fillna('').astype(str)
    sections = df['Section'].fillna('').astype(str)
    sources = df['Source'].fillna('').astype(str)

    # Combine 'Notes' and 'Section' for processing; the query goes last so a
    # single encode call embeds everything.
    all_texts = (notes + ' ' + sections).tolist() + [user_input]

    # NumPy output keeps the embeddings directly usable by scikit-learn,
    # independent of the device the model runs on.
    embeddings = model.encode(all_texts, convert_to_numpy=True)

    # Row 0 of the result holds the query's similarity to every note.
    scores = cosine_similarity(embeddings[-1:], embeddings[:-1])[0]

    # Highest score first, at most five entries.
    top_indices = scores.argsort()[::-1][:5]

    # Prepare the results in a readable format
    formatted_results = [
        f"**Notes:** {note}\n"
        f"**Source:** {source}\n"
        f"**Section:** {section}\n"
        "-------------------------------------\n"
        for note, source, section in zip(
            notes.iloc[top_indices],
            sources.iloc[top_indices],
            sections.iloc[top_indices],
        )
    ]

    return "\n".join(formatted_results)

# Input widgets: the CSV upload and the free-text query, in the order the
# callback expects its arguments.
notes_upload = gr.File(label="Upload Research Notes (CSV)")
query_box = gr.Textbox(
    label="Enter your text here",
    placeholder="Type your content...",
)

# Output widget: a multi-line text area showing the formatted matches.
results_box = gr.Textbox(
    label="Top 5 Matching Entries",
    lines=15,
    placeholder="Results will be displayed here...",
)

# Wire the widgets to the matching function.
iface = gr.Interface(
    title="Research Notes Matcher",
    description="Upload a CSV file with 'Source', 'Section', and 'Notes'. Enter your text to find the top 5 matching notes. For documentation check the Readme file in the files section",
    fn=find_matching_notes,
    inputs=[notes_upload, query_box],
    outputs=results_box,
)

# Start serving the app.
iface.launch()