fortuala committed on
Commit
80bd751
·
verified ·
1 Parent(s): c92f52a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -5
app.py CHANGED
@@ -1,8 +1,11 @@
1
  import gradio as gr
2
  import pandas as pd
3
- from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.metrics.pairwise import cosine_similarity
5
 
 
 
 
6
  # Function to process the uploaded file and find top 5 matching notes
7
  def find_matching_notes(uploaded_file, user_input):
8
  # Read the uploaded CSV file
@@ -22,13 +25,12 @@ def find_matching_notes(uploaded_file, user_input):
22
  # Combine 'Notes' and 'Section' for processing
23
  df['Combined'] = df['Notes'] + ' ' + df['Section']
24
 
25
- # Create TF-IDF vectorizer and transform the texts
26
- vectorizer = TfidfVectorizer()
27
  all_texts = df['Combined'].tolist() + [user_input]
28
- tfidf_matrix = vectorizer.fit_transform(all_texts)
29
 
30
  # Compute cosine similarity
31
- cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
32
 
33
  # Get the top 5 indices of the most similar entries
34
  top_indices = cosine_similarities[0].argsort()[-5:][::-1]
 
1
  import gradio as gr
2
  import pandas as pd
3
+ from sentence_transformers import SentenceTransformer
4
  from sklearn.metrics.pairwise import cosine_similarity
5
 
6
+ # Load the Sentence Transformer model
7
+ model = SentenceTransformer('all-MiniLM-L6-v2') # You can choose a different model if preferred
8
+
9
  # Function to process the uploaded file and find top 5 matching notes
10
  def find_matching_notes(uploaded_file, user_input):
11
  # Read the uploaded CSV file
 
25
  # Combine 'Notes' and 'Section' for processing
26
  df['Combined'] = df['Notes'] + ' ' + df['Section']
27
 
28
+ # Encode the combined text using the Sentence Transformer
 
29
  all_texts = df['Combined'].tolist() + [user_input]
30
+ embeddings = model.encode(all_texts, convert_to_tensor=True)
31
 
32
  # Compute cosine similarity
33
+ cosine_similarities = cosine_similarity(embeddings[-1].unsqueeze(0), embeddings[:-1])
34
 
35
  # Get the top 5 indices of the most similar entries
36
  top_indices = cosine_similarities[0].argsort()[-5:][::-1]