tiya1012 committed on
Commit
12bc78d
·
verified ·
1 Parent(s): d45977c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -0
app.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import gradio as gr
3
+ import matplotlib.pyplot as plt
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from sentence_transformers import SentenceTransformer
7
+ import faiss
8
+ import numpy as np
9
+
10
# Course catalogue backing the recommender; read once at import time.
DATASET_PATH = '/content/nlp_test_data_final.csv'
df = pd.read_csv(DATASET_PATH)
12
+
13
# Loaded SentenceTransformer models keyed by model name, so the weights are
# loaded once per process instead of on every recommendation request.
_SENTENCE_MODELS = {}


def _get_sentence_model(model_name):
    """Return the cached SentenceTransformer for *model_name*, loading it on first use."""
    if model_name not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODELS[model_name]


def recommend_courses(user_skills, user_level, df):
    """Rank courses against a user's skills and level with three similarity scores.

    Each course is described by its ``Prerequisites`` and ``course_difficulty``
    columns joined into one string; the user is described by the skills and
    level joined the same way. Three scores are computed per course:
    TF-IDF cosine similarity, Sentence-BERT cosine similarity, and a
    FAISS L2-distance score mapped to ``1 / (1 + distance)``.

    As a side effect, a bar chart of the maximum score per model is written
    to ``/content/similarity_comparison.png``.

    Args:
        user_skills: Iterable of skill strings.
        user_level: Difficulty level string; ``None`` is treated as empty.
        df: Course DataFrame with at least the columns ``Prerequisites``,
            ``course_difficulty``, ``course_title``, ``course_organization``
            and ``course_rating``. It is not modified.

    Returns:
        A DataFrame with the top 5 courses and the columns ``course_title``,
        ``course_organization``, ``course_difficulty``, ``course_rating``,
        ``similarity_tfidf``, ``similarity_bert`` and ``similarity_faiss``.
    """
    # Work on a copy so the caller's (module-level, shared) frame is not
    # mutated by every request.
    scored = df.copy()

    # Combine prerequisites and difficulty into a single text feature.
    scored['combined_features'] = (
        scored['Prerequisites'].fillna('') + ', ' + scored['course_difficulty'].fillna('')
    )
    course_texts = scored['combined_features'].tolist()

    # The level may be None when the caller has no selection.
    user_input = ', '.join(user_skills) + ', ' + (user_level or '')

    # TF-IDF cosine similarity.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(scored['combined_features'])
    user_vector = vectorizer.transform([user_input])
    similarities_tfidf = cosine_similarity(user_vector, tfidf_matrix).flatten()

    # Sentence-BERT embeddings; FAISS needs contiguous float32 arrays.
    model = _get_sentence_model('paraphrase-multilingual-MiniLM-L12-v2')
    course_embeddings = np.ascontiguousarray(
        model.encode(course_texts, convert_to_numpy=True), dtype=np.float32
    )
    user_embedding = np.ascontiguousarray(
        model.encode([user_input], convert_to_numpy=True), dtype=np.float32
    )
    similarities_bert = cosine_similarity(user_embedding, course_embeddings).flatten()

    # FAISS vector search. The search returns hits ordered nearest-first
    # together with the row index of each hit, so the scores must be
    # scattered back to row order before being attached to the frame.
    index = faiss.IndexFlatL2(course_embeddings.shape[1])
    index.add(course_embeddings)
    distances, neighbor_ids = index.search(user_embedding, len(scored))
    similarities_faiss = np.empty(len(scored), dtype=np.float32)
    similarities_faiss[neighbor_ids.ravel()] = 1.0 / (1.0 + distances.ravel())

    # Store similarity scores.
    scored['similarity_tfidf'] = similarities_tfidf
    scored['similarity_bert'] = similarities_bert
    scored['similarity_faiss'] = similarities_faiss

    # Rank by TF-IDF score first; the BERT and FAISS scores only break ties.
    recommended_courses = scored.sort_values(
        by=['similarity_tfidf', 'similarity_bert', 'similarity_faiss'],
        ascending=False,
    ).head(5)

    # Plot the best score reached by each model, then close the figure so
    # repeated calls do not accumulate open figures.
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(
        ['TF-IDF', 'Sentence-BERT', 'FAISS'],
        [max(similarities_tfidf), max(similarities_bert), max(similarities_faiss)],
    )
    ax.set_xlabel('Model')
    ax.set_ylabel('Max Similarity Score')
    ax.set_title('Comparison of Similarity Scores Across Models')
    fig.savefig('/content/similarity_comparison.png')
    plt.close(fig)

    return recommended_courses[['course_title', 'course_organization', 'course_difficulty', 'course_rating', 'similarity_tfidf', 'similarity_bert', 'similarity_faiss']]
56
+
57
def gradio_interface(user_skills, user_level):
    """Gradio callback: parse the form inputs and return the recommended courses.

    Args:
        user_skills: Comma-separated skills as typed by the user. Whitespace
            around each skill is ignored and empty entries are dropped;
            ``None`` is treated as an empty string.
        user_level: Selected difficulty level; ``None`` (nothing selected)
            is treated as an empty string.

    Returns:
        The DataFrame produced by ``recommend_courses`` for the module-level
        course dataset ``df``.
    """
    # Split on bare commas and strip, so "python,sql" and "python , sql"
    # parse the same way as "python, sql".
    user_skills_list = [
        skill.strip() for skill in (user_skills or '').split(',') if skill.strip()
    ]
    return recommend_courses(user_skills_list, user_level or '', df)
61
+
62
+ # Gradio UI
63
# Gradio UI: a free-text skills box and a difficulty dropdown feed
# gradio_interface; the result is shown as a table.
skills_box = gr.Textbox(label="Enter your skills (comma-separated)")
level_dropdown = gr.Dropdown(
    ["Beginner", "Intermediate", "Mixed"],
    label="Select Difficulty Level",
)
results_table = gr.Dataframe(
    headers=[
        "Course Title",
        "Organization",
        "Difficulty",
        "Rating",
        "TF-IDF Score",
        "BERT Score",
        "FAISS Score",
    ],
    label="Recommended Courses",
)

iface = gr.Interface(
    fn=gradio_interface,
    inputs=[skills_box, level_dropdown],
    outputs=results_table,
)

# Start the web app only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()