File size: 3,267 Bytes
470bedd
12bc78d
 
0c9165b
 
12bc78d
 
 
 
 
 
 
f972e7c
12bc78d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76

# NOTE(review): the former bare `pip install sklearn` / `pip install
# SentenceTransformer` lines were shell commands, not Python, and made this
# module a SyntaxError at import time. Install dependencies from a shell:
#   pip install scikit-learn sentence-transformers faiss-cpu gradio matplotlib pandas
import matplotlib.pyplot as plt  # used by recommend_courses() but was never imported
import numpy as np
import pandas as pd

import faiss
import gradio as gr
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset: the course catalog scored by recommend_courses().
# Columns used below: Prerequisites, course_difficulty, course_title,
# course_organization, course_rating.
df = pd.read_csv('nlp_test_data_final.csv')

def recommend_courses(user_skills, user_level, df):
    """Rank courses against a user's skills and preferred difficulty.

    Scores every course three ways -- TF-IDF cosine similarity,
    Sentence-BERT cosine similarity, and a FAISS L2 search converted to a
    similarity score -- and returns the five best-matching courses.

    Parameters
    ----------
    user_skills : list[str]
        Skills the user already has, e.g. ``["python", "statistics"]``.
    user_level : str
        Desired difficulty label (e.g. "Beginner", "Intermediate", "Mixed").
    df : pandas.DataFrame
        Course catalog with at least the columns: Prerequisites,
        course_difficulty, course_title, course_organization, course_rating.

    Returns
    -------
    pandas.DataFrame
        Top-5 rows with course metadata and the three similarity scores.
        Side effect: saves a bar chart of the max score per model to
        ``/content/similarity_comparison.png``.
    """
    # Work on a copy so the caller's (module-level) DataFrame is not
    # mutated with scratch/score columns on every request.
    df = df.copy()

    # Combine prerequisites and difficulty into a single text feature.
    df['combined_features'] = df['Prerequisites'].fillna('') + ', ' + df['course_difficulty'].fillna('')

    # --- TF-IDF similarity ---------------------------------------------
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['combined_features'])
    # Express the user profile in the same vocabulary as the courses.
    user_input = ', '.join(user_skills) + ', ' + user_level
    user_vector = vectorizer.transform([user_input])
    similarities_tfidf = cosine_similarity(user_vector, tfidf_matrix).flatten()

    # --- Sentence-BERT similarity --------------------------------------
    # NOTE(review): the model is re-downloaded/loaded on every call; hoist
    # it to module level (or cache it) if request latency matters.
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    course_embeddings = model.encode(df['combined_features'].tolist(), convert_to_numpy=True)
    user_embedding = model.encode([user_input], convert_to_numpy=True)
    similarities_bert = cosine_similarity(user_embedding, course_embeddings).flatten()

    # --- FAISS similarity ----------------------------------------------
    # BUGFIX: faiss.Index.search returns (distances, indices), nearest
    # first. The original code unpacked them backwards (using the *index*
    # array as distances) and then assigned the nearest-first array straight
    # into a row-ordered DataFrame column, misaligning scores with courses.
    # Here we unpack correctly and scatter the scores back into row order.
    index = faiss.IndexFlatL2(course_embeddings.shape[1])
    index.add(course_embeddings)
    distances, indices = index.search(user_embedding, len(df))
    similarities_faiss = np.empty(len(df), dtype=np.float64)
    # Convert L2 distances to a (0, 1] similarity score, aligned per row.
    similarities_faiss[indices.flatten()] = 1.0 / (1.0 + distances.flatten())

    # Store the three per-row similarity scores.
    df['similarity_tfidf'] = similarities_tfidf
    df['similarity_bert'] = similarities_bert
    df['similarity_faiss'] = similarities_faiss

    # Sort with TF-IDF as the primary key and the others as tie-breakers
    # (same ordering semantics as the original sort_values call).
    recommended_courses = df.sort_values(
        by=['similarity_tfidf', 'similarity_bert', 'similarity_faiss'],
        ascending=False,
    ).head(5)

    # Plot the best score per model for a quick visual comparison.
    plt.figure(figsize=(8, 5))
    plt.bar(
        ['TF-IDF', 'Sentence-BERT', 'FAISS'],
        [max(similarities_tfidf), max(similarities_bert), max(similarities_faiss)],
    )
    plt.xlabel('Model')
    plt.ylabel('Max Similarity Score')
    plt.title('Comparison of Similarity Scores Across Models')
    plt.savefig('/content/similarity_comparison.png')
    plt.close()  # release the figure so repeated calls don't leak memory

    return recommended_courses[['course_title', 'course_organization', 'course_difficulty', 'course_rating', 'similarity_tfidf', 'similarity_bert', 'similarity_faiss']]

def gradio_interface(user_skills, user_level):
    """Gradio callback: split the comma-separated skills string and delegate
    to recommend_courses() against the module-level course catalog."""
    skills = user_skills.split(', ')
    return recommend_courses(skills, user_level, df)

# Gradio UI: two inputs (free-text skills, difficulty dropdown) mapped to a
# dataframe of the top-5 recommendations produced by gradio_interface().
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Enter your skills (comma-separated)"), 
        gr.Dropdown(["Beginner", "Intermediate", "Mixed"], label="Select Difficulty Level")
    ],
    # Header order must match the column order returned by recommend_courses().
    outputs=gr.Dataframe(headers=["Course Title", "Organization", "Difficulty", "Rating", "TF-IDF Score", "BERT Score", "FAISS Score"], label="Recommended Courses")
)

# Launch the web server only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()