File size: 7,799 Bytes
7905ce7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d77901d
 
 
 
7905ce7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d77901d
7905ce7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import pandas as pd
import numpy as np
import re
import json

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix

import gradio as gr

# --- 1. Data Loading and Initial Exploration ---

def load_and_explore_data():
    """Fetch the Coursera course dataset from the Hugging Face hub.

    Returns:
        pandas.DataFrame built from the dataset's 'train' split.
    """
    print("Loading dataset...")
    dataset = load_dataset("azrai99/coursera-course-dataset")
    frame = dataset['train'].to_pandas()
    print("Dataset loaded successfully.")
    return frame
# --- 2. Text Preprocessing Utilities ---

def download_nltk_data():
    """Ensure required NLTK corpora are present, downloading any missing ones."""
    probes = [
        # (probe that raises LookupError when data is absent, packages to fetch)
        (lambda: stopwords.words('english'), ['stopwords']),
        # omw-1.4 is the Open Multilingual Wordnet, needed by WordNetLemmatizer.
        (lambda: WordNetLemmatizer().lemmatize("test"), ['wordnet', 'omw-1.4']),
    ]
    for probe, packages in probes:
        try:
            probe()
        except LookupError:
            for package in packages:
                nltk.download(package)

def clean_text(text):
    """Lowercase the input and strip punctuation.

    Args:
        text: Any value; coerced to str before cleaning.

    Returns:
        The lowercased string with every character that is not a word
        character or whitespace removed.
    """
    lowered = str(text).lower()
    return re.sub(r'[^\w\s]', '', lowered)

def simple_tokenize(text):
    """Tokenize text on word boundaries with a regex (no NLTK 'punkt' needed)."""
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text)

def process_tokens(tokens, stop_words, lemmatizer):
    """Drop stopwords, then lemmatize what remains.

    Args:
        tokens: Iterable of word strings.
        stop_words: Collection supporting membership tests.
        lemmatizer: Object exposing a ``lemmatize(word)`` method
            (e.g. nltk's WordNetLemmatizer).

    Returns:
        List of lemmatized tokens with stopwords removed.
    """
    return [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    ]

# --- 3. Skill Standardization and Encoding ---

def standardize_skill(skill):
    """Normalize a skill name: lowercase, trim, keep alphanumerics only.

    Note this also removes internal spaces, so "Machine Learning"
    becomes "machinelearning".
    """
    normalized = skill.strip().lower()
    return ''.join(filter(str.isalnum, normalized))

def load_synonym_mapping(filepath="synonyms.json"):
    """Load the skill-synonym mapping from a JSON file, best-effort.

    Args:
        filepath: Path to a JSON file mapping skill variants to a
            canonical skill name.

    Returns:
        dict mapping skill -> canonical skill; empty dict when the file
        is missing or not valid JSON (a warning is printed either way,
        matching the original missing-file behavior).
    """
    try:
        with open(filepath, "r") as f:
            synonym_mapping = json.load(f)
    except FileNotFoundError:
        print(f"Warning: '{filepath}' not found. Proceeding without skill synonym mapping.")
        synonym_mapping = {}
    except json.JSONDecodeError as exc:
        # A corrupt mapping file should not take the whole app down;
        # degrade the same way as a missing file does.
        print(f"Warning: '{filepath}' is not valid JSON ({exc}). Proceeding without skill synonym mapping.")
        synonym_mapping = {}
    return synonym_mapping

def map_synonyms(skill, synonym_mapping):
    """Resolve a skill to its canonical name; unknown skills map to themselves."""
    if skill in synonym_mapping:
        return synonym_mapping[skill]
    return skill

def process_course_skills(skills_string, synonym_mapping):
    """Turn a raw comma-separated skills string into canonical skill tokens.

    Splits on commas, standardizes each piece, then applies the synonym
    mapping. A NaN input (missing Skills cell) yields an empty list.
    """
    if pd.isna(skills_string):  # Missing Skills cell -> no skills.
        return []
    canonical_skills = []
    for raw_skill in skills_string.split(','):
        standardized = standardize_skill(raw_skill.strip())
        canonical_skills.append(map_synonyms(standardized, synonym_mapping))
    return canonical_skills

def multi_hot_encode_skills(skills, all_unique_skills):
    """Multi-hot encode a list of skills against a global skill vocabulary.

    Args:
        skills: Skills present for one course.
        all_unique_skills: Ordered vocabulary defining vector positions.

    Returns:
        List of 0/1 ints, one per vocabulary entry, in vocabulary order.
    """
    # Use a set for O(1) membership tests instead of scanning the skills
    # list once per vocabulary entry (O(n*m) -> O(n+m)).
    present = set(skills)
    return [1 if skill in present else 0 for skill in all_unique_skills]

# --- 4. Feature Engineering ---

def engineer_features(df):
    """
    Performs text preprocessing, skill standardization, and combines features
    into a single matrix for similarity calculation.

    NOTE: mutates ``df`` in place, adding the columns 'text', 'tokens',
    'processed_text', 'coarse_title', 'skills_list', 'skills_encoded'.
    Assumes the frame has 'title', 'Description', and 'Skills' columns
    (the Coursera dataset schema) -- TODO confirm against the dataset.

    Args:
        df: Course DataFrame to enrich.

    Returns:
        Tuple of (df, combined_features, unique_skills, text_vectorizer):
        the mutated frame, a sparse matrix of TF-IDF text features
        horizontally stacked with multi-hot skill features (one row per
        course), the sorted skill vocabulary, and the fitted vectorizer
        (needed later to transform user queries into the same space).
    """
    print("\nStarting feature engineering...")

    # Initialize NLTK components
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    synonym_mapping = load_synonym_mapping()

    # Text processing: fill missing fields, concatenate title + description,
    # then clean -> tokenize -> stopword-filter/lemmatize -> rejoin.
    df['Description'] = df['Description'].fillna('No Description')
    df['title'] = df['title'].fillna('No Title')
    df['text'] = df['title'] + ' ' + df['Description']
    df['text'] = df['text'].apply(clean_text)
    df['tokens'] = df['text'].apply(simple_tokenize)
    df['tokens'] = df['tokens'].apply(lambda x: process_tokens(x, stop_words, lemmatizer))
    df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))

    # Create a copy of the original title for display
    df['coarse_title'] = df['title']

    # Skill processing: raw comma-separated strings -> canonical skill lists.
    df['skills_list'] = df['Skills'].apply(lambda x: process_course_skills(x, synonym_mapping))

    # Building skill vocabulary across the entire dataset; sorted so the
    # multi-hot column order is deterministic between runs.
    all_skills = []
    for skills in df['skills_list']:
        all_skills.extend(skills)
    unique_skills = sorted(list(set(all_skills)))

    df['skills_encoded'] = df['skills_list'].apply(lambda x: multi_hot_encode_skills(x, unique_skills))

    # TF-IDF Vectorization for text
    text_vectorizer = TfidfVectorizer()
    text_vectors = text_vectorizer.fit_transform(df['processed_text'])

    # Convert skills_encoded to sparse matrix
    skills_encoded_matrix = csr_matrix(np.array(df['skills_encoded'].tolist()))

    # Combine text vectors and skills vectors into one row per course;
    # query vectors must be stacked in the same (text, skills) order.
    combined_features = hstack([text_vectors, skills_encoded_matrix])
    print("Feature engineering complete.")

    return df, combined_features, unique_skills, text_vectorizer

# --- 5. Recommendation System Logic ---

def recommend_courses(query, data, combined_features, unique_skills, text_vectorizer, top_n=10):
    """
    Recommends courses based on a search query, considering both skills and text.
    Returns the specified columns of the top N recommended courses.

    The query is embedded into the same (TF-IDF text | multi-hot skills)
    space built by engineer_features, ranked by cosine similarity, and the
    top-N hits are then re-ordered by rating/reviews/enrollment for display.

    Args:
        query: Raw user search text (treated as a single skill/phrase).
        data: Course DataFrame produced by engineer_features.
        combined_features: Sparse matrix of per-course features, row-aligned
            with ``data``.
        unique_skills: Skill vocabulary used for the multi-hot encoding.
        text_vectorizer: Fitted TfidfVectorizer from engineer_features.
        top_n: Number of courses to return (default 10).

    Returns:
        DataFrame slice with display columns, sorted by rating, reviews,
        and enrollment (all descending).
    """
    # NOTE(review): reloaded from disk on every query; could be cached.
    synonym_mapping = load_synonym_mapping() # Load mapping for query processing

    # Process query the same way course skills were processed so it can
    # match entries in the skill vocabulary.
    standardized_query = standardize_skill(query)
    mapped_query = map_synonyms(standardized_query, synonym_mapping)

    # Create skill vector for the query
    query_skill_vector = multi_hot_encode_skills([mapped_query], unique_skills)
    query_skill_matrix = csr_matrix(np.array([query_skill_vector]))

    # Vectorize the query text
    query_text_vector = text_vectorizer.transform([standardized_query])

    # Combine skill and text vectors for the query (same stacking order
    # as engineer_features: text first, then skills).
    query_combined = hstack([query_text_vector, query_skill_matrix])

    # Calculate cosine similarity
    similarities = cosine_similarity(query_combined, combined_features).flatten()

    # Get top N courses by similarity (argsort ascending, take the tail,
    # reverse so the most similar course comes first).
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Select and sort top courses
    top_courses = data.iloc[top_indices][[
        'coarse_title', 'Skills', 'Level', 'rating', 'enrolled',
        'num_reviews', 'Instructor', 'Organization', 'URL'
    ]]

    # Sort by rating (descending), then number of reviews (descending), then enrolled (descending)
    # -- note this reorders only within the similarity-selected top N.
    top_courses = top_courses.sort_values(
        by=['rating', 'num_reviews', 'enrolled'], ascending=[False, False, False]
    )

    return top_courses

# --- 6. Gradio Interface ---

def predict_courses(query):
    """Gradio callback: fetch recommendations and render them as an HTML table.

    Args:
        query: Raw skill/search text typed by the user.

    Returns:
        HTML string for the results table (``escape=False`` so the URL
        column can carry markup).
    """
    results = recommend_courses(
        query,
        GLOBAL_DF,
        GLOBAL_COMBINED_FEATURES,
        GLOBAL_UNIQUE_SKILLS,
        GLOBAL_TEXT_VECTORIZER,
    )
    return results.to_html(escape=False, index=False)

# --- Main Execution Block ---

if __name__ == "__main__":
    # One-time setup: ensure NLTK corpora exist, download the dataset, and
    # build the combined text+skills feature matrix used by every query.
    print("Initializing course recommendation system...")
    download_nltk_data()
    GLOBAL_DF = load_and_explore_data()
    GLOBAL_DF, GLOBAL_COMBINED_FEATURES, GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER = engineer_features(GLOBAL_DF)

    # Wire the Gradio UI: a single textbox in, an HTML results table out.
    print("\nSystem ready. Launching Gradio interface...")
    iface = gr.Interface(
        fn=predict_courses,
        inputs=gr.Textbox(label="Enter a skill (e.g., Python, Machine Learning):"),
        outputs=gr.HTML(label="Recommended Courses"),
        title="Personalized Course Recommendation System",
        description="Enter a skill to get recommended courses based on content and skills."
    )
    iface.launch()