# Source: app.py from pavan-genai's Hugging Face Space (commit d77901d, verified).
import pandas as pd
import numpy as np
import re
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix
import gradio as gr
# --- 1. Data Loading and Initial Exploration ---
def load_and_explore_data():
    """Fetch the Coursera course dataset and return its train split.

    Returns:
        pd.DataFrame: the 'train' split of azrai99/coursera-course-dataset.
    """
    print("Loading dataset...")
    dataset = load_dataset("azrai99/coursera-course-dataset")
    courses_df = dataset["train"].to_pandas()
    print("Dataset loaded successfully.")
    return courses_df
# --- 2. Text Preprocessing Utilities ---
def download_nltk_data():
    """Ensure the NLTK resources this app needs are available locally.

    Each probe call raises LookupError when its corpus is missing, in which
    case the corresponding NLTK package(s) are downloaded.
    """
    probes = [
        (lambda: stopwords.words('english'), ['stopwords']),
        # omw-1.4 (Open Multilingual Wordnet) is required by WordNetLemmatizer.
        (lambda: WordNetLemmatizer().lemmatize("test"), ['wordnet', 'omw-1.4']),
    ]
    for probe, packages in probes:
        try:
            probe()
        except LookupError:
            for package in packages:
                nltk.download(package)
def clean_text(text):
    """Lowercase *text* and drop every character that is not alphanumeric or whitespace."""
    lowered = str(text).lower()
    return re.sub(r'[^\w\s]', '', lowered)
def simple_tokenize(text):
    """Split *text* into word tokens via a word-boundary regex (avoids NLTK punkt)."""
    return [match.group(0) for match in re.finditer(r'\b\w+\b', text)]
def process_tokens(tokens, stop_words, lemmatizer):
    """Drop stopwords from *tokens*, then lemmatize each surviving token."""
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
# --- 3. Skill Standardization and Encoding ---
def standardize_skill(skill):
    """Normalize a raw skill label: lowercase, trimmed, alphanumeric characters only."""
    normalized = skill.lower().strip()
    return ''.join(filter(str.isalnum, normalized))
def load_synonym_mapping(filepath="synonyms.json"):
    """Load the skill-synonym mapping from a JSON file.

    Args:
        filepath: path to a JSON object mapping skill variants to canonical names.

    Returns:
        dict: the parsed mapping, or {} when the file is missing or is not
        valid JSON (both degrade gracefully with a warning).
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Warning: '{filepath}' not found. Proceeding without skill synonym mapping.")
    except json.JSONDecodeError as err:
        # A corrupt mapping file should degrade the same way as a missing one
        # instead of crashing app startup.
        print(f"Warning: '{filepath}' is not valid JSON ({err}). "
              "Proceeding without skill synonym mapping.")
    return {}
def map_synonyms(skill, synonym_mapping):
    """Resolve *skill* to its canonical form; unknown skills pass through unchanged."""
    if skill in synonym_mapping:
        return synonym_mapping[skill]
    return skill
def process_course_skills(skills_string, synonym_mapping):
    """Turn a comma-separated skills string into standardized, synonym-resolved skills.

    A NaN value (missing Skills cell) yields an empty list.
    """
    if pd.isna(skills_string):
        return []
    resolved = []
    for raw_skill in skills_string.split(','):
        canonical = map_synonyms(standardize_skill(raw_skill.strip()), synonym_mapping)
        resolved.append(canonical)
    return resolved
def multi_hot_encode_skills(skills, all_unique_skills):
    """Multi-hot encode *skills* against the global skill vocabulary.

    Args:
        skills: iterable of skill names attached to one course (or query).
        all_unique_skills: ordered vocabulary defining the vector layout.

    Returns:
        list[int]: one 0/1 flag per vocabulary entry, in vocabulary order.
    """
    # Membership tests against a set are O(1); the original list scan made
    # this O(len(skills)) per vocabulary entry, i.e. quadratic overall.
    skill_set = set(skills)
    return [1 if skill in skill_set else 0 for skill in all_unique_skills]
# --- 4. Feature Engineering ---
def engineer_features(df):
    """
    Performs text preprocessing, skill standardization, and combines features
    into a single matrix for similarity calculation.

    Args:
        df: course DataFrame with 'title', 'Description', and 'Skills'
            columns (assumes the azrai99/coursera-course-dataset schema --
            TODO confirm column names against the loaded dataset).

    Returns:
        tuple: (df with helper columns added in place, sparse matrix of
        TF-IDF text features hstacked with multi-hot skill features,
        sorted skill vocabulary list, fitted TfidfVectorizer).
    """
    print("\nStarting feature engineering...")
    # Initialize NLTK components (stopwords/wordnet must already be downloaded).
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    synonym_mapping = load_synonym_mapping()
    # Text processing: fill missing fields, concatenate title + description,
    # then clean -> tokenize -> de-stopword/lemmatize -> rejoin for TF-IDF.
    df['Description'] = df['Description'].fillna('No Description')
    df['title'] = df['title'].fillna('No Title')
    df['text'] = df['title'] + ' ' + df['Description']
    df['text'] = df['text'].apply(clean_text)
    df['tokens'] = df['text'].apply(simple_tokenize)
    df['tokens'] = df['tokens'].apply(lambda x: process_tokens(x, stop_words, lemmatizer))
    df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))
    # Create a copy of the original title for display.
    # NOTE(review): 'coarse_title' looks like a typo of 'course_title', but
    # recommend_courses selects this exact column name, so it must stay.
    df['coarse_title'] = df['title']
    # Skill processing: standardize and synonym-map each course's skills.
    df['skills_list'] = df['Skills'].apply(lambda x: process_course_skills(x, synonym_mapping))
    # Building skill vocabulary across all courses (sorted for a stable,
    # reproducible vector layout).
    all_skills = []
    for skills in df['skills_list']:
        all_skills.extend(skills)
    unique_skills = sorted(list(set(all_skills)))
    df['skills_encoded'] = df['skills_list'].apply(lambda x: multi_hot_encode_skills(x, unique_skills))
    # TF-IDF Vectorization for text
    text_vectorizer = TfidfVectorizer()
    text_vectors = text_vectorizer.fit_transform(df['processed_text'])
    # Convert skills_encoded to a sparse matrix so it can be hstacked with TF-IDF.
    skills_encoded_matrix = csr_matrix(np.array(df['skills_encoded'].tolist()))
    # Combine text vectors and skills vectors column-wise; row i = course i.
    combined_features = hstack([text_vectors, skills_encoded_matrix])
    print("Feature engineering complete.")
    return df, combined_features, unique_skills, text_vectorizer
# --- 5. Recommendation System Logic ---
def recommend_courses(query, data, combined_features, unique_skills, text_vectorizer, top_n=10):
    """
    Recommends courses based on a search query, considering both skills and text.
    Returns the specified columns of the top N recommended courses.

    Args:
        query: free-text skill query typed by the user.
        data: course DataFrame produced by engineer_features (must contain
            the display columns selected below, including 'coarse_title').
        combined_features: sparse matrix of TF-IDF text + multi-hot skill
            features, one row per course, from engineer_features.
        unique_skills: vocabulary list used for the multi-hot skill encoding.
        text_vectorizer: fitted TfidfVectorizer used to embed the query text.
        top_n: number of courses to return (default 10).

    Returns:
        pd.DataFrame: top_n courses re-ranked by rating, reviews, enrollment.
    """
    synonym_mapping = load_synonym_mapping()  # Load mapping for query processing
    # Process query
    standardized_query = standardize_skill(query)
    mapped_query = map_synonyms(standardized_query, synonym_mapping)
    # Create skill vector for the query
    query_skill_vector = multi_hot_encode_skills([mapped_query], unique_skills)
    query_skill_matrix = csr_matrix(np.array([query_skill_vector]))
    # Vectorize the query text
    # NOTE(review): the text vector is built from standardized_query, not
    # mapped_query, so synonym mapping only affects the skill channel --
    # presumably intentional; confirm before changing.
    query_text_vector = text_vectorizer.transform([standardized_query])
    # Combine skill and text vectors for the query (same column layout as
    # combined_features: text columns first, then skill columns).
    query_combined = hstack([query_text_vector, query_skill_matrix])
    # Calculate cosine similarity
    similarities = cosine_similarity(query_combined, combined_features).flatten()
    # Get top N courses (indices of the highest similarities, descending).
    top_indices = similarities.argsort()[-top_n:][::-1]
    # Select and sort top courses
    top_courses = data.iloc[top_indices][[
        'coarse_title', 'Skills', 'Level', 'rating', 'enrolled',
        'num_reviews', 'Instructor', 'Organization', 'URL'
    ]]
    # Sort by rating (descending), then number of reviews (descending), then enrolled (descending)
    # -- this re-ranks the similarity hits by popularity, discarding similarity order.
    top_courses = top_courses.sort_values(
        by=['rating', 'num_reviews', 'enrolled'], ascending=[False, False, False]
    )
    return top_courses
# --- 6. Gradio Interface ---
def predict_courses(query):
    """Gradio interface function to predict and display recommended courses.

    Reads the module-level GLOBAL_* state built in the __main__ block and
    renders the result DataFrame as HTML (escape=False so any markup in
    the URL column is preserved).
    """
    recommended_courses = recommend_courses(query, GLOBAL_DF, GLOBAL_COMBINED_FEATURES,
                                            GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER)
    return recommended_courses.to_html(escape=False, index=False)
# --- Main Execution Block ---
if __name__ == "__main__":
    print("Initializing course recommendation system...")
    # Fetch NLTK corpora (stopwords/wordnet) before any text processing runs.
    download_nltk_data()
    GLOBAL_DF = load_and_explore_data()
    # These globals are consumed by predict_courses via the Gradio callback.
    GLOBAL_DF, GLOBAL_COMBINED_FEATURES, GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER = engineer_features(GLOBAL_DF)
    print("\nSystem ready. Launching Gradio interface...")
    iface = gr.Interface(
        fn=predict_courses,
        inputs=gr.Textbox(label="Enter a skill (e.g., Python, Machine Learning):"),
        outputs=gr.HTML(label="Recommended Courses"),
        title="Personalized Course Recommendation System",
        description="Enter a skill to get recommended courses based on content and skills."
    )
    iface.launch()