File size: 7,799 Bytes
7905ce7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d77901d
 
 
 
7905ce7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d77901d
7905ce7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import pandas as pd
import numpy as np
import re
import json

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix

import gradio as gr

# --- 1. Data Loading and Initial Exploration ---

def load_and_explore_data():
    """Fetch the Coursera course dataset from the Hugging Face hub.

    Returns:
        pandas.DataFrame built from the dataset's 'train' split.
    """
    print("Loading dataset...")
    dataset = load_dataset("azrai99/coursera-course-dataset")
    frame = dataset['train'].to_pandas()
    print("Dataset loaded successfully.")
    return frame
# --- 2. Text Preprocessing Utilities ---

def download_nltk_data():
    """Ensure required NLTK corpora are present, downloading any missing ones."""
    probes = [
        # (probe that raises LookupError when data is absent, packages to fetch)
        (lambda: stopwords.words('english'), ['stopwords']),
        # omw-1.4 is the Open Multilingual Wordnet, needed by WordNetLemmatizer.
        (lambda: WordNetLemmatizer().lemmatize("test"), ['wordnet', 'omw-1.4']),
    ]
    for probe, packages in probes:
        try:
            probe()
        except LookupError:
            for package in packages:
                nltk.download(package)

def clean_text(text):
    """Lowercase the input and strip punctuation.

    Args:
        text: Any value; coerced to str before cleaning.

    Returns:
        The lowercased string with every character that is not a word
        character or whitespace removed.
    """
    lowered = str(text).lower()
    return re.sub(r'[^\w\s]', '', lowered)

def simple_tokenize(text):
    """Tokenize text on word boundaries with a regex (no NLTK 'punkt' needed)."""
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text)

def process_tokens(tokens, stop_words, lemmatizer):
    """Drop stopwords, then lemmatize what remains.

    Args:
        tokens: Iterable of word strings.
        stop_words: Collection supporting membership tests.
        lemmatizer: Object exposing a ``lemmatize(word)`` method
            (e.g. nltk's WordNetLemmatizer).

    Returns:
        List of lemmatized tokens with stopwords removed.
    """
    return [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    ]

# --- 3. Skill Standardization and Encoding ---

def standardize_skill(skill):
    """Normalize a skill name: lowercase, trim, keep alphanumerics only.

    Note this also removes internal spaces, so "Machine Learning"
    becomes "machinelearning".
    """
    normalized = skill.strip().lower()
    return ''.join(filter(str.isalnum, normalized))

def load_synonym_mapping(filepath="synonyms.json"):
    """Load the skill-synonym mapping from a JSON file, best-effort.

    Args:
        filepath: Path to a JSON file mapping skill variants to a
            canonical skill name.

    Returns:
        dict mapping skill -> canonical skill; empty dict when the file
        is missing or not valid JSON (a warning is printed either way,
        matching the original missing-file behavior).
    """
    try:
        with open(filepath, "r") as f:
            synonym_mapping = json.load(f)
    except FileNotFoundError:
        print(f"Warning: '{filepath}' not found. Proceeding without skill synonym mapping.")
        synonym_mapping = {}
    except json.JSONDecodeError as exc:
        # A corrupt mapping file should not take the whole app down;
        # degrade the same way as a missing file does.
        print(f"Warning: '{filepath}' is not valid JSON ({exc}). Proceeding without skill synonym mapping.")
        synonym_mapping = {}
    return synonym_mapping

def map_synonyms(skill, synonym_mapping):
    """Resolve a skill to its canonical name; unknown skills map to themselves."""
    if skill in synonym_mapping:
        return synonym_mapping[skill]
    return skill

def process_course_skills(skills_string, synonym_mapping):
    """Turn a raw comma-separated skills string into canonical skill tokens.

    Splits on commas, standardizes each piece, then applies the synonym
    mapping. A NaN input (missing Skills cell) yields an empty list.
    """
    if pd.isna(skills_string):  # Missing Skills cell -> no skills.
        return []
    canonical_skills = []
    for raw_skill in skills_string.split(','):
        standardized = standardize_skill(raw_skill.strip())
        canonical_skills.append(map_synonyms(standardized, synonym_mapping))
    return canonical_skills

def multi_hot_encode_skills(skills, all_unique_skills):
    """Multi-hot encode a list of skills against a global skill vocabulary.

    Args:
        skills: Skills present for one course.
        all_unique_skills: Ordered vocabulary defining vector positions.

    Returns:
        List of 0/1 ints, one per vocabulary entry, in vocabulary order.
    """
    # Use a set for O(1) membership tests instead of scanning the skills
    # list once per vocabulary entry (O(n*m) -> O(n+m)).
    present = set(skills)
    return [1 if skill in present else 0 for skill in all_unique_skills]

# --- 4. Feature Engineering ---

def engineer_features(df):
    """
    Performs text preprocessing, skill standardization, and combines features
    into a single matrix for similarity calculation.

    NOTE: mutates ``df`` in place, adding the columns 'text', 'tokens',
    'processed_text', 'coarse_title', 'skills_list', 'skills_encoded'.
    Assumes the frame has 'title', 'Description', and 'Skills' columns
    (the Coursera dataset schema) -- TODO confirm against the dataset.

    Args:
        df: Course DataFrame to enrich.

    Returns:
        Tuple of (df, combined_features, unique_skills, text_vectorizer):
        the mutated frame, a sparse matrix of TF-IDF text features
        horizontally stacked with multi-hot skill features (one row per
        course), the sorted skill vocabulary, and the fitted vectorizer
        (needed later to transform user queries into the same space).
    """
    print("\nStarting feature engineering...")

    # Initialize NLTK components
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    synonym_mapping = load_synonym_mapping()

    # Text processing: fill missing fields, concatenate title + description,
    # then clean -> tokenize -> stopword-filter/lemmatize -> rejoin.
    df['Description'] = df['Description'].fillna('No Description')
    df['title'] = df['title'].fillna('No Title')
    df['text'] = df['title'] + ' ' + df['Description']
    df['text'] = df['text'].apply(clean_text)
    df['tokens'] = df['text'].apply(simple_tokenize)
    df['tokens'] = df['tokens'].apply(lambda x: process_tokens(x, stop_words, lemmatizer))
    df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))

    # Create a copy of the original title for display
    df['coarse_title'] = df['title']

    # Skill processing: raw comma-separated strings -> canonical skill lists.
    df['skills_list'] = df['Skills'].apply(lambda x: process_course_skills(x, synonym_mapping))

    # Building skill vocabulary across the entire dataset; sorted so the
    # multi-hot column order is deterministic between runs.
    all_skills = []
    for skills in df['skills_list']:
        all_skills.extend(skills)
    unique_skills = sorted(list(set(all_skills)))

    df['skills_encoded'] = df['skills_list'].apply(lambda x: multi_hot_encode_skills(x, unique_skills))

    # TF-IDF Vectorization for text
    text_vectorizer = TfidfVectorizer()
    text_vectors = text_vectorizer.fit_transform(df['processed_text'])

    # Convert skills_encoded to sparse matrix
    skills_encoded_matrix = csr_matrix(np.array(df['skills_encoded'].tolist()))

    # Combine text vectors and skills vectors into one row per course;
    # query vectors must be stacked in the same (text, skills) order.
    combined_features = hstack([text_vectors, skills_encoded_matrix])
    print("Feature engineering complete.")

    return df, combined_features, unique_skills, text_vectorizer

# --- 5. Recommendation System Logic ---

def recommend_courses(query, data, combined_features, unique_skills, text_vectorizer, top_n=10):
    """
    Recommends courses based on a search query, considering both skills and text.
    Returns the specified columns of the top N recommended courses.

    The query is embedded into the same (TF-IDF text | multi-hot skills)
    space built by engineer_features, ranked by cosine similarity, and the
    top-N hits are then re-ordered by rating/reviews/enrollment for display.

    Args:
        query: Raw user search text (treated as a single skill/phrase).
        data: Course DataFrame produced by engineer_features.
        combined_features: Sparse matrix of per-course features, row-aligned
            with ``data``.
        unique_skills: Skill vocabulary used for the multi-hot encoding.
        text_vectorizer: Fitted TfidfVectorizer from engineer_features.
        top_n: Number of courses to return (default 10).

    Returns:
        DataFrame slice with display columns, sorted by rating, reviews,
        and enrollment (all descending).
    """
    # NOTE(review): reloaded from disk on every query; could be cached.
    synonym_mapping = load_synonym_mapping() # Load mapping for query processing

    # Process query the same way course skills were processed so it can
    # match entries in the skill vocabulary.
    standardized_query = standardize_skill(query)
    mapped_query = map_synonyms(standardized_query, synonym_mapping)

    # Create skill vector for the query
    query_skill_vector = multi_hot_encode_skills([mapped_query], unique_skills)
    query_skill_matrix = csr_matrix(np.array([query_skill_vector]))

    # Vectorize the query text
    query_text_vector = text_vectorizer.transform([standardized_query])

    # Combine skill and text vectors for the query (same stacking order
    # as engineer_features: text first, then skills).
    query_combined = hstack([query_text_vector, query_skill_matrix])

    # Calculate cosine similarity
    similarities = cosine_similarity(query_combined, combined_features).flatten()

    # Get top N courses by similarity (argsort ascending, take the tail,
    # reverse so the most similar course comes first).
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Select and sort top courses
    top_courses = data.iloc[top_indices][[
        'coarse_title', 'Skills', 'Level', 'rating', 'enrolled',
        'num_reviews', 'Instructor', 'Organization', 'URL'
    ]]

    # Sort by rating (descending), then number of reviews (descending), then enrolled (descending)
    # -- note this reorders only within the similarity-selected top N.
    top_courses = top_courses.sort_values(
        by=['rating', 'num_reviews', 'enrolled'], ascending=[False, False, False]
    )

    return top_courses

# --- 6. Gradio Interface ---

def predict_courses(query):
    """Gradio callback: fetch recommendations and render them as an HTML table.

    Args:
        query: Raw skill/search text typed by the user.

    Returns:
        HTML string for the results table (``escape=False`` so the URL
        column can carry markup).
    """
    results = recommend_courses(
        query,
        GLOBAL_DF,
        GLOBAL_COMBINED_FEATURES,
        GLOBAL_UNIQUE_SKILLS,
        GLOBAL_TEXT_VECTORIZER,
    )
    return results.to_html(escape=False, index=False)

# --- Main Execution Block ---

if __name__ == "__main__":
    # One-time setup: ensure NLTK corpora exist, download the dataset, and
    # build the combined text+skills feature matrix used by every query.
    print("Initializing course recommendation system...")
    download_nltk_data()
    GLOBAL_DF = load_and_explore_data()
    GLOBAL_DF, GLOBAL_COMBINED_FEATURES, GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER = engineer_features(GLOBAL_DF)

    # Wire the Gradio UI: a single textbox in, an HTML results table out.
    print("\nSystem ready. Launching Gradio interface...")
    iface = gr.Interface(
        fn=predict_courses,
        inputs=gr.Textbox(label="Enter a skill (e.g., Python, Machine Learning):"),
        outputs=gr.HTML(label="Recommended Courses"),
        title="Personalized Course Recommendation System",
        description="Enter a skill to get recommended courses based on content and skills."
    )
    iface.launch()