import pandas as pd import numpy as np import re import json import nltk from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer from datasets import load_dataset from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from scipy.sparse import hstack, csr_matrix import gradio as gr # --- 1. Data Loading and Initial Exploration --- def load_and_explore_data(): """ Loads the Coursera course dataset and performs initial data exploration. Returns the loaded DataFrame. """ print("Loading dataset...") ds = load_dataset("azrai99/coursera-course-dataset") df = ds['train'].to_pandas() print("Dataset loaded successfully.") return df # --- 2. Text Preprocessing Utilities --- def download_nltk_data(): """Downloads necessary NLTK data if not already present.""" try: stopwords.words('english') except LookupError: nltk.download('stopwords') try: WordNetLemmatizer().lemmatize("test") except LookupError: nltk.download('wordnet') nltk.download('omw-1.4') # Open Multilingual Wordnet for WordNetLemmatizer def clean_text(text): """Converts text to lowercase and removes punctuation.""" text = str(text).lower() text = re.sub(r'[^\w\s]', '', text) return text def simple_tokenize(text): """Tokenizes text using regex (splits on word boundaries, avoids NLTK punkt).""" return re.findall(r'\b\w+\b', text) def process_tokens(tokens, stop_words, lemmatizer): """Removes stopwords and performs lemmatization on a list of tokens.""" tokens = [word for word in tokens if word not in stop_words] tokens = [lemmatizer.lemmatize(word) for word in tokens] return tokens # --- 3. Skill Standardization and Encoding --- def standardize_skill(skill): """Standardizes a skill name (lowercase, strip, alphanumeric only).""" skill = skill.lower().strip() skill = ''.join(c for c in skill if c.isalnum()) return skill def load_synonym_mapping(filepath="synonyms.json"): try: with open(filepath, "r") as f: synonym_mapping = json.load(f) except FileNotFoundError: print(f"Warning: '{filepath}' not found. Proceeding without skill synonym mapping.") synonym_mapping = {} return synonym_mapping def map_synonyms(skill, synonym_mapping): """Maps a skill to its canonical form using the synonym mapping.""" return synonym_mapping.get(skill, skill) def process_course_skills(skills_string, synonym_mapping): """Processes skills string: standardization, splitting, and synonym mapping.""" if pd.isna(skills_string): # Handle NaN values in Skills column return [] skills_list = [s.strip() for s in skills_string.split(',')] standardized_skills = [standardize_skill(s) for s in skills_list] mapped_skills = [map_synonyms(s, synonym_mapping) for s in standardized_skills] return mapped_skills def multi_hot_encode_skills(skills, all_unique_skills): """Multi-hot encodes a list of skills based on a global vocabulary.""" encoding = [1 if skill in skills else 0 for skill in all_unique_skills] return encoding # --- 4. Feature Engineering --- def engineer_features(df): """ Performs text preprocessing, skill standardization, and combines features into a single matrix for similarity calculation. """ print("\nStarting feature engineering...") # Initialize NLTK components stop_words = set(stopwords.words('english')) lemmatizer = WordNetLemmatizer() synonym_mapping = load_synonym_mapping() # Text processing df['Description'] = df['Description'].fillna('No Description') df['title'] = df['title'].fillna('No Title') df['text'] = df['title'] + ' ' + df['Description'] df['text'] = df['text'].apply(clean_text) df['tokens'] = df['text'].apply(simple_tokenize) df['tokens'] = df['tokens'].apply(lambda x: process_tokens(x, stop_words, lemmatizer)) df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x)) # Create a copy of the original title for display df['coarse_title'] = df['title'] # Skill processing df['skills_list'] = df['Skills'].apply(lambda x: process_course_skills(x, synonym_mapping)) # Building skill vocabulary all_skills = [] for skills in df['skills_list']: all_skills.extend(skills) unique_skills = sorted(list(set(all_skills))) df['skills_encoded'] = df['skills_list'].apply(lambda x: multi_hot_encode_skills(x, unique_skills)) # TF-IDF Vectorization for text text_vectorizer = TfidfVectorizer() text_vectors = text_vectorizer.fit_transform(df['processed_text']) # Convert skills_encoded to sparse matrix skills_encoded_matrix = csr_matrix(np.array(df['skills_encoded'].tolist())) # Combine text vectors and skills vectors combined_features = hstack([text_vectors, skills_encoded_matrix]) print("Feature engineering complete.") return df, combined_features, unique_skills, text_vectorizer # --- 5. Recommendation System Logic --- def recommend_courses(query, data, combined_features, unique_skills, text_vectorizer, top_n=10): """ Recommends courses based on a search query, considering both skills and text. Returns the specified columns of the top N recommended courses. """ synonym_mapping = load_synonym_mapping() # Load mapping for query processing # Process query standardized_query = standardize_skill(query) mapped_query = map_synonyms(standardized_query, synonym_mapping) # Create skill vector for the query query_skill_vector = multi_hot_encode_skills([mapped_query], unique_skills) query_skill_matrix = csr_matrix(np.array([query_skill_vector])) # Vectorize the query text query_text_vector = text_vectorizer.transform([standardized_query]) # Combine skill and text vectors for the query query_combined = hstack([query_text_vector, query_skill_matrix]) # Calculate cosine similarity similarities = cosine_similarity(query_combined, combined_features).flatten() # Get top N courses top_indices = similarities.argsort()[-top_n:][::-1] # Select and sort top courses top_courses = data.iloc[top_indices][[ 'coarse_title', 'Skills', 'Level', 'rating', 'enrolled', 'num_reviews', 'Instructor', 'Organization', 'URL' ]] # Sort by rating (descending), then number of reviews (descending), then enrolled (descending) top_courses = top_courses.sort_values( by=['rating', 'num_reviews', 'enrolled'], ascending=[False, False, False] ) return top_courses # --- 6. Gradio Interface --- def predict_courses(query): """Gradio interface function to predict and display recommended courses.""" recommended_courses = recommend_courses(query, GLOBAL_DF, GLOBAL_COMBINED_FEATURES, GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER) return recommended_courses.to_html(escape=False, index=False) # --- Main Execution Block --- if __name__ == "__main__": print("Initializing course recommendation system...") download_nltk_data() GLOBAL_DF = load_and_explore_data() GLOBAL_DF, GLOBAL_COMBINED_FEATURES, GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER = engineer_features(GLOBAL_DF) print("\nSystem ready. Launching Gradio interface...") iface = gr.Interface( fn=predict_courses, inputs=gr.Textbox(label="Enter a skill (e.g., Python, Machine Learning):"), outputs=gr.HTML(label="Recommended Courses"), title="Personalized Course Recommendation System", description="Enter a skill to get recommended courses based on content and skills." ) iface.launch()