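"""Personalized course recommendation Gradio app.

Loads the Coursera course dataset from the Hugging Face Hub, builds a
combined TF-IDF text + multi-hot skill feature matrix, and serves top-N
course recommendations for a skill query through a Gradio interface.
"""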
import pandas as pd
import numpy as np
import re
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix
import gradio as gr

# --- 1. Data Loading ---

def load_and_explore_data():
    """
    Loads the Coursera course dataset from the Hugging Face Hub and
    returns its 'train' split as a pandas DataFrame.
    """
    print("Loading dataset...")
    ds = load_dataset("azrai99/coursera-course-dataset")
    df = ds['train'].to_pandas()
    print("Dataset loaded successfully.")
    return df

# --- 2. Text Preprocessing Utilities ---

def download_nltk_data():
    """Downloads necessary NLTK data if not already present."""
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
    try:
        WordNetLemmatizer().lemmatize("test")
    except LookupError:
        nltk.download('wordnet')
        nltk.download('omw-1.4')  # Open Multilingual Wordnet, used by WordNetLemmatizer

def clean_text(text):
    """Converts text to lowercase and removes punctuation."""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

def simple_tokenize(text):
    """Tokenizes text with a regex on word boundaries (avoids NLTK's punkt dependency)."""
    return re.findall(r'\b\w+\b', text)
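
# Example: simple_tokenize("data science 101") -> ['data', 'science', '101']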

def process_tokens(tokens, stop_words, lemmatizer):
    """Removes stopwords and lemmatizes a list of tokens."""
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

# --- 3. Skill Standardization and Encoding ---

def standardize_skill(skill):
    """Standardizes a skill name (lowercase, strip, alphanumeric only)."""
    skill = skill.lower().strip()
    skill = ''.join(c for c in skill if c.isalnum())
    return skill
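
# Note: standardize_skill() collapses multi-word skills into one token,
# e.g. "Machine Learning" -> "machinelearning".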

def load_synonym_mapping(filepath="synonyms.json"):
    """Loads the skill synonym mapping from a JSON file, or an empty dict if missing."""
    try:
        with open(filepath, "r") as f:
            synonym_mapping = json.load(f)
    except FileNotFoundError:
        print(f"Warning: '{filepath}' not found. Proceeding without skill synonym mapping.")
        synonym_mapping = {}
    return synonym_mapping
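
# The synonym file is expected to map standardized skill names to a single
# canonical form. Illustrative format only (the real file ships alongside
# this script):
#   {"ml": "machinelearning", "sklearn": "scikitlearn"}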

def map_synonyms(skill, synonym_mapping):
    """Maps a skill to its canonical form using the synonym mapping."""
    return synonym_mapping.get(skill, skill)

def process_course_skills(skills_string, synonym_mapping):
    """Processes a skills string: splitting, standardization, and synonym mapping."""
    if pd.isna(skills_string):  # Handle NaN values in the Skills column
        return []
    skills_list = [s.strip() for s in skills_string.split(',')]
    standardized_skills = [standardize_skill(s) for s in skills_list]
    mapped_skills = [map_synonyms(s, synonym_mapping) for s in standardized_skills]
    return mapped_skills

def multi_hot_encode_skills(skills, all_unique_skills):
    """Multi-hot encodes a list of skills against the global skill vocabulary."""
    encoding = [1 if skill in skills else 0 for skill in all_unique_skills]
    return encoding
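
# Example: multi_hot_encode_skills(['python'], ['java', 'python', 'sql'])
# -> [0, 1, 0]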

# --- 4. Feature Engineering ---

def engineer_features(df):
    """
    Performs text preprocessing, skill standardization, and combines features
    into a single matrix for similarity calculation.
    """
    print("\nStarting feature engineering...")

    # Initialize NLTK components
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    synonym_mapping = load_synonym_mapping()

    # Text processing
    df['Description'] = df['Description'].fillna('No Description')
    df['title'] = df['title'].fillna('No Title')
    df['text'] = df['title'] + ' ' + df['Description']
    df['text'] = df['text'].apply(clean_text)
    df['tokens'] = df['text'].apply(simple_tokenize)
    df['tokens'] = df['tokens'].apply(lambda x: process_tokens(x, stop_words, lemmatizer))
    df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))

    # Keep a copy of the original title for display
    df['course_title'] = df['title']

    # Skill processing
    df['skills_list'] = df['Skills'].apply(lambda x: process_course_skills(x, synonym_mapping))

    # Build the skill vocabulary
    all_skills = []
    for skills in df['skills_list']:
        all_skills.extend(skills)
    unique_skills = sorted(set(all_skills))
    df['skills_encoded'] = df['skills_list'].apply(lambda x: multi_hot_encode_skills(x, unique_skills))

    # TF-IDF vectorization for text
    text_vectorizer = TfidfVectorizer()
    text_vectors = text_vectorizer.fit_transform(df['processed_text'])

    # Convert skills_encoded to a sparse matrix
    skills_encoded_matrix = csr_matrix(np.array(df['skills_encoded'].tolist()))

    # Combine text and skill vectors; the column layout is
    # [TF-IDF terms | skill vocabulary], and query vectors built in
    # recommend_courses() must follow the same order.
    combined_features = hstack([text_vectors, skills_encoded_matrix])

    print("Feature engineering complete.")
    return df, combined_features, unique_skills, text_vectorizer

# --- 5. Recommendation System Logic ---

def recommend_courses(query, data, combined_features, unique_skills, text_vectorizer, top_n=10):
    """
    Recommends courses for a search query, considering both skills and text.
    Returns selected columns of the top N recommended courses.
    """
    synonym_mapping = load_synonym_mapping()  # Load mapping for query processing

    # Process the query as a skill
    standardized_query = standardize_skill(query)
    mapped_query = map_synonyms(standardized_query, synonym_mapping)

    # Create the skill vector for the query
    query_skill_vector = multi_hot_encode_skills([mapped_query], unique_skills)
    query_skill_matrix = csr_matrix(np.array([query_skill_vector]))

    # Vectorize the query text. Use the cleaned query rather than the
    # standardized skill form: standardize_skill() strips spaces, so a
    # multi-word query would otherwise never match any TF-IDF term.
    query_text_vector = text_vectorizer.transform([clean_text(query)])

    # Combine text and skill vectors for the query
    query_combined = hstack([query_text_vector, query_skill_matrix])

    # Calculate cosine similarity against all courses
    similarities = cosine_similarity(query_combined, combined_features).flatten()

    # Get the indices of the top N most similar courses
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Select the display columns for the top courses
    top_courses = data.iloc[top_indices][[
        'course_title', 'Skills', 'Level', 'rating', 'enrolled',
        'num_reviews', 'Instructor', 'Organization', 'URL'
    ]]

    # Sort by rating, then number of reviews, then enrollment (all descending)
    top_courses = top_courses.sort_values(
        by=['rating', 'num_reviews', 'enrolled'], ascending=[False, False, False]
    )
    return top_courses

# --- 6. Gradio Interface ---

def predict_courses(query):
    """Gradio interface function: predicts and renders recommended courses as HTML."""
    recommended_courses = recommend_courses(query, GLOBAL_DF, GLOBAL_COMBINED_FEATURES,
                                            GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER)
    return recommended_courses.to_html(escape=False, index=False)

# --- Main Execution Block ---

if __name__ == "__main__":
    print("Initializing course recommendation system...")
    download_nltk_data()
    GLOBAL_DF = load_and_explore_data()
    GLOBAL_DF, GLOBAL_COMBINED_FEATURES, GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER = engineer_features(GLOBAL_DF)

    print("\nSystem ready. Launching Gradio interface...")
    iface = gr.Interface(
        fn=predict_courses,
        inputs=gr.Textbox(label="Enter a skill (e.g., Python, Machine Learning):"),
        outputs=gr.HTML(label="Recommended Courses"),
        title="Personalized Course Recommendation System",
        description="Enter a skill to get recommended courses based on content and skills.",
    )
    iface.launch()
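
# To run locally, assuming this file is saved as app.py (the usual Hugging
# Face Spaces entry point) and its dependencies are installed:
#   python app.py
# then open the URL Gradio prints (http://127.0.0.1:7860 by default).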