# Source: app.py from pavan-genai's Hugging Face Space (commit d77901d, verified).
import pandas as pd
import numpy as np
import re
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix
import gradio as gr
# --- 1. Data Loading and Initial Exploration ---
def load_and_explore_data():
    """Fetch the Coursera course dataset and return its train split.

    Returns:
        pd.DataFrame: the 'train' split of azrai99/coursera-course-dataset.
    """
    print("Loading dataset...")
    dataset = load_dataset("azrai99/coursera-course-dataset")
    courses_df = dataset["train"].to_pandas()
    print("Dataset loaded successfully.")
    return courses_df
# --- 2. Text Preprocessing Utilities ---
def download_nltk_data():
    """Ensure the NLTK resources this app needs are available locally.

    Each probe call raises LookupError when its corpus is missing, in which
    case the corresponding NLTK package(s) are downloaded.
    """
    probes = [
        (lambda: stopwords.words('english'), ['stopwords']),
        # omw-1.4 (Open Multilingual Wordnet) is required by WordNetLemmatizer.
        (lambda: WordNetLemmatizer().lemmatize("test"), ['wordnet', 'omw-1.4']),
    ]
    for probe, packages in probes:
        try:
            probe()
        except LookupError:
            for package in packages:
                nltk.download(package)
def clean_text(text):
    """Lowercase *text* and drop every character that is not alphanumeric or whitespace."""
    lowered = str(text).lower()
    return re.sub(r'[^\w\s]', '', lowered)
def simple_tokenize(text):
    """Split *text* into word tokens via a word-boundary regex (avoids NLTK punkt)."""
    return [match.group(0) for match in re.finditer(r'\b\w+\b', text)]
def process_tokens(tokens, stop_words, lemmatizer):
    """Drop stopwords from *tokens*, then lemmatize each surviving token."""
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
# --- 3. Skill Standardization and Encoding ---
def standardize_skill(skill):
    """Normalize a raw skill label: lowercase, trimmed, alphanumeric characters only."""
    normalized = skill.lower().strip()
    return ''.join(filter(str.isalnum, normalized))
def load_synonym_mapping(filepath="synonyms.json"):
    """Load the skill-synonym mapping from a JSON file.

    Args:
        filepath: path to a JSON object mapping skill variants to canonical names.

    Returns:
        dict: the parsed mapping, or {} when the file is missing or is not
        valid JSON (both degrade gracefully with a warning).
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Warning: '{filepath}' not found. Proceeding without skill synonym mapping.")
    except json.JSONDecodeError as err:
        # A corrupt mapping file should degrade the same way as a missing one
        # instead of crashing app startup.
        print(f"Warning: '{filepath}' is not valid JSON ({err}). "
              "Proceeding without skill synonym mapping.")
    return {}
def map_synonyms(skill, synonym_mapping):
    """Resolve *skill* to its canonical form; unknown skills pass through unchanged."""
    if skill in synonym_mapping:
        return synonym_mapping[skill]
    return skill
def process_course_skills(skills_string, synonym_mapping):
    """Turn a comma-separated skills string into standardized, synonym-resolved skills.

    A NaN value (missing Skills cell) yields an empty list.
    """
    if pd.isna(skills_string):
        return []
    resolved = []
    for raw_skill in skills_string.split(','):
        canonical = map_synonyms(standardize_skill(raw_skill.strip()), synonym_mapping)
        resolved.append(canonical)
    return resolved
def multi_hot_encode_skills(skills, all_unique_skills):
    """Multi-hot encode *skills* against the global skill vocabulary.

    Args:
        skills: iterable of skill names attached to one course (or query).
        all_unique_skills: ordered vocabulary defining the vector layout.

    Returns:
        list[int]: one 0/1 flag per vocabulary entry, in vocabulary order.
    """
    # Membership tests against a set are O(1); the original list scan made
    # this O(len(skills)) per vocabulary entry, i.e. quadratic overall.
    skill_set = set(skills)
    return [1 if skill in skill_set else 0 for skill in all_unique_skills]
# --- 4. Feature Engineering ---
def engineer_features(df):
    """
    Performs text preprocessing, skill standardization, and combines features
    into a single matrix for similarity calculation.

    Args:
        df: course DataFrame with 'title', 'Description', and 'Skills'
            columns (assumes the azrai99/coursera-course-dataset schema --
            TODO confirm column names against the loaded dataset).

    Returns:
        tuple: (df with helper columns added in place, sparse matrix of
        TF-IDF text features hstacked with multi-hot skill features,
        sorted skill vocabulary list, fitted TfidfVectorizer).
    """
    print("\nStarting feature engineering...")
    # Initialize NLTK components (stopwords/wordnet must already be downloaded).
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    synonym_mapping = load_synonym_mapping()
    # Text processing: fill missing fields, concatenate title + description,
    # then clean -> tokenize -> de-stopword/lemmatize -> rejoin for TF-IDF.
    df['Description'] = df['Description'].fillna('No Description')
    df['title'] = df['title'].fillna('No Title')
    df['text'] = df['title'] + ' ' + df['Description']
    df['text'] = df['text'].apply(clean_text)
    df['tokens'] = df['text'].apply(simple_tokenize)
    df['tokens'] = df['tokens'].apply(lambda x: process_tokens(x, stop_words, lemmatizer))
    df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))
    # Create a copy of the original title for display.
    # NOTE(review): 'coarse_title' looks like a typo of 'course_title', but
    # recommend_courses selects this exact column name, so it must stay.
    df['coarse_title'] = df['title']
    # Skill processing: standardize and synonym-map each course's skills.
    df['skills_list'] = df['Skills'].apply(lambda x: process_course_skills(x, synonym_mapping))
    # Building skill vocabulary across all courses (sorted for a stable,
    # reproducible vector layout).
    all_skills = []
    for skills in df['skills_list']:
        all_skills.extend(skills)
    unique_skills = sorted(list(set(all_skills)))
    df['skills_encoded'] = df['skills_list'].apply(lambda x: multi_hot_encode_skills(x, unique_skills))
    # TF-IDF Vectorization for text
    text_vectorizer = TfidfVectorizer()
    text_vectors = text_vectorizer.fit_transform(df['processed_text'])
    # Convert skills_encoded to a sparse matrix so it can be hstacked with TF-IDF.
    skills_encoded_matrix = csr_matrix(np.array(df['skills_encoded'].tolist()))
    # Combine text vectors and skills vectors column-wise; row i = course i.
    combined_features = hstack([text_vectors, skills_encoded_matrix])
    print("Feature engineering complete.")
    return df, combined_features, unique_skills, text_vectorizer
# --- 5. Recommendation System Logic ---
def recommend_courses(query, data, combined_features, unique_skills, text_vectorizer, top_n=10):
    """
    Recommends courses based on a search query, considering both skills and text.
    Returns the specified columns of the top N recommended courses.

    Args:
        query: free-text skill query typed by the user.
        data: course DataFrame produced by engineer_features (must contain
            the display columns selected below, including 'coarse_title').
        combined_features: sparse matrix of TF-IDF text + multi-hot skill
            features, one row per course, from engineer_features.
        unique_skills: vocabulary list used for the multi-hot skill encoding.
        text_vectorizer: fitted TfidfVectorizer used to embed the query text.
        top_n: number of courses to return (default 10).

    Returns:
        pd.DataFrame: top_n courses re-ranked by rating, reviews, enrollment.
    """
    synonym_mapping = load_synonym_mapping()  # Load mapping for query processing
    # Process query
    standardized_query = standardize_skill(query)
    mapped_query = map_synonyms(standardized_query, synonym_mapping)
    # Create skill vector for the query
    query_skill_vector = multi_hot_encode_skills([mapped_query], unique_skills)
    query_skill_matrix = csr_matrix(np.array([query_skill_vector]))
    # Vectorize the query text
    # NOTE(review): the text vector is built from standardized_query, not
    # mapped_query, so synonym mapping only affects the skill channel --
    # presumably intentional; confirm before changing.
    query_text_vector = text_vectorizer.transform([standardized_query])
    # Combine skill and text vectors for the query (same column layout as
    # combined_features: text columns first, then skill columns).
    query_combined = hstack([query_text_vector, query_skill_matrix])
    # Calculate cosine similarity
    similarities = cosine_similarity(query_combined, combined_features).flatten()
    # Get top N courses (indices of the highest similarities, descending).
    top_indices = similarities.argsort()[-top_n:][::-1]
    # Select and sort top courses
    top_courses = data.iloc[top_indices][[
        'coarse_title', 'Skills', 'Level', 'rating', 'enrolled',
        'num_reviews', 'Instructor', 'Organization', 'URL'
    ]]
    # Sort by rating (descending), then number of reviews (descending), then enrolled (descending)
    # -- this re-ranks the similarity hits by popularity, discarding similarity order.
    top_courses = top_courses.sort_values(
        by=['rating', 'num_reviews', 'enrolled'], ascending=[False, False, False]
    )
    return top_courses
# --- 6. Gradio Interface ---
def predict_courses(query):
    """Gradio interface function to predict and display recommended courses.

    Reads the module-level GLOBAL_* state built in the __main__ block and
    renders the result DataFrame as HTML (escape=False so any markup in
    the URL column is preserved).
    """
    recommended_courses = recommend_courses(query, GLOBAL_DF, GLOBAL_COMBINED_FEATURES,
                                            GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER)
    return recommended_courses.to_html(escape=False, index=False)
# --- Main Execution Block ---
if __name__ == "__main__":
    print("Initializing course recommendation system...")
    # Fetch NLTK corpora (stopwords/wordnet) before any text processing runs.
    download_nltk_data()
    GLOBAL_DF = load_and_explore_data()
    # These globals are consumed by predict_courses via the Gradio callback.
    GLOBAL_DF, GLOBAL_COMBINED_FEATURES, GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER = engineer_features(GLOBAL_DF)
    print("\nSystem ready. Launching Gradio interface...")
    iface = gr.Interface(
        fn=predict_courses,
        inputs=gr.Textbox(label="Enter a skill (e.g., Python, Machine Learning):"),
        outputs=gr.HTML(label="Recommended Courses"),
        title="Personalized Course Recommendation System",
        description="Enter a skill to get recommended courses based on content and skills."
    )
    iface.launch()