Spaces:

pavan-genai
/

personalized-learning

Sleeping

App Files Files Community

pavan-genai committed on May 30, 2025

Commit

7905ce7

verified ·

1 Parent(s): 5f155ac

Create app.py

Browse files

Files changed (1) hide show

app.py +214 -0

app.py ADDED Viewed

	@@ -0,0 +1,214 @@

+import pandas as pd
+import numpy as np
+import re
+import json
+import matplotlib.pyplot as plt
+import seaborn as sns
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+from datasets import load_dataset
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from scipy.sparse import hstack, csr_matrix
+import gradio as gr
+# --- 1. Data Loading and Initial Exploration ---
+def load_and_explore_data():
+    """
+    Loads the Coursera course dataset and performs initial data exploration.
+    Returns the loaded DataFrame.
+    """
+    print("Loading dataset...")
+    ds = load_dataset("azrai99/coursera-course-dataset")
+    df = ds['train'].to_pandas()
+    print("Dataset loaded successfully.")
+    return df
+# --- 2. Text Preprocessing Utilities ---
+def download_nltk_data():
+    """Downloads necessary NLTK data if not already present."""
+    try:
+        stopwords.words('english')
+    except LookupError:
+        nltk.download('stopwords')
+    try:
+        word_tokenize("test")
+    except LookupError:
+        nltk.download('punkt')
+    try:
+        WordNetLemmatizer().lemmatize("test")
+    except LookupError:
+        nltk.download('wordnet')
+        nltk.download('omw-1.4') # Open Multilingual Wordnet for WordNetLemmatizer
+def clean_text(text):
+    """Converts text to lowercase and removes punctuation."""
+    text = str(text).lower()
+    text = re.sub(r'[^\w\s]', '', text)
+    return text
+def process_tokens(tokens, stop_words, lemmatizer):
+    """Removes stopwords and performs lemmatization on a list of tokens."""
+    tokens = [word for word in tokens if word not in stop_words]
+    tokens = [lemmatizer.lemmatize(word) for word in tokens]
+    return tokens
+# --- 3. Skill Standardization and Encoding ---
+def standardize_skill(skill):
+    """Standardizes a skill name (lowercase, strip, alphanumeric only)."""
+    skill = skill.lower().strip()
+    skill = ''.join(c for c in skill if c.isalnum())
+    return skill
+def load_synonym_mapping(filepath="synonyms.json"):
+    try:
+        with open(filepath, "r") as f:
+            synonym_mapping = json.load(f)
+    except FileNotFoundError:
+        print(f"Warning: '{filepath}' not found. Proceeding without skill synonym mapping.")
+        synonym_mapping = {}
+    return synonym_mapping
+def map_synonyms(skill, synonym_mapping):
+    """Maps a skill to its canonical form using the synonym mapping."""
+    return synonym_mapping.get(skill, skill)
+def process_course_skills(skills_string, synonym_mapping):
+    """Processes skills string: standardization, splitting, and synonym mapping."""
+    if pd.isna(skills_string): # Handle NaN values in Skills column
+        return []
+    skills_list = [s.strip() for s in skills_string.split(',')]
+    standardized_skills = [standardize_skill(s) for s in skills_list]
+    mapped_skills = [map_synonyms(s, synonym_mapping) for s in standardized_skills]
+    return mapped_skills
+def multi_hot_encode_skills(skills, all_unique_skills):
+    """Multi-hot encodes a list of skills based on a global vocabulary."""
+    encoding = [1 if skill in skills else 0 for skill in all_unique_skills]
+    return encoding
+# --- 4. Feature Engineering ---
+def engineer_features(df):
+    """
+    Performs text preprocessing, skill standardization, and combines features
+    into a single matrix for similarity calculation.
+    """
+    print("\nStarting feature engineering...")
+    # Initialize NLTK components
+    stop_words = set(stopwords.words('english'))
+    lemmatizer = WordNetLemmatizer()
+    synonym_mapping = load_synonym_mapping()
+    # Text processing
+    df['Description'] = df['Description'].fillna('No Description')
+    df['title'] = df['title'].fillna('No Title')
+    df['text'] = df['title'] + ' ' + df['Description']
+    df['text'] = df['text'].apply(clean_text)
+    df['tokens'] = df['text'].apply(word_tokenize)
+    df['tokens'] = df['tokens'].apply(lambda x: process_tokens(x, stop_words, lemmatizer))
+    df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))
+    # Create a copy of the original title for display
+    df['coarse_title'] = df['title']
+    # Skill processing
+    df['skills_list'] = df['Skills'].apply(lambda x: process_course_skills(x, synonym_mapping))
+    # Building skill vocabulary
+    all_skills = []
+    for skills in df['skills_list']:
+        all_skills.extend(skills)
+    unique_skills = sorted(list(set(all_skills)))
+    df['skills_encoded'] = df['skills_list'].apply(lambda x: multi_hot_encode_skills(x, unique_skills))
+    # TF-IDF Vectorization for text
+    text_vectorizer = TfidfVectorizer()
+    text_vectors = text_vectorizer.fit_transform(df['processed_text'])
+    # Convert skills_encoded to sparse matrix
+    skills_encoded_matrix = csr_matrix(np.array(df['skills_encoded'].tolist()))
+    # Combine text vectors and skills vectors
+    combined_features = hstack([text_vectors, skills_encoded_matrix])
+    print("Feature engineering complete.")
+    return df, combined_features, unique_skills, text_vectorizer
+# --- 5. Recommendation System Logic ---
+def recommend_courses(query, data, combined_features, unique_skills, text_vectorizer, top_n=10):
+    """
+    Recommends courses based on a search query, considering both skills and text.
+    Returns the specified columns of the top N recommended courses.
+    """
+    synonym_mapping = load_synonym_mapping() # Load mapping for query processing
+    # Process query
+    standardized_query = standardize_skill(query)
+    mapped_query = map_synonyms(standardized_query, synonym_mapping)
+    # Create skill vector for the query
+    query_skill_vector = multi_hot_encode_skills([mapped_query], unique_skills)
+    query_skill_matrix = csr_matrix(np.array([query_skill_vector]))
+    # Vectorize the query text
+    query_text_vector = text_vectorizer.transform([standardized_query])
+    # Combine skill and text vectors for the query
+    query_combined = hstack([query_text_vector, query_skill_matrix])
+    # Calculate cosine similarity
+    similarities = cosine_similarity(query_combined, combined_features).flatten()
+    # Get top N courses
+    top_indices = similarities.argsort()[-top_n:][::-1]
+    # Select and sort top courses
+    top_courses = data.iloc[top_indices][[
+        'coarse_title', 'Skills', 'Level', 'rating', 'enrolled',
+        'num_reviews', 'Instructor', 'Organization', 'URL'
+    ]]
+    # Sort by rating (descending), then number of reviews (descending), then enrolled (descending)
+    top_courses = top_courses.sort_values(
+        by=['rating', 'num_reviews', 'enrolled'], ascending=[False, False, False]
+    )
+    return top_courses
+# --- 6. Gradio Interface ---
+def predict_courses(query):
+    """Gradio interface function to predict and display recommended courses."""
+    recommended_courses = recommend_courses(query, GLOBAL_DF, GLOBAL_COMBINED_FEATURES,
+                                           GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER)
+    return recommended_courses.to_html(escape=False, index=False)
+# --- Main Execution Block ---
+if __name__ == "__main__":
+    print("Initializing course recommendation system...")
+    download_nltk_data()
+    GLOBAL_DF = load_and_explore_data()
+    GLOBAL_DF, GLOBAL_COMBINED_FEATURES, GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER = engineer_features(GLOBAL_DF)
+    print("\nSystem ready. Launching Gradio interface...")
+    iface = gr.Interface(
+        fn=predict_courses,
+        inputs=gr.Textbox(label="Enter a skill (e.g., Python, Machine Learning):"),
+        outputs=gr.HTML(label="Recommended Courses"),
+        title="Personalized Course Recommendation System",
+        description="Enter a skill to get recommended courses based on content and skills."
+    )
+    iface.launch()