pavan-genai committed
Commit 7905ce7 · verified · 1 Parent(s): 5f155ac

Create app.py

Files changed (1)
  1. app.py +214 -0
app.py ADDED
@@ -0,0 +1,214 @@
import pandas as pd
import numpy as np
import re
import json

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix

import gradio as gr

# --- 1. Data Loading and Initial Exploration ---

def load_and_explore_data():
    """
    Loads the Coursera course dataset and returns it as a pandas DataFrame.
    """
    print("Loading dataset...")
    ds = load_dataset("azrai99/coursera-course-dataset")
    df = ds['train'].to_pandas()
    print("Dataset loaded successfully.")
    return df

# --- 2. Text Preprocessing Utilities ---

def download_nltk_data():
    """Downloads the required NLTK data if it is not already present."""
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
    try:
        word_tokenize("test")
    except LookupError:
        nltk.download('punkt')
    try:
        WordNetLemmatizer().lemmatize("test")
    except LookupError:
        nltk.download('wordnet')
        nltk.download('omw-1.4')  # Open Multilingual Wordnet, used by WordNetLemmatizer

def clean_text(text):
    """Converts text to lowercase and removes punctuation."""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

def process_tokens(tokens, stop_words, lemmatizer):
    """Removes stopwords and lemmatizes a list of tokens."""
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

# --- 3. Skill Standardization and Encoding ---

def standardize_skill(skill):
    """Standardizes a skill name (lowercase, stripped, alphanumeric characters only)."""
    skill = skill.lower().strip()
    skill = ''.join(c for c in skill if c.isalnum())
    return skill

def load_synonym_mapping(filepath="synonyms.json"):
    """Loads the skill synonym mapping, falling back to an empty mapping if the file is missing."""
    try:
        with open(filepath, "r") as f:
            synonym_mapping = json.load(f)
    except FileNotFoundError:
        print(f"Warning: '{filepath}' not found. Proceeding without skill synonym mapping.")
        synonym_mapping = {}
    return synonym_mapping
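# A minimal sketch of the synonym file this function expects: a flat JSON object
# mapping standardized skill spellings to a canonical form. The entries below are
# illustrative assumptions, not the contents of the actual synonyms.json:
#
#   {
#       "ml": "machinelearning",
#       "py": "python",
#       "datasciences": "datascience"
#   }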
def map_synonyms(skill, synonym_mapping):
    """Maps a skill to its canonical form using the synonym mapping."""
    return synonym_mapping.get(skill, skill)

def process_course_skills(skills_string, synonym_mapping):
    """Processes a skills string: splitting, standardization, and synonym mapping."""
    if pd.isna(skills_string):  # Handle NaN values in the Skills column
        return []
    skills_list = [s.strip() for s in skills_string.split(',')]
    standardized_skills = [standardize_skill(s) for s in skills_list]
    mapped_skills = [map_synonyms(s, synonym_mapping) for s in standardized_skills]
    return mapped_skills

def multi_hot_encode_skills(skills, all_unique_skills):
    """Multi-hot encodes a list of skills based on a global vocabulary."""
    encoding = [1 if skill in skills else 0 for skill in all_unique_skills]
    return encoding
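# For example, with all_unique_skills = ['datascience', 'python', 'sql'],
# multi_hot_encode_skills(['python'], all_unique_skills) returns [0, 1, 0];
# the recommendation logic below reuses this to embed a single query skill
# in the same vocabulary space as the courses.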
# --- 4. Feature Engineering ---

def engineer_features(df):
    """
    Performs text preprocessing and skill standardization, and combines the
    resulting features into a single matrix for similarity calculation.
    """
    print("\nStarting feature engineering...")

    # Initialize NLTK components
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    synonym_mapping = load_synonym_mapping()

    # Text processing
    df['Description'] = df['Description'].fillna('No Description')
    df['title'] = df['title'].fillna('No Title')
    df['text'] = df['title'] + ' ' + df['Description']
    df['text'] = df['text'].apply(clean_text)
    df['tokens'] = df['text'].apply(word_tokenize)
    df['tokens'] = df['tokens'].apply(lambda x: process_tokens(x, stop_words, lemmatizer))
    df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))

    # Create a copy of the original title for display
    df['coarse_title'] = df['title']

    # Skill processing
    df['skills_list'] = df['Skills'].apply(lambda x: process_course_skills(x, synonym_mapping))

    # Build the skill vocabulary
    all_skills = []
    for skills in df['skills_list']:
        all_skills.extend(skills)
    unique_skills = sorted(list(set(all_skills)))

    df['skills_encoded'] = df['skills_list'].apply(lambda x: multi_hot_encode_skills(x, unique_skills))

    # TF-IDF vectorization for text
    text_vectorizer = TfidfVectorizer()
    text_vectors = text_vectorizer.fit_transform(df['processed_text'])

    # Convert skills_encoded to a sparse matrix
    skills_encoded_matrix = csr_matrix(np.array(df['skills_encoded'].tolist()))

    # Combine text vectors and skill vectors
    combined_features = hstack([text_vectors, skills_encoded_matrix])
    print("Feature engineering complete.")

    return df, combined_features, unique_skills, text_vectorizer
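# Note: combined_features is a sparse matrix of shape
# (n_courses, n_tfidf_terms + n_unique_skills); each course row concatenates its
# TF-IDF text vector with its multi-hot skill vector, and queries are embedded
# into this same combined space before cosine similarity is computed below.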
# --- 5. Recommendation System Logic ---

def recommend_courses(query, data, combined_features, unique_skills, text_vectorizer, top_n=10):
    """
    Recommends courses based on a search query, considering both skills and text.
    Returns the specified columns of the top N recommended courses.
    """
    synonym_mapping = load_synonym_mapping()  # Load mapping for query processing

    # Process query
    standardized_query = standardize_skill(query)
    mapped_query = map_synonyms(standardized_query, synonym_mapping)

    # Create skill vector for the query
    query_skill_vector = multi_hot_encode_skills([mapped_query], unique_skills)
    query_skill_matrix = csr_matrix(np.array([query_skill_vector]))

    # Vectorize the query text
    query_text_vector = text_vectorizer.transform([standardized_query])

    # Combine skill and text vectors for the query
    query_combined = hstack([query_text_vector, query_skill_matrix])

    # Calculate cosine similarity
    similarities = cosine_similarity(query_combined, combined_features).flatten()

    # Get top N courses
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Select and sort top courses
    top_courses = data.iloc[top_indices][[
        'coarse_title', 'Skills', 'Level', 'rating', 'enrolled',
        'num_reviews', 'Instructor', 'Organization', 'URL'
    ]]

    # Sort by rating (descending), then number of reviews (descending), then enrolled (descending)
    top_courses = top_courses.sort_values(
        by=['rating', 'num_reviews', 'enrolled'], ascending=[False, False, False]
    )

    return top_courses

# --- 6. Gradio Interface ---

def predict_courses(query):
    """Gradio interface function to predict and display recommended courses."""
    recommended_courses = recommend_courses(query, GLOBAL_DF, GLOBAL_COMBINED_FEATURES,
                                            GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER)
    return recommended_courses.to_html(escape=False, index=False)

# --- Main Execution Block ---

if __name__ == "__main__":
    print("Initializing course recommendation system...")
    download_nltk_data()
    GLOBAL_DF = load_and_explore_data()
    GLOBAL_DF, GLOBAL_COMBINED_FEATURES, GLOBAL_UNIQUE_SKILLS, GLOBAL_TEXT_VECTORIZER = engineer_features(GLOBAL_DF)

    print("\nSystem ready. Launching Gradio interface...")
    iface = gr.Interface(
        fn=predict_courses,
        inputs=gr.Textbox(label="Enter a skill (e.g., Python, Machine Learning):"),
        outputs=gr.HTML(label="Recommended Courses"),
        title="Personalized Course Recommendation System",
        description="Enter a skill to get recommended courses based on content and skills."
    )
    iface.launch()
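A minimal sketch of exercising the recommender directly, without the Gradio interface, assuming the dataset loads with the columns referenced above; the query "python" and top_n value are only illustrative:

# Hypothetical usage, not part of app.py: build the pipeline and query it directly.
download_nltk_data()
df = load_and_explore_data()
df, features, skills, vectorizer = engineer_features(df)
results = recommend_courses("python", df, features, skills, vectorizer, top_n=5)
print(results[['coarse_title', 'rating', 'URL']].to_string(index=False))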