rxhulshxrmx committed on
Commit
a8b3f9e
·
verified ·
1 Parent(s): 2417025

Update course_search.py

Browse files
Files changed (1) hide show
  1. course_search.py +19 -86
course_search.py CHANGED
@@ -8,7 +8,6 @@ from transformers import AutoModel, AutoTokenizer
8
  class CourseSearchSystem:
9
  def __init__(self):
10
  self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
11
- # Load model and tokenizer directly using transformers
12
  self.model_name = 'sentence-transformers/all-MiniLM-L6-v2'
13
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
14
  self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
@@ -22,18 +21,13 @@ class CourseSearchSystem:
22
 
23
  def get_embeddings(self, texts: List[str]) -> np.ndarray:
24
  """Get embeddings for a list of texts"""
25
- # Tokenize sentences
26
  encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
27
  encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
28
 
29
- # Compute token embeddings
30
  with torch.no_grad():
31
  model_output = self.model(**encoded_input)
32
 
33
- # Perform pooling
34
  sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
35
-
36
- # Normalize embeddings
37
  sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
38
 
39
  return sentence_embeddings.cpu().numpy()
@@ -71,65 +65,33 @@ class CourseSearchSystem:
71
  """Load and prepare the course data and generate embeddings"""
72
  self.courses_df = self.prepare_course_data(df)
73
  self.course_embeddings = self.get_embeddings(self.courses_df['search_text'].tolist())
74
-
75
- def _get_course_emoji(self, course_name: str) -> str:
76
- """Return appropriate emoji based on course topic"""
77
- emoji_mapping = {
78
- 'machine learning': '🤖',
79
- 'deep learning': '🧠',
80
- 'python': '🐍',
81
- 'data': '📊',
82
- 'nlp': '📝',
83
- 'computer vision': '👁️',
84
- 'genai': '✨',
85
- 'ethics': '⚖️',
86
- 'statistics': '📈',
87
- 'visualization': '📊',
88
- 'neural': '🔮',
89
- 'ai': '🤖'
90
- }
91
-
92
- for key, emoji in emoji_mapping.items():
93
- if key in course_name:
94
- return emoji
95
- return '📚' # Default emoji for other courses
96
 
97
  def generate_response(self, query: str, results: List[Dict]) -> str:
98
- """Generate an enhanced natural language response with course recommendations"""
99
  response_parts = []
100
 
101
- # Dynamic introduction based on number of results
102
  if len(results) == 1:
103
- response_parts.append(f"📚 I found an excellent free course matching your search for '{query}':")
104
  else:
105
- response_parts.append(f"📚 I found {len(results)} relevant free courses matching your search for '{query}':")
106
 
107
  # Course details
108
  for i, result in enumerate(results, 1):
109
- # Format course name with emoji based on topic
110
  course_name = result['course_name']
111
- emoji = self._get_course_emoji(course_name.lower())
112
- course_section = f"\n{emoji} **{i}. {course_name}**\n"
113
 
114
- # Clean rating display (out of 5)
115
  rating = result['ratings']
116
  rating_display = f"{rating}/5.0"
117
- stars = "★" * int(rating) + ("½" if rating % 1 >= 0.5 else "")
118
- stars = stars.ljust(5, "☆")
119
- course_section += f"**Rating:** {stars} ({rating_display})\n"
120
 
121
- # Add difficulty with color-coded emoji
122
- difficulty_emoji = {
123
- 'Beginner': '🟢',
124
- 'Intermediate': '🟡',
125
- 'Advanced': '🔴'
126
- }.get(result['difficulty'], '⚪')
127
- course_section += f"**Level:** {difficulty_emoji} {result['difficulty']}\n"
128
 
129
  # Add duration if available
130
  if result['course_time']:
131
- duration_emoji = '⏱️'
132
- course_section += f"**Duration:** {duration_emoji} {result['course_time']} hours\n"
133
 
134
  # Format key takeaways with bullet points
135
  if result['key_takeaways'] and result['key_takeaways'] != 'Course details not available.':
@@ -139,27 +101,26 @@ class CourseSearchSystem:
139
  for takeaway in takeaways:
140
  cleaned = takeaway.strip('. ,')
141
  if cleaned:
142
- # Limit takeaway length for better readability
143
  if len(cleaned) > 100:
144
  cleaned = cleaned[:97] + "..."
145
  formatted_takeaways.append(f"• {cleaned}")
146
- course_section += "\n".join(formatted_takeaways[:3]) # Limit to top 3 takeaways
147
 
148
  if len(takeaways) > 3:
149
- course_section += "\n• *And more...*"
150
 
151
  # Add relevance score as a percentage
152
  similarity_percentage = int(result['similarity_score'] * 100)
153
- course_section += f"\n**Match Score:** {'🎯' if similarity_percentage > 90 else '📊'} {similarity_percentage}%"
154
 
155
- # Add course link with clear CTA
156
- course_section += f"\n\n➡️ [Start Learning Now]({result['url']})\n"
157
 
158
  response_parts.append(course_section)
159
 
160
- # Add helpful conclusion with next steps
161
  response_parts.append("\n---\n")
162
- response_parts.append("💡 **Pro Tips:**")
162
  response_parts.append("• Courses are sorted by relevance to your search")
163
  response_parts.append("• All courses are free and include hands-on projects")
164
  response_parts.append("• Certificates are provided upon completion")
@@ -168,16 +129,9 @@ class CourseSearchSystem:
168
 
169
  def search_courses(self, query: str, top_k: int = 5) -> str:
170
  """Search for courses and return formatted response"""
171
- # Preprocess query
172
  query = self.preprocess_text(query)
173
-
174
- # Generate query embedding
175
  query_embedding = self.get_embeddings([query])[0]
176
-
177
- # Calculate similarities
178
  similarities = np.dot(self.course_embeddings, query_embedding)
179
-
180
- # Get top k results
181
  top_indices = np.argsort(similarities)[-top_k:][::-1]
182
 
183
  results = []
@@ -185,7 +139,7 @@ class CourseSearchSystem:
185
  course = self.courses_df.iloc[idx]
186
  results.append({
187
  'course_name': course['Course Name'],
188
- 'key_takeaways': course['Key Takeaways'],
189
  'course_time': course['Course Time'],
190
  'ratings': course['Ratings'],
191
  'difficulty': course['Difficulty'],
@@ -193,25 +147,4 @@ class CourseSearchSystem:
193
  'url': course['Website']
194
  })
195
 
196
- # Generate formatted response
197
- return self.generate_response(query, results)
198
-
199
-
200
- if __name__ == "__main__":
201
- df = pd.read_csv('course_data.csv')
202
- search_system = CourseSearchSystem()
203
- search_system.load_and_prepare_data(df)
204
-
205
- test_queries = [
206
- "machine learning for beginners",
207
- "natural language processing",
208
- "computer vision courses",
209
- "data preprocessing tutorials",
210
- "generative AI learning"
211
- ]
212
-
213
- for query in test_queries:
214
- print(f"\nTesting query: '{query}'\n")
215
- response = search_system.search_courses(query, top_k=3)
216
- print(response)
217
- print("\n" + "="*80 + "\n")
 
8
  class CourseSearchSystem:
9
  def __init__(self):
10
  self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
11
  self.model_name = 'sentence-transformers/all-MiniLM-L6-v2'
12
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
13
  self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
 
21
 
22
  def get_embeddings(self, texts: List[str]) -> np.ndarray:
23
  """Get embeddings for a list of texts"""
 
24
  encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
25
  encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
26
 
 
27
  with torch.no_grad():
28
  model_output = self.model(**encoded_input)
29
 
 
30
  sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
 
 
31
  sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
32
 
33
  return sentence_embeddings.cpu().numpy()
 
65
  """Load and prepare the course data and generate embeddings"""
66
  self.courses_df = self.prepare_course_data(df)
67
  self.course_embeddings = self.get_embeddings(self.courses_df['search_text'].tolist())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  def generate_response(self, query: str, results: List[Dict]) -> str:
70
+ """Generate a professional response with course recommendations"""
71
  response_parts = []
72
 
73
+ # Introduction based on number of results
74
  if len(results) == 1:
75
+ response_parts.append(f"I found an excellent free course matching your search for '{query}':")
76
  else:
77
+ response_parts.append(f"I found {len(results)} relevant free courses matching your search for '{query}':")
78
 
79
  # Course details
80
  for i, result in enumerate(results, 1):
 
81
  course_name = result['course_name']
82
+ course_section = f"\n**{i}. {course_name}**\n"
 
83
 
84
+ # Clean rating display
85
  rating = result['ratings']
86
  rating_display = f"{rating}/5.0"
87
+ course_section += f"**Rating:** {rating_display}\n"
 
 
88
 
89
+ # Add difficulty
90
+ course_section += f"**Level:** {result['difficulty']}\n"
 
 
 
 
 
91
 
92
  # Add duration if available
93
  if result['course_time']:
94
+ course_section += f"**Duration:** {result['course_time']} hours\n"
 
95
 
96
  # Format key takeaways with bullet points
97
  if result['key_takeaways'] and result['key_takeaways'] != 'Course details not available.':
 
101
  for takeaway in takeaways:
102
  cleaned = takeaway.strip('. ,')
103
  if cleaned:
 
104
  if len(cleaned) > 100:
105
  cleaned = cleaned[:97] + "..."
106
  formatted_takeaways.append(f"• {cleaned}")
107
+ course_section += "\n".join(formatted_takeaways[:3])
108
 
109
  if len(takeaways) > 3:
110
+ course_section += "\n• And more..."
111
 
112
  # Add relevance score as a percentage
113
  similarity_percentage = int(result['similarity_score'] * 100)
114
+ course_section += f"\n**Match Score:** {similarity_percentage}%"
115
 
116
+ # Add course link
117
+ course_section += f"\n\n[Start Course]({result['url']})\n"
118
 
119
  response_parts.append(course_section)
120
 
121
+ # Add helpful conclusion
122
  response_parts.append("\n---\n")
123
+ response_parts.append("**Notes:**")
124
  response_parts.append("• Courses are sorted by relevance to your search")
125
  response_parts.append("• All courses are free and include hands-on projects")
126
  response_parts.append("• Certificates are provided upon completion")
 
129
 
130
  def search_courses(self, query: str, top_k: int = 5) -> str:
131
  """Search for courses and return formatted response"""
 
132
  query = self.preprocess_text(query)
 
 
133
  query_embedding = self.get_embeddings([query])[0]
 
 
134
  similarities = np.dot(self.course_embeddings, query_embedding)
 
 
135
  top_indices = np.argsort(similarities)[-top_k:][::-1]
136
 
137
  results = []
 
139
  course = self.courses_df.iloc[idx]
140
  results.append({
141
  'course_name': course['Course Name'],
142
+ 'key_takeaways': course['Key_Takeaways'],
143
  'course_time': course['Course Time'],
144
  'ratings': course['Ratings'],
145
  'difficulty': course['Difficulty'],
 
147
  'url': course['Website']
148
  })
149
 
150
+ return self.generate_response(query, results)