Spaces:
Sleeping
Sleeping
Update course_search.py
Browse files- course_search.py +19 -86
course_search.py
CHANGED
|
@@ -8,7 +8,6 @@ from transformers import AutoModel, AutoTokenizer
|
|
| 8 |
class CourseSearchSystem:
|
| 9 |
def __init__(self):
|
| 10 |
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 11 |
-
# Load model and tokenizer directly using transformers
|
| 12 |
self.model_name = 'sentence-transformers/all-MiniLM-L6-v2'
|
| 13 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 14 |
self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
|
|
@@ -22,18 +21,13 @@ class CourseSearchSystem:
|
|
| 22 |
|
| 23 |
def get_embeddings(self, texts: List[str]) -> np.ndarray:
|
| 24 |
"""Get embeddings for a list of texts"""
|
| 25 |
-
# Tokenize sentences
|
| 26 |
encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
|
| 27 |
encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
|
| 28 |
|
| 29 |
-
# Compute token embeddings
|
| 30 |
with torch.no_grad():
|
| 31 |
model_output = self.model(**encoded_input)
|
| 32 |
|
| 33 |
-
# Perform pooling
|
| 34 |
sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
|
| 35 |
-
|
| 36 |
-
# Normalize embeddings
|
| 37 |
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
|
| 38 |
|
| 39 |
return sentence_embeddings.cpu().numpy()
|
|
@@ -71,65 +65,33 @@ class CourseSearchSystem:
|
|
| 71 |
"""Load and prepare the course data and generate embeddings"""
|
| 72 |
self.courses_df = self.prepare_course_data(df)
|
| 73 |
self.course_embeddings = self.get_embeddings(self.courses_df['search_text'].tolist())
|
| 74 |
-
|
| 75 |
-
def _get_course_emoji(self, course_name: str) -> str:
|
| 76 |
-
"""Return appropriate emoji based on course topic"""
|
| 77 |
-
emoji_mapping = {
|
| 78 |
-
'machine learning': '🤖',
|
| 79 |
-
'deep learning': '🧠',
|
| 80 |
-
'python': '๐',
|
| 81 |
-
'data': '๐',
|
| 82 |
-
'nlp': '๐',
|
| 83 |
-
'computer vision': '👁️',
|
| 84 |
-
'genai': '✨',
|
| 85 |
-
'ethics': '⚖️',
|
| 86 |
-
'statistics': '๐',
|
| 87 |
-
'visualization': '๐',
|
| 88 |
-
'neural': '🔮',
|
| 89 |
-
'ai': '🤖'
|
| 90 |
-
}
|
| 91 |
-
|
| 92 |
-
for key, emoji in emoji_mapping.items():
|
| 93 |
-
if key in course_name:
|
| 94 |
-
return emoji
|
| 95 |
-
return '๐' # Default emoji for other courses
|
| 96 |
|
| 97 |
def generate_response(self, query: str, results: List[Dict]) -> str:
|
| 98 |
-
"""Generate
|
| 99 |
response_parts = []
|
| 100 |
|
| 101 |
-
#
|
| 102 |
if len(results) == 1:
|
| 103 |
-
response_parts.append(f"
|
| 104 |
else:
|
| 105 |
-
response_parts.append(f"
|
| 106 |
|
| 107 |
# Course details
|
| 108 |
for i, result in enumerate(results, 1):
|
| 109 |
-
# Format course name with emoji based on topic
|
| 110 |
course_name = result['course_name']
|
| 111 |
-
|
| 112 |
-
course_section = f"\n{emoji} **{i}. {course_name}**\n"
|
| 113 |
|
| 114 |
-
# Clean rating display
|
| 115 |
rating = result['ratings']
|
| 116 |
rating_display = f"{rating}/5.0"
|
| 117 |
-
|
| 118 |
-
stars = stars.ljust(5, "โ")
|
| 119 |
-
course_section += f"**Rating:** {stars} ({rating_display})\n"
|
| 120 |
|
| 121 |
-
# Add difficulty
|
| 122 |
-
|
| 123 |
-
'Beginner': '🟢',
|
| 124 |
-
'Intermediate': '🟡',
|
| 125 |
-
'Advanced': '🔴'
|
| 126 |
-
}.get(result['difficulty'], '⚪')
|
| 127 |
-
course_section += f"**Level:** {difficulty_emoji} {result['difficulty']}\n"
|
| 128 |
|
| 129 |
# Add duration if available
|
| 130 |
if result['course_time']:
|
| 131 |
-
|
| 132 |
-
course_section += f"**Duration:** {duration_emoji} {result['course_time']} hours\n"
|
| 133 |
|
| 134 |
# Format key takeaways with bullet points
|
| 135 |
if result['key_takeaways'] and result['key_takeaways'] != 'Course details not available.':
|
|
@@ -139,27 +101,26 @@ class CourseSearchSystem:
|
|
| 139 |
for takeaway in takeaways:
|
| 140 |
cleaned = takeaway.strip('. ,')
|
| 141 |
if cleaned:
|
| 142 |
-
# Limit takeaway length for better readability
|
| 143 |
if len(cleaned) > 100:
|
| 144 |
cleaned = cleaned[:97] + "..."
|
| 145 |
formatted_takeaways.append(f"• {cleaned}")
|
| 146 |
-
course_section += "\n".join(formatted_takeaways[:3])
|
| 147 |
|
| 148 |
if len(takeaways) > 3:
|
| 149 |
-
course_section += "\n•
|
| 150 |
|
| 151 |
# Add relevance score as a percentage
|
| 152 |
similarity_percentage = int(result['similarity_score'] * 100)
|
| 153 |
-
course_section += f"\n**Match Score:** {
|
| 154 |
|
| 155 |
-
# Add course link
|
| 156 |
-
course_section += f"\n\n
|
| 157 |
|
| 158 |
response_parts.append(course_section)
|
| 159 |
|
| 160 |
-
# Add helpful conclusion
|
| 161 |
response_parts.append("\n---\n")
|
| 162 |
-
response_parts.append("
|
| 163 |
response_parts.append("• Courses are sorted by relevance to your search")
|
| 164 |
response_parts.append("• All courses are free and include hands-on projects")
|
| 165 |
response_parts.append("• Certificates are provided upon completion")
|
|
@@ -168,16 +129,9 @@ class CourseSearchSystem:
|
|
| 168 |
|
| 169 |
def search_courses(self, query: str, top_k: int = 5) -> str:
|
| 170 |
"""Search for courses and return formatted response"""
|
| 171 |
-
# Preprocess query
|
| 172 |
query = self.preprocess_text(query)
|
| 173 |
-
|
| 174 |
-
# Generate query embedding
|
| 175 |
query_embedding = self.get_embeddings([query])[0]
|
| 176 |
-
|
| 177 |
-
# Calculate similarities
|
| 178 |
similarities = np.dot(self.course_embeddings, query_embedding)
|
| 179 |
-
|
| 180 |
-
# Get top k results
|
| 181 |
top_indices = np.argsort(similarities)[-top_k:][::-1]
|
| 182 |
|
| 183 |
results = []
|
|
@@ -185,7 +139,7 @@ class CourseSearchSystem:
|
|
| 185 |
course = self.courses_df.iloc[idx]
|
| 186 |
results.append({
|
| 187 |
'course_name': course['Course Name'],
|
| 188 |
-
'key_takeaways': course['
|
| 189 |
'course_time': course['Course Time'],
|
| 190 |
'ratings': course['Ratings'],
|
| 191 |
'difficulty': course['Difficulty'],
|
|
@@ -193,25 +147,4 @@ class CourseSearchSystem:
|
|
| 193 |
'url': course['Website']
|
| 194 |
})
|
| 195 |
|
| 196 |
-
|
| 197 |
-
return self.generate_response(query, results)
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
if __name__ == "__main__":
|
| 201 |
-
df = pd.read_csv('course_data.csv')
|
| 202 |
-
search_system = CourseSearchSystem()
|
| 203 |
-
search_system.load_and_prepare_data(df)
|
| 204 |
-
|
| 205 |
-
test_queries = [
|
| 206 |
-
"machine learning for beginners",
|
| 207 |
-
"natural language processing",
|
| 208 |
-
"computer vision courses",
|
| 209 |
-
"data preprocessing tutorials",
|
| 210 |
-
"generative AI learning"
|
| 211 |
-
]
|
| 212 |
-
|
| 213 |
-
for query in test_queries:
|
| 214 |
-
print(f"\nTesting query: '{query}'\n")
|
| 215 |
-
response = search_system.search_courses(query, top_k=3)
|
| 216 |
-
print(response)
|
| 217 |
-
print("\n" + "="*80 + "\n")
|
|
|
|
| 8 |
class CourseSearchSystem:
|
| 9 |
def __init__(self):
|
| 10 |
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
|
|
|
| 11 |
self.model_name = 'sentence-transformers/all-MiniLM-L6-v2'
|
| 12 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 13 |
self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
|
|
|
|
| 21 |
|
| 22 |
def get_embeddings(self, texts: List[str]) -> np.ndarray:
|
| 23 |
"""Get embeddings for a list of texts"""
|
|
|
|
| 24 |
encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
|
| 25 |
encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
|
| 26 |
|
|
|
|
| 27 |
with torch.no_grad():
|
| 28 |
model_output = self.model(**encoded_input)
|
| 29 |
|
|
|
|
| 30 |
sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
|
|
|
|
|
|
|
| 31 |
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
|
| 32 |
|
| 33 |
return sentence_embeddings.cpu().numpy()
|
|
|
|
| 65 |
"""Load and prepare the course data and generate embeddings"""
|
| 66 |
self.courses_df = self.prepare_course_data(df)
|
| 67 |
self.course_embeddings = self.get_embeddings(self.courses_df['search_text'].tolist())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
def generate_response(self, query: str, results: List[Dict]) -> str:
|
| 70 |
+
"""Generate a professional response with course recommendations"""
|
| 71 |
response_parts = []
|
| 72 |
|
| 73 |
+
# Introduction based on number of results
|
| 74 |
if len(results) == 1:
|
| 75 |
+
response_parts.append(f"I found an excellent free course matching your search for '{query}':")
|
| 76 |
else:
|
| 77 |
+
response_parts.append(f"I found {len(results)} relevant free courses matching your search for '{query}':")
|
| 78 |
|
| 79 |
# Course details
|
| 80 |
for i, result in enumerate(results, 1):
|
|
|
|
| 81 |
course_name = result['course_name']
|
| 82 |
+
course_section = f"\n**{i}. {course_name}**\n"
|
|
|
|
| 83 |
|
| 84 |
+
# Clean rating display
|
| 85 |
rating = result['ratings']
|
| 86 |
rating_display = f"{rating}/5.0"
|
| 87 |
+
course_section += f"**Rating:** {rating_display}\n"
|
|
|
|
|
|
|
| 88 |
|
| 89 |
+
# Add difficulty
|
| 90 |
+
course_section += f"**Level:** {result['difficulty']}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
# Add duration if available
|
| 93 |
if result['course_time']:
|
| 94 |
+
course_section += f"**Duration:** {result['course_time']} hours\n"
|
|
|
|
| 95 |
|
| 96 |
# Format key takeaways with bullet points
|
| 97 |
if result['key_takeaways'] and result['key_takeaways'] != 'Course details not available.':
|
|
|
|
| 101 |
for takeaway in takeaways:
|
| 102 |
cleaned = takeaway.strip('. ,')
|
| 103 |
if cleaned:
|
|
|
|
| 104 |
if len(cleaned) > 100:
|
| 105 |
cleaned = cleaned[:97] + "..."
|
| 106 |
formatted_takeaways.append(f"• {cleaned}")
|
| 107 |
+
course_section += "\n".join(formatted_takeaways[:3])
|
| 108 |
|
| 109 |
if len(takeaways) > 3:
|
| 110 |
+
course_section += "\n• And more..."
|
| 111 |
|
| 112 |
# Add relevance score as a percentage
|
| 113 |
similarity_percentage = int(result['similarity_score'] * 100)
|
| 114 |
+
course_section += f"\n**Match Score:** {similarity_percentage}%"
|
| 115 |
|
| 116 |
+
# Add course link
|
| 117 |
+
course_section += f"\n\n[Start Course]({result['url']})\n"
|
| 118 |
|
| 119 |
response_parts.append(course_section)
|
| 120 |
|
| 121 |
+
# Add helpful conclusion
|
| 122 |
response_parts.append("\n---\n")
|
| 123 |
+
response_parts.append("**Notes:**")
|
| 124 |
response_parts.append("• Courses are sorted by relevance to your search")
|
| 125 |
response_parts.append("• All courses are free and include hands-on projects")
|
| 126 |
response_parts.append("• Certificates are provided upon completion")
|
|
|
|
| 129 |
|
| 130 |
def search_courses(self, query: str, top_k: int = 5) -> str:
|
| 131 |
"""Search for courses and return formatted response"""
|
|
|
|
| 132 |
query = self.preprocess_text(query)
|
|
|
|
|
|
|
| 133 |
query_embedding = self.get_embeddings([query])[0]
|
|
|
|
|
|
|
| 134 |
similarities = np.dot(self.course_embeddings, query_embedding)
|
|
|
|
|
|
|
| 135 |
top_indices = np.argsort(similarities)[-top_k:][::-1]
|
| 136 |
|
| 137 |
results = []
|
|
|
|
| 139 |
course = self.courses_df.iloc[idx]
|
| 140 |
results.append({
|
| 141 |
'course_name': course['Course Name'],
|
| 142 |
+
'key_takeaways': course['Key_Takeaways'],
|
| 143 |
'course_time': course['Course Time'],
|
| 144 |
'ratings': course['Ratings'],
|
| 145 |
'difficulty': course['Difficulty'],
|
|
|
|
| 147 |
'url': course['Website']
|
| 148 |
})
|
| 149 |
|
| 150 |
+
return self.generate_response(query, results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|