Spaces:
Sleeping
Sleeping
File size: 6,665 Bytes
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
import re
import torch
from transformers import AutoModel, AutoTokenizer
class CourseSearchSystem:
    """Semantic search over a catalogue of free courses.

    Course text is embedded with a sentence-transformer model; a query is
    embedded the same way and courses are ranked by the dot product of the
    L2-normalised vectors (i.e. cosine similarity). The ranked courses are
    rendered as a Markdown answer.

    Typical use::

        system = CourseSearchSystem()
        system.load_and_prepare_data(courses_dataframe)
        markdown = system.search_courses("python for beginners")
    """

    # Placeholder written into missing 'Key Takeaways' cells; the response
    # formatter recognises it and omits the "What you'll learn" section.
    _DEFAULT_TAKEAWAYS = 'Course details not available.'
    # Limits applied when rendering a course's takeaways.
    _MAX_TAKEAWAYS_SHOWN = 3
    _MAX_TAKEAWAY_CHARS = 100

    def __init__(self):
        """Load the tokenizer and embedding model onto GPU if available, else CPU."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = 'sentence-transformers/all-MiniLM-L6-v2'
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
        # The model is only used for inference.
        self.model.eval()

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings into one vector per sequence.

        Args:
            model_output: Model output whose first element holds the
                per-token embeddings.
            attention_mask: Mask with 1 for real tokens and 0 for padding.

        Returns:
            Tensor with one mask-weighted mean embedding per sequence.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the token count so an all-padding row cannot divide by zero.
        token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / token_counts

    def get_embeddings(self, texts: List[str], batch_size: int = 64) -> np.ndarray:
        """Embed texts as L2-normalised sentence vectors.

        Args:
            texts: Strings to embed.
            batch_size: Number of texts sent through the model at once, so
                memory use does not grow with the size of the catalogue.

        Returns:
            Array with one unit-length row per input text. An empty input
            yields an array with zero rows.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if len(texts) == 0:
            # The tokenizer cannot encode an empty batch; return a
            # correctly-shaped empty matrix so callers can still rank.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        embedded_batches = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encoded_input = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=512,
            )
            encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            pooled = self.mean_pooling(model_output, encoded_input['attention_mask'])
            pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
            embedded_batches.append(pooled.cpu().numpy())
        return np.vstack(embedded_batches)

    def preprocess_text(self, text: str) -> str:
        """Normalise text for embedding.

        Missing values become an empty string. Otherwise punctuation is
        replaced by spaces, runs of whitespace are collapsed and the result
        is lower-cased.
        """
        if pd.isna(text):
            return ""
        text = str(text)
        text = re.sub(r'[^\w\s]', ' ', text)
        text = ' '.join(text.split())
        return text.lower()

    def prepare_course_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Select the free courses and build their searchable text.

        Keeps rows whose 'Course Name' contains "free" (case-insensitive),
        fills missing 'Course Time', 'Ratings', 'Difficulty' and
        'Key Takeaways' with defaults, and adds a 'search_text' column made
        from the cleaned name, takeaways and difficulty.

        Args:
            df: Raw course table.

        Returns:
            A new DataFrame of free courses with the extra 'search_text'
            column. It has zero rows when no course matches.
        """
        is_free = df['Course Name'].str.contains('Free', case=False, na=False)
        # fillna returns a new frame, so adding a column below does not
        # write into the caller's DataFrame.
        free_courses = df[is_free].fillna({
            'Course Time': 0,
            'Ratings': 4.6,
            'Difficulty': 'Beginner',
            'Key Takeaways': self._DEFAULT_TAKEAWAYS,
        })
        # Built column-wise rather than with DataFrame.apply(axis=1), which
        # does not return a Series when the frame has no rows.
        free_courses['search_text'] = [
            self.preprocess_text(f"{name} {takeaways} {difficulty}")
            for name, takeaways, difficulty in zip(
                free_courses['Course Name'],
                free_courses['Key Takeaways'],
                free_courses['Difficulty'],
            )
        ]
        return free_courses

    def load_and_prepare_data(self, df: pd.DataFrame):
        """Prepare the course table and cache one embedding per course.

        Stores the cleaned table in ``self.courses_df`` and the matching
        embedding matrix in ``self.course_embeddings``.
        """
        self.courses_df = self.prepare_course_data(df)
        self.course_embeddings = self.get_embeddings(
            self.courses_df['search_text'].tolist()
        )

    def _format_takeaways(self, key_takeaways: str) -> str:
        """Render up to three takeaways as a bulleted Markdown list."""
        bullets = []
        for takeaway in key_takeaways.split('.,'):
            cleaned = takeaway.strip('. ,')
            if not cleaned:
                continue
            if len(cleaned) > self._MAX_TAKEAWAY_CHARS:
                cleaned = cleaned[:self._MAX_TAKEAWAY_CHARS - 3] + "..."
            bullets.append(f"• {cleaned}")

        section = "\n".join(bullets[:self._MAX_TAKEAWAYS_SHOWN])
        # Compare against the cleaned bullets, not the raw fragments: a
        # trailing separator produces an empty fragment that must not count.
        if len(bullets) > self._MAX_TAKEAWAYS_SHOWN:
            section += "\n• And more..."
        return section

    def _format_course(self, position: int, result: Dict) -> str:
        """Render one search result as a Markdown section."""
        parts = [
            f"\n**{position}. {result['course_name']}**\n",
            f"**Rating:** {result['ratings']}/5.0\n",
            f"**Level:** {result['difficulty']}\n",
        ]
        # A duration of 0 is the fill value for "unknown", so it is hidden.
        if result['course_time']:
            parts.append(f"**Duration:** {result['course_time']} hours\n")

        key_takeaways = result['key_takeaways']
        if key_takeaways and key_takeaways != self._DEFAULT_TAKEAWAYS:
            parts.append("\n**What you'll learn:**\n")
            parts.append(self._format_takeaways(key_takeaways))

        # Cosine similarity can be slightly negative or, through rounding,
        # fractionally above 1; keep the displayed percentage within 0-100.
        similarity_percentage = int(result['similarity_score'] * 100)
        similarity_percentage = max(0, min(100, similarity_percentage))
        parts.append(f"\n**Match Score:** {similarity_percentage}%")
        parts.append(f"\n\n[Start Course]({result['url']})\n")
        return "".join(parts)

    def generate_response(self, query: str, results: List[Dict]) -> str:
        """Build the Markdown answer for a search.

        Args:
            query: Search text to echo back to the user.
            results: Ranked result dicts with the keys 'course_name',
                'key_takeaways', 'course_time', 'ratings', 'difficulty',
                'similarity_score' and 'url'.

        Returns:
            Markdown listing each course followed by general notes, or a
            short "nothing found" message when ``results`` is empty.
        """
        if not results:
            return (
                f"I couldn't find any free courses matching your search for '{query}'. "
                "Try different or broader keywords."
            )

        if len(results) == 1:
            intro = f"I found an excellent free course matching your search for '{query}':"
        else:
            intro = (
                f"I found {len(results)} relevant free courses "
                f"matching your search for '{query}':"
            )

        response_parts = [intro]
        response_parts.extend(
            self._format_course(position, result)
            for position, result in enumerate(results, 1)
        )
        response_parts.extend([
            "\n---\n",
            "**Notes:**",
            "• Courses are sorted by relevance to your search",
            "• All courses are free and include hands-on projects",
            "• Certificates are provided upon completion",
        ])
        return "\n".join(response_parts)

    def search_courses(self, query: str, top_k: int = 5) -> str:
        """Rank the prepared courses against a query and format the answer.

        ``load_and_prepare_data`` must have been called first.

        Args:
            query: Free-text search entered by the user.
            top_k: Maximum number of courses to return. Values below 1
                return no courses; values above the catalogue size return
                every course.

        Returns:
            Markdown produced by ``generate_response``.
        """
        # Embed the normalised text, but echo the user's own wording back.
        display_query = "" if pd.isna(query) else str(query).strip()
        normalized_query = self.preprocess_text(query)

        query_embedding = self.get_embeddings([normalized_query])[0]
        similarities = np.dot(self.course_embeddings, query_embedding)

        # Clamp before slicing: a plain [-top_k:] slice returns everything
        # for top_k == 0 and an unrelated tail for negative values.
        limit = max(0, min(int(top_k), len(similarities)))
        top_indices = np.argsort(similarities)[::-1][:limit]

        results = []
        for idx in top_indices:
            course = self.courses_df.iloc[idx]
            results.append({
                'course_name': course['Course Name'],
                'key_takeaways': course['Key Takeaways'],
                'course_time': course['Course Time'],
                'ratings': course['Ratings'],
                'difficulty': course['Difficulty'],
                'similarity_score': float(similarities[idx]),
                'url': course['Website'],
            })
        return self.generate_response(display_query, results)