File size: 6,665 Bytes
2820d25
 
 
 
6270545
 
2820d25
 
 
6270545
 
 
 
 
bcf13d3
6270545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2820d25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6270545
d77165f
2820d25
a8b3f9e
2820d25
 
a8b3f9e
d77165f
a8b3f9e
d77165f
a8b3f9e
2820d25
 
 
d77165f
a8b3f9e
2820d25
a8b3f9e
2820d25
d77165f
a8b3f9e
d77165f
a8b3f9e
 
2820d25
d77165f
2820d25
a8b3f9e
2820d25
d77165f
2820d25
d77165f
2820d25
d77165f
2820d25
d77165f
 
 
 
 
a8b3f9e
d77165f
 
a8b3f9e
2820d25
d77165f
 
a8b3f9e
d77165f
a8b3f9e
 
2820d25
 
 
a8b3f9e
d77165f
a8b3f9e
d77165f
 
 
2820d25
 
 
 
 
 
6270545
2820d25
 
 
 
 
 
 
 
f0fefa4
2820d25
 
 
 
 
 
 
a8b3f9e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
import re
import torch
from transformers import AutoModel, AutoTokenizer

class CourseSearchSystem:
    """Semantic search over free courses using sentence-transformer embeddings.

    Usage: call ``load_and_prepare_data(df)`` once with the raw course
    DataFrame, then ``search_courses(query)`` to obtain a Markdown-formatted
    recommendation string.
    """

    def __init__(self):
        # NOTE(review): downloads model weights on first run — requires
        # network access and the `transformers` package.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = 'sentence-transformers/all-MiniLM-L6-v2'
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
        self.model.eval()  # inference only: disables dropout etc.

    def mean_pooling(self, model_output, attention_mask):
        """Mean-pool token embeddings into sentence embeddings.

        Padding tokens are excluded via ``attention_mask``; the clamp guards
        against division by zero for fully-masked rows.

        Args:
            model_output: HF model output; element 0 is the last hidden
                state of shape (batch, seq_len, hidden).
            attention_mask: (batch, seq_len) tensor with 1 for real tokens.

        Returns:
            (batch, hidden) tensor of mean-pooled embeddings.
        """
        token_embeddings = model_output[0]
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` into an L2-normalized (len(texts), hidden) array.

        Normalization makes a plain dot product equal cosine similarity,
        which ``search_courses`` relies on.
        """
        encoded_input = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors='pt', max_length=512
        )
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}

        with torch.no_grad():  # no gradients needed for inference
            model_output = self.model(**encoded_input)

        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings.cpu().numpy()

    def preprocess_text(self, text: str) -> str:
        """Lowercase ``text``, replace punctuation with spaces, collapse runs.

        NaN/None input yields an empty string so missing fields embed safely.
        """
        if pd.isna(text):
            return ""
        text = re.sub(r'[^\w\s]', ' ', str(text))  # punctuation -> spaces
        return ' '.join(text.split()).lower()

    def prepare_course_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Filter to free courses, fill missing fields, and build search text.

        Returns a new DataFrame (the caller's ``df`` is not modified) with an
        added 'search_text' column combining name, takeaways and difficulty.
        """
        # Fixed: .copy() so the column assignments below operate on an
        # independent frame instead of a view of ``df`` (avoids pandas'
        # SettingWithCopyWarning and accidental writes into the caller's data).
        free_courses = df[df['Course Name'].str.contains('Free', case=False, na=False)].copy()

        free_courses = free_courses.fillna({
            'Course Time': 0,          # 0 = duration unknown (skipped in output)
            'Ratings': 4.6,            # default rating for unrated courses
            'Difficulty': 'Beginner',
            'Key Takeaways': 'Course details not available.'
        })

        # Fixed: apply(axis=1) on an empty frame returns a DataFrame, which
        # cannot be assigned as a column — short-circuit that edge case.
        if free_courses.empty:
            free_courses['search_text'] = pd.Series(dtype=str)
            return free_courses

        free_courses['search_text'] = free_courses.apply(
            lambda x: f"{x['Course Name']} {x['Key Takeaways']} {x['Difficulty']}",
            axis=1
        )
        free_courses['search_text'] = free_courses['search_text'].apply(self.preprocess_text)
        return free_courses

    def load_and_prepare_data(self, df: pd.DataFrame) -> None:
        """Prepare the course data and precompute its embeddings.

        Must be called before ``search_courses``.
        """
        self.courses_df = self.prepare_course_data(df)
        self.course_embeddings = self.get_embeddings(self.courses_df['search_text'].tolist())

    def generate_response(self, query: str, results: List[Dict]) -> str:
        """Format search ``results`` into a Markdown recommendation message.

        Each result dict needs: course_name, ratings, difficulty, course_time,
        key_takeaways, similarity_score (0..1) and url.
        """
        # Fixed: an empty result list previously produced the nonsensical
        # "I found 0 relevant free courses..." message.
        if not results:
            return (f"I couldn't find any free courses matching your search for "
                    f"'{query}'. Try rephrasing or broadening your query.")

        response_parts = []

        # Introduction based on number of results
        if len(results) == 1:
            response_parts.append(f"I found an excellent free course matching your search for '{query}':")
        else:
            response_parts.append(f"I found {len(results)} relevant free courses matching your search for '{query}':")

        # Course details
        for i, result in enumerate(results, 1):
            course_section = f"\n**{i}. {result['course_name']}**\n"
            course_section += f"**Rating:** {result['ratings']}/5.0\n"
            course_section += f"**Level:** {result['difficulty']}\n"

            # A duration of 0 is the "unknown" fillna default, so skip it.
            if result['course_time']:
                course_section += f"**Duration:** {result['course_time']} hours\n"

            # Takeaways are stored as '.,'-separated sentences; show up to 3.
            if result['key_takeaways'] and result['key_takeaways'] != 'Course details not available.':
                course_section += "\n**What you'll learn:**\n"
                takeaways = result['key_takeaways'].split('.,')
                formatted_takeaways = []
                for takeaway in takeaways:
                    cleaned = takeaway.strip('. ,')
                    if cleaned:
                        if len(cleaned) > 100:
                            cleaned = cleaned[:97] + "..."  # truncate long items
                        formatted_takeaways.append(f"• {cleaned}")
                course_section += "\n".join(formatted_takeaways[:3])

                # Fixed: count the bullets actually produced, not the raw
                # split fragments (some fragments clean to empty strings).
                if len(formatted_takeaways) > 3:
                    course_section += "\n• And more..."

            # Relevance score as a percentage
            similarity_percentage = int(result['similarity_score'] * 100)
            course_section += f"\n**Match Score:** {similarity_percentage}%"

            # Course link
            course_section += f"\n\n[Start Course]({result['url']})\n"

            response_parts.append(course_section)

        # Helpful conclusion
        response_parts.append("\n---\n")
        response_parts.append("**Notes:**")
        response_parts.append("• Courses are sorted by relevance to your search")
        response_parts.append("• All courses are free and include hands-on projects")
        response_parts.append("• Certificates are provided upon completion")

        return "\n".join(response_parts)

    def search_courses(self, query: str, top_k: int = 5) -> str:
        """Return a formatted response for the ``top_k`` most similar courses.

        Requires ``load_and_prepare_data`` to have been called first.
        """
        query = self.preprocess_text(query)
        query_embedding = self.get_embeddings([query])[0]
        # Embeddings are L2-normalized, so a dot product is cosine similarity.
        similarities = np.dot(self.course_embeddings, query_embedding)
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            course = self.courses_df.iloc[idx]
            results.append({
                'course_name': course['Course Name'],
                'key_takeaways': course['Key Takeaways'],
                'course_time': course['Course Time'],
                'ratings': course['Ratings'],
                'difficulty': course['Difficulty'],
                'similarity_score': similarities[idx],
                'url': course['Website'],
            })

        return self.generate_response(query, results)