Spaces:

yukarinw
/

Travel_AI

Sleeping

App Files Files Community

ayayaya12 committed on Jun 22, 2025

Commit

795f8d5

1 Parent(s): 04dc4d1

Update Recommend API

Browse files

Files changed (2) hide show

src/prompts.py +2 -2
src/recommendation_api.py +296 -56

src/prompts.py CHANGED Viewed

@@ -110,8 +110,8 @@ Câu trả lời của bạn:
 response_gen_prompt = ChatPromptTemplate.from_messages(
     [
         ("system", response_gen_template_string),
-        MessagesPlaceholder(variable_name="chat_history_messages"), # Nơi chèn lịch sử chat
-        ("human", "Thông tin tìm kiếm được (nếu có liên quan đến câu hỏi cuối cùng):\n{search_results}\n\nCâu hỏi cuối cùng của người dùng: {user_query}"), # Đặt câu hỏi và kết quả cuối cùng dạng human
     ]
 )

 response_gen_prompt = ChatPromptTemplate.from_messages(
     [
         ("system", response_gen_template_string),
+        MessagesPlaceholder(variable_name="chat_history_messages"),
+        ("human", "Thông tin tìm kiếm được (nếu có liên quan đến câu hỏi cuối cùng):\n{search_results}\n\nCâu hỏi cuối cùng của người dùng: {user_query}"),
     ]
 )

src/recommendation_api.py CHANGED Viewed

@@ -7,6 +7,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 from pydantic import BaseModel, Field
 class TourRecommendationRequest(BaseModel):
     user_id: Optional[int] = Field(None)
@@ -25,37 +27,121 @@ class TourSummary(BaseModel):
 class TourRecommendationResponse(BaseModel):
     recommendations: List[TourSummary]
-    recommendation_type: str = "content-based"
 class ContentBasedRecommender:
     def __init__(self, conn):
         self.conn = conn
         self.vectorizer = TfidfVectorizer(
-            max_features=5000,
-            stop_words='english',
-            ngram_range=(1, 2)
         )
         self.field_weights = {
-            'title': 0.25,
-            'description': 0.20,
-            'destination': 0.25,
-            'departure_location': 0.15,
-            'region': 0.10,
-            'itinerary': 0.05
         }
     def preprocess_text(self, text):
         if not text:
             return ""
         text = str(text).lower()
-        text = re.sub(r'[^\w\s]', ' ', text)
         text = re.sub(r'\s+', ' ', text).strip()
-        return text
     def preprocess_list(self, items):
         if not items:
             return ""
-        return " ".join([self.preprocess_text(item) for item in items])
     def preprocess_json(self, json_data):
         if not json_data:
@@ -65,19 +151,31 @@ class ContentBasedRecommender:
                 data = json.loads(json_data)
             else:
                 data = json_data
             text_values = []
             def extract_values(obj):
                 if isinstance(obj, dict):
-                    for val in obj.values():
-                        extract_values(val)
                 elif isinstance(obj, list):
                     for item in obj:
                         extract_values(item)
-                elif obj:
-                    text_values.append(str(obj))
             extract_values(data)
             return " ".join(text_values)
-        except:
             return ""
     def get_all_tours(self):
@@ -91,11 +189,20 @@ class ContentBasedRecommender:
                     t.description,
                     t.destination,
                     t.region,
-                    t.itinerary
                 FROM
                     Tour t
                 WHERE
                     t.availability = true
             """)
             return cursor.fetchall()
@@ -103,15 +210,19 @@ class ContentBasedRecommender:
         with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
             cursor.execute("""
                 SELECT
-                    h.tour_id
                 FROM
                     History h
                 WHERE
                     h.user_id = %s
                 GROUP BY
                     h.tour_id
             """, (user_id,))
-            return [row['tour_id'] for row in cursor.fetchall()]
     def get_tour_by_id(self, tour_id):
         with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
@@ -124,63 +235,153 @@ class ContentBasedRecommender:
                     t.description,
                     t.destination,
                     t.region,
-                    t.itinerary
                 FROM
                     Tour t
                 WHERE
                     t.tour_id = %s
             """, (tour_id,))
             return cursor.fetchone()
     def create_tour_features(self, tours):
         tour_features = {}
         for tour in tours:
-            title = self.preprocess_text(tour['title'])
-            description = self.preprocess_text(tour['description'])
-            departure_location = self.preprocess_text(tour['departure_location'])
-            destination = self.preprocess_list(tour['destination'])
-            region = self.preprocess_text(str(tour['region']))
-            itinerary = self.preprocess_json(tour['itinerary'])
             combined_features = (
-                f"{title} " * int(self.field_weights['title'] * 10) +
-                f"{description} " * int(self.field_weights['description'] * 10) +
-                f"{destination} " * int(self.field_weights['destination'] * 10) +
-                f"{departure_location} " * int(self.field_weights['departure_location'] * 10) +
-                f"{region} " * int(self.field_weights['region'] * 10) +
-                f"{itinerary} " * int(self.field_weights['itinerary'] * 10)
             )
-            tour_features[tour['tour_id']] = combined_features
         return tour_features
-    def calculate_similarity(self, tour_features):
         tour_ids = list(tour_features.keys())
         feature_texts = [tour_features[tour_id] for tour_id in tour_ids]
-        tfidf_matrix = self.vectorizer.fit_transform(feature_texts)
-        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
         similarity_dict = {}
         for i, tour_id in enumerate(tour_ids):
-            similarity_dict[tour_id] = {
-                tour_ids[j]: cosine_sim[i][j] for j in range(len(tour_ids)) if i != j
-            }
         return similarity_dict
     def recommend_similar_tours(self, tour_id, limit=3):
         all_tours = self.get_all_tours()
         target_tour = None
         for tour in all_tours:
             if tour.get('tour_id') == tour_id:
                 target_tour = tour
                 break
         if not target_tour:
             return []
-        tour_features = self.create_tour_features(all_tours)
-        similarity_dict = self.calculate_similarity(tour_features)
         if tour_id in similarity_dict:
             similar_tours = sorted(
                 similarity_dict[tour_id].items(),
                 key=lambda x: x[1],
                 reverse=True
             )[:limit]
             recommended_tours = []
             for similar_tour_id, similarity_score in similar_tours:
                 for tour in all_tours:
@@ -189,34 +390,65 @@ class ContentBasedRecommender:
                         tour_copy['similarity_score'] = float(similarity_score)
                         recommended_tours.append(tour_copy)
                         break
             return recommended_tours
         return []
     def recommend_for_user(self, user_id, limit=3):
         user_history = self.get_user_history(user_id)
         if not user_history:
             return self.recommend_popular_tours(limit)
         all_tours = self.get_all_tours()
-        tour_features = self.create_tour_features(all_tours)
-        similarity_dict = self.calculate_similarity(tour_features)
         tour_scores = {}
         for tour in all_tours:
             tour_id = tour.get('tour_id')
-            if tour_id is None or tour_id in user_history:
                 continue
             total_similarity = 0
-            count = 0
-            for history_tour_id in user_history:
-                if history_tour_id in similarity_dict and tour_id in similarity_dict[history_tour_id]:
-                    total_similarity += similarity_dict[history_tour_id][tour_id]
-                    count += 1
-            if count > 0:
-                tour_scores[tour_id] = total_similarity / count
         top_tours = sorted(
             tour_scores.items(),
             key=lambda x: x[1],
             reverse=True
         )[:limit]
         recommended_tours = []
         for tour_id, similarity_score in top_tours:
             for tour in all_tours:
@@ -225,6 +457,7 @@ class ContentBasedRecommender:
                     tour_copy['similarity_score'] = float(similarity_score)
                     recommended_tours.append(tour_copy)
                     break
         return recommended_tours
     def recommend_popular_tours(self, limit=3):
@@ -238,21 +471,28 @@ class ContentBasedRecommender:
                     t.description,
                     t.destination,
                     t.region,
-                    COUNT(b.booking_id) as booking_count
                 FROM
                     Tour t
                 LEFT JOIN
                     Departure d ON t.tour_id = d.tour_id
                 LEFT JOIN
                     Booking b ON d.departure_id = b.departure_id
                 WHERE
                     t.availability = true
                 GROUP BY
-                    t.tour_id
                 ORDER BY
-                    booking_count DESC
                 LIMIT %s
             """, (limit,))
             popular_tours = cursor.fetchall()
             for tour in popular_tours:
                 tour['similarity_score'] = None

 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 from pydantic import BaseModel, Field
+from bs4 import BeautifulSoup
+import math
 class TourRecommendationRequest(BaseModel):
     user_id: Optional[int] = Field(None)
 class TourRecommendationResponse(BaseModel):
     recommendations: List[TourSummary]
 class ContentBasedRecommender:
     def __init__(self, conn):
         """Configure the recommender around an existing database connection.

         Args:
             conn: Database connection object. The query methods of this class
                 call ``conn.cursor(cursor_factory=RealDictCursor)`` on it.
         """
         self.conn = conn
         # Vietnamese function words plus travel boilerplate that would otherwise
         # dominate the TF-IDF vocabulary. Each entry is listed exactly once.
         vietnamese_stop_words = [
             "và", "là", "của", "trong", "được", "có", "không", "cho", "với",
             "tại", "bằng", "để", "này", "khi", "một", "những", "các", "đã",
             "rồi", "lại", "nếu", "vì", "thì", "từ", "ra", "đến", "trên", "dưới",
             "quý", "khách", "tham", "quan", "du", "lịch", "tour", "ngày", "đêm",
             "ăn", "sáng", "trưa", "tối", "nghỉ", "sạn", "tự", "túc"
         ]
         self.vectorizer = TfidfVectorizer(
             max_features=8000,
             stop_words=vietnamese_stop_words,
             ngram_range=(1, 3),
             min_df=1,
             max_df=0.8,
             # Tokens are runs of ASCII letters or characters in the À-ỹ range;
             # digits and punctuation never become tokens.
             token_pattern=r'[a-zA-ZÀ-ỹ]+',
             lowercase=True
         )
         # Relative emphasis per text field. create_tour_features multiplies each
         # weight by 20 to get a repetition count, so the values need not sum
         # to 1 (they currently sum to 1.20).
         self.field_weights = {
             'title': 0.20,
             'destination': 0.30,
             'description': 0.15,
             'departure_location': 0.10,
             'region': 0.15,
             'itinerary': 0.10,
             'duration': 0.05,
             'attractions': 0.15
         }
         # Symmetric similarity between region ids 1, 2 and 3; identical regions
         # score 1.0. NOTE(review): the meaning of each id is not visible here.
         self.region_proximity = {
             1: {1: 1.0, 2: 0.6, 3: 0.3},
             2: {1: 0.6, 2: 1.0, 3: 0.7},
             3: {1: 0.3, 2: 0.7, 3: 1.0}
         }
+    def clean_html(self, text):
+        if not text:
+            return ""
+        try:
+            soup = BeautifulSoup(text, 'html.parser')
+            clean_text = soup.get_text()
+            clean_text = re.sub(r'\s+', ' ', clean_text).strip()
+            return clean_text
+        except:
+            return str(text)
     def preprocess_text(self, text):
         if not text:
             return ""
+        text = self.clean_html(text)
         text = str(text).lower()
+        text = re.sub(r'[^\w\sÀ-ỹ]', ' ', text)
         text = re.sub(r'\s+', ' ', text).strip()
+        words = text.split()
+        words = [word for word in words if len(word) >= 2]
+        return " ".join(words)
     def preprocess_list(self, items):
         if not items:
             return ""
+        processed_items = []
+        for item in items:
+            cleaned = self.preprocess_text(item)
+            if cleaned:
+                processed_items.append(cleaned)
+        return " ".join(processed_items)
+    def extract_attractions_from_itinerary(self, itinerary):
+        if not itinerary:
+            return ""
+        try:
+            if isinstance(itinerary, str):
+                data = json.loads(itinerary)
+            else:
+                data = itinerary
+            attractions = []
+            if isinstance(data, list):
+                for day in data:
+                    if isinstance(day, dict):
+                        description = day.get('description', '')
+                        if description:
+                            clean_desc = self.clean_html(description)
+                            soup = BeautifulSoup(description, 'html.parser')
+                            strong_tags = soup.find_all('strong')
+                            for tag in strong_tags:
+                                attractions.append(tag.get_text())
+                            colored_spans = soup.find_all('span', style=lambda x: x and 'color' in x)
+                            for span in colored_spans:
+                                attractions.append(span.get_text())
+            clean_attractions = []
+            for attraction in attractions:
+                cleaned = self.preprocess_text(attraction)
+                if cleaned and len(cleaned) > 3:
+                    clean_attractions.append(cleaned)
+            return " ".join(clean_attractions)
+        except Exception as e:
+            print(f"Error extracting attractions: {e}")
+            return ""
     def preprocess_json(self, json_data):
         if not json_data:
                 data = json.loads(json_data)
             else:
                 data = json_data
             text_values = []
             def extract_values(obj):
                 if isinstance(obj, dict):
+                    for key, val in obj.items():
+                        if key.lower() in ['title', 'description', 'name', 'location']:
+                            if val:
+                                clean_val = self.clean_html(str(val))
+                                if clean_val:
+                                    text_values.append(clean_val)
+                        else:
+                            extract_values(val)
                 elif isinstance(obj, list):
                     for item in obj:
                         extract_values(item)
+                elif obj and len(str(obj)) > 3:
+                    clean_val = self.clean_html(str(obj))
+                    if clean_val:
+                        text_values.append(clean_val)
             extract_values(data)
             return " ".join(text_values)
+        except Exception as e:
+            print(f"Error preprocessing JSON: {e}")
             return ""
     def get_all_tours(self):
                     t.description,
                     t.destination,
                     t.region,
+                    t.itinerary,
+                    t.max_participants,
+                    MIN(d.price_adult) as min_price,
+                    MAX(d.price_adult) as max_price,
+                    AVG(d.price_adult) as avg_price
                 FROM
                     Tour t
+                LEFT JOIN
+                    Departure d ON t.tour_id = d.tour_id AND d.availability = true
                 WHERE
                     t.availability = true
+                GROUP BY
+                    t.tour_id, t.title, t.duration, t.departure_location,
+                    t.description, t.destination, t.region, t.itinerary, t.max_participants
             """)
             return cursor.fetchall()
         with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
             cursor.execute("""
                 SELECT
+                    h.tour_id,
+                    COUNT(*) as interaction_count,
+                    MAX(h.timestamp) as last_interaction
                 FROM
                     History h
                 WHERE
                     h.user_id = %s
                 GROUP BY
                     h.tour_id
+                ORDER BY
+                    interaction_count DESC, last_interaction DESC
             """, (user_id,))
+            return cursor.fetchall()
     def get_tour_by_id(self, tour_id):
         with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
                     t.description,
                     t.destination,
                     t.region,
+                    t.itinerary,
+                    t.max_participants,
+                    MIN(d.price_adult) as min_price,
+                    MAX(d.price_adult) as max_price,
+                    AVG(d.price_adult) as avg_price
                 FROM
                     Tour t
+                LEFT JOIN
+                    Departure d ON t.tour_id = d.tour_id AND d.availability = true
                 WHERE
                     t.tour_id = %s
+                GROUP BY
+                    t.tour_id, t.title, t.duration, t.departure_location,
+                    t.description, t.destination, t.region, t.itinerary, t.max_participants
             """, (tour_id,))
             return cursor.fetchone()
+    def extract_duration_days(self, duration):
+        if not duration:
+            return 0
+        numbers = re.findall(r'\d+', duration)
+        if numbers:
+            return int(numbers[0])
+        return 0
+    def calculate_price_similarity(self, price1, price2):
+        if not price1 or not price2:
+            return 0.5
+        price1 = float(price1)
+        price2 = float(price2)
+        max_price = max(price1, price2)
+        min_price = min(price1, price2)
+        if max_price == 0:
+            return 1.0
+        ratio = min_price / max_price
+        return ratio
     def create_tour_features(self, tours):
         """Build one weighted text document per tour for TF-IDF vectorisation.

         Each field is normalised and then repeated ``round(weight * 20)`` times,
         so fields with a higher weight contribute proportionally more terms.

         Args:
             tours: Iterable of dict-like tour rows. ``tour_id`` is required;
                 every other field is optional.

         Returns:
             Dict mapping ``tour_id`` to the combined feature string.
         """
         tour_features = {}
         for tour in tours:
             # A NULL region must not be stringified: str(None) would add the
             # literal token "none" to every tour that has no region.
             region_value = tour.get('region')
             region_text = (
                 self.preprocess_text(str(region_value))
                 if region_value is not None else ""
             )
             itinerary_value = tour.get('itinerary')
             # Insertion order defines the order of fields in the document.
             field_texts = {
                 'title': self.preprocess_text(tour.get('title', '')),
                 'destination': self.preprocess_list(tour.get('destination', [])),
                 'description': self.preprocess_text(tour.get('description', '')),
                 'departure_location': self.preprocess_text(
                     tour.get('departure_location', '')
                 ),
                 'region': region_text,
                 'itinerary': self.preprocess_json(itinerary_value),
                 'duration': self.preprocess_text(tour.get('duration', '')),
                 'attractions': self.extract_attractions_from_itinerary(itinerary_value),
             }
             # round() rather than int(): a float product such as 0.29 * 100
             # evaluates just below the intended integer and int() would drop
             # one repetition.
             combined_features = "".join(
                 f"{text} " * round(self.field_weights[name] * 20)
                 for name, text in field_texts.items()
             )
             tour_features[tour['tour_id']] = combined_features.strip()
         return tour_features
+    def calculate_enhanced_similarity(self, tours):
+        tour_features = self.create_tour_features(tours)
         tour_ids = list(tour_features.keys())
         feature_texts = [tour_features[tour_id] for tour_id in tour_ids]
+        if not feature_texts or all(not text.strip() for text in feature_texts):
+            return {}
+        try:
+            tfidf_matrix = self.vectorizer.fit_transform(feature_texts)
+            text_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
+        except Exception as e:
+            print(f"Error in TF-IDF calculation: {e}")
+            return {}
+        tour_lookup = {tour['tour_id']: tour for tour in tours}
         similarity_dict = {}
         for i, tour_id in enumerate(tour_ids):
+            similarity_dict[tour_id] = {}
+            tour_i = tour_lookup[tour_id]
+            for j, other_tour_id in enumerate(tour_ids):
+                if i == j:
+                    continue
+                tour_j = tour_lookup[other_tour_id]
+                text_sim = text_similarity[i][j]
+                region_i = tour_i.get('region', 1)
+                region_j = tour_j.get('region', 1)
+                region_sim = self.region_proximity.get(region_i, {}).get(region_j, 0.3)
+                duration_i = self.extract_duration_days(tour_i.get('duration'))
+                duration_j = self.extract_duration_days(tour_j.get('duration'))
+                duration_sim = 1.0 if duration_i == duration_j else 0.7 if abs(duration_i - duration_j) <= 1 else 0.3
+                price_i = tour_i.get('avg_price')
+                price_j = tour_j.get('avg_price')
+                price_sim = self.calculate_price_similarity(price_i, price_j)
+                final_similarity = (
+                    text_sim * 0.6 +
+                    region_sim * 0.2 +
+                    duration_sim * 0.1 +
+                    price_sim * 0.1
+                )
+                similarity_dict[tour_id][other_tour_id] = final_similarity
         return similarity_dict
     def recommend_similar_tours(self, tour_id, limit=3):
         all_tours = self.get_all_tours()
         target_tour = None
         for tour in all_tours:
             if tour.get('tour_id') == tour_id:
                 target_tour = tour
                 break
         if not target_tour:
             return []
+        similarity_dict = self.calculate_enhanced_similarity(all_tours)
         if tour_id in similarity_dict:
             similar_tours = sorted(
                 similarity_dict[tour_id].items(),
                 key=lambda x: x[1],
                 reverse=True
             )[:limit]
             recommended_tours = []
             for similar_tour_id, similarity_score in similar_tours:
                 for tour in all_tours:
                         tour_copy['similarity_score'] = float(similarity_score)
                         recommended_tours.append(tour_copy)
                         break
             return recommended_tours
         return []
     def recommend_for_user(self, user_id, limit=3):
         user_history = self.get_user_history(user_id)
         if not user_history:
             return self.recommend_popular_tours(limit)
         all_tours = self.get_all_tours()
+        similarity_dict = self.calculate_enhanced_similarity(all_tours)
         tour_scores = {}
+        total_interactions = sum(h['interaction_count'] for h in user_history)
         for tour in all_tours:
             tour_id = tour.get('tour_id')
+            if tour_id is None or any(h['tour_id'] == tour_id for h in user_history):
                 continue
             total_similarity = 0
+            total_weight = 0
+            for history_item in user_history:
+                history_tour_id = history_item['tour_id']
+                interaction_weight = history_item['interaction_count'] / total_interactions
+                if (history_tour_id in similarity_dict and
+                    tour_id in similarity_dict[history_tour_id]):
+                    similarity = similarity_dict[history_tour_id][tour_id]
+                    total_similarity += similarity * interaction_weight
+                    total_weight += interaction_weight
+            if total_weight > 0:
+                tour_scores[tour_id] = total_similarity / total_weight
+        user_regions = set()
+        for history_item in user_history:
+            for tour in all_tours:
+                if tour['tour_id'] == history_item['tour_id']:
+                    user_regions.add(tour.get('region'))
+                    break
+        for tour_id, score in tour_scores.items():
+            for tour in all_tours:
+                if tour['tour_id'] == tour_id:
+                    if tour.get('region') not in user_regions:
+                        tour_scores[tour_id] = score * 1.1
+                    break
         top_tours = sorted(
             tour_scores.items(),
             key=lambda x: x[1],
             reverse=True
         )[:limit]
         recommended_tours = []
         for tour_id, similarity_score in top_tours:
             for tour in all_tours:
                     tour_copy['similarity_score'] = float(similarity_score)
                     recommended_tours.append(tour_copy)
                     break
         return recommended_tours
     def recommend_popular_tours(self, limit=3):
                     t.description,
                     t.destination,
                     t.region,
+                    COUNT(DISTINCT b.booking_id) as booking_count,
+                    AVG(r.average_rating) as avg_rating,
+                    COUNT(DISTINCT r.review_id) as review_count
                 FROM
                     Tour t
                 LEFT JOIN
                     Departure d ON t.tour_id = d.tour_id
                 LEFT JOIN
                     Booking b ON d.departure_id = b.departure_id
+                LEFT JOIN
+                    Review r ON t.tour_id = r.tour_id
                 WHERE
                     t.availability = true
                 GROUP BY
+                    t.tour_id, t.title, t.duration, t.departure_location,
+                    t.description, t.destination, t.region
                 ORDER BY
+                    (COUNT(DISTINCT b.booking_id) * 0.6 +
+                     COALESCE(AVG(r.average_rating), 3.0) * COUNT(DISTINCT r.review_id) * 0.4) DESC
                 LIMIT %s
             """, (limit,))
             popular_tours = cursor.fetchall()
             for tour in popular_tours:
                 tour['similarity_score'] = None