File size: 2,339 Bytes
4c971f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json
import sqlite3
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_data(json_file):
    """Read a UTF-8 encoded JSON file and return it as a pandas DataFrame.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        A ``pd.DataFrame`` constructed from the decoded JSON value.
    """
    with open(json_file, encoding='utf-8') as handle:
        records = json.loads(handle.read())
    return pd.DataFrame(records)

def init_db(db_name='books.db', json_file='data.json'):
    """Create (or overwrite) the ``books`` table in a SQLite database from JSON.

    Args:
        db_name: Path of the SQLite database file to write.
        json_file: Path of the JSON file holding the book records.
            Defaults to ``'data.json'``, the name previously hard-coded here.

    Raises:
        Whatever ``load_data`` raises when the JSON file cannot be read or
        parsed, and whatever ``DataFrame.to_sql`` raises on a write failure.
    """
    # Load first so an unreadable JSON file does not leave an empty database
    # file behind.
    df = load_data(json_file)
    conn = sqlite3.connect(db_name)
    try:
        # if_exists='replace' drops any existing 'books' table before writing.
        df.to_sql('books', conn, if_exists='replace', index=False)
    finally:
        # Always release the connection, even if the write fails.
        conn.close()
    print("資料庫初始化完成!")

def recommend_books(query, top_n=5, db_name='books.db'):
    """Recommend books whose title, author and introduction best match ``query``.

    Every row of the ``books`` table is scored against the query with TF-IDF
    cosine similarity. If no row gets a non-zero score (or TF-IDF cannot be
    fitted), a case-insensitive substring match of the whole query is used
    instead, scoring 1 for a hit and 0 otherwise.

    Args:
        query: Free-text search string. ``None`` is treated as an empty query.
        top_n: Maximum number of rows to return.
        db_name: Path of the SQLite database to read. Defaults to
            ``'books.db'``, the name previously hard-coded here.

    Returns:
        A ``pd.DataFrame`` with columns ``product_id``, ``title``, ``author``,
        ``discount_price`` and ``similarity``, ordered by descending
        similarity. An empty ``pd.DataFrame`` is returned when the table has
        no rows or lacks any of the columns this function needs.
    """
    conn = sqlite3.connect(db_name)
    try:
        df = pd.read_sql_query("SELECT * FROM books", conn)
    finally:
        # Close even when the query fails (e.g. the table does not exist yet).
        conn.close()

    # Every column read below must exist; otherwise there is nothing to rank.
    required = {'product_id', 'title', 'author', 'content_intro', 'discount_price'}
    if df.empty or not required.issubset(df.columns):
        return pd.DataFrame()

    # Combine title, author and introduction into one searchable text field.
    df['text'] = (df['title'].fillna('') + ' ' + df['author'].fillna('') + ' ' + df['content_intro'].fillna('')).astype(str)

    # Replace punctuation with spaces, then collapse the resulting runs of
    # whitespace so the substring fallback is not defeated by stray spaces
    # (e.g. "python!" would otherwise become "python ").
    query = '' if query is None else str(query)
    query = ' '.join(re.sub(r'[^\w\s]', ' ', query.lower()).split())

    # NOTE(review): with scikit-learn's default word analyzer an unsegmented
    # run of CJK characters is treated as a single token, so these n-grams are
    # over whitespace/punctuation-delimited tokens. Queries that miss because
    # of this are handled by the substring fallback below; consider
    # analyzer='char' if character n-grams were intended.
    vectorizer = TfidfVectorizer(
        max_features=2000,
        ngram_range=(1, 3),  # unigrams, bigrams and trigrams of tokens
        min_df=1,            # keep terms that appear in at least one document
        max_df=0.95,         # drop terms present in more than 95% of documents
        lowercase=True
    )
    try:
        tfidf_matrix = vectorizer.fit_transform(df['text'].values)
    except ValueError:
        # Fitting fails on tiny or empty corpora (a single row makes
        # max_df=0.95 smaller than min_df=1, or no terms survive pruning).
        # Score everything 0 and let the keyword fallback decide.
        df['similarity'] = 0.0
    else:
        query_vec = vectorizer.transform([query])
        df['similarity'] = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Fallback: literal, case-insensitive substring match of the whole query.
    # Skipped for an empty query, which would otherwise "match" every row.
    if query and df['similarity'].max() == 0:
        df['similarity'] = df['text'].str.contains(query, case=False, na=False, regex=False).astype(int)

    # Return the best matches, including product_id, ordered by similarity.
    recommendations = df.nlargest(top_n, 'similarity')[['product_id', 'title', 'author', 'discount_price', 'similarity']]
    return recommendations

# Script entry point: initialize the database when this file is run directly.
if __name__ == "__main__":
    init_db()