campusBooks / database.py
Song
hi
4c971f7
import json
import sqlite3
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# 載入JSON數據
def load_data(json_file):
with open(json_file, 'r', encoding='utf-8') as f:
data = json.load(f)
return pd.DataFrame(data)
# 初始化資料庫
def init_db(db_name='books.db'):
conn = sqlite3.connect(db_name)
df = load_data('data.json') # 假設JSON命名為data.json
df.to_sql('books', conn, if_exists='replace', index=False)
conn.close()
print("資料庫初始化完成!")
# 簡單推薦函數(基於內容簡介的TF-IDF,優化中文處理)
def recommend_books(query, top_n=5):
conn = sqlite3.connect('books.db')
df = pd.read_sql_query("SELECT * FROM books", conn)
conn.close()
if df.empty or 'content_intro' not in df.columns:
return pd.DataFrame()
# 預處理:合併 title、author 與 content_intro,提升相關性
df['text'] = (df['title'].fillna('') + ' ' + df['author'].fillna('') + ' ' + df['content_intro'].fillna('')).astype(str)
query = re.sub(r'[^\w\s]', ' ', query.lower()) # 移除標點,簡化查詢
# TF-IDF向量化(中文優化:使用 ngrams 捕捉詞組,無需分詞)
vectorizer = TfidfVectorizer(
max_features=2000,
ngram_range=(1, 3), # 單詞、雙詞、三詞組合,適合中文
min_df=1, # 最小文件頻率
max_df=0.95, # 最大文件頻率(過濾過常見詞)
lowercase=True
)
tfidf_matrix = vectorizer.fit_transform(df['text'].values)
query_vec = vectorizer.transform([query])
similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
df['similarity'] = similarities
# 若所有相似度為0,使用關鍵字匹配 fallback
if similarities.max() == 0:
df['keyword_score'] = df['text'].str.contains(query, case=False, na=False).astype(int)
df['similarity'] = df['keyword_score'] # 簡單計數匹配
# 回傳推薦結果包含 product_id,並依相似度排序
recommendations = df.nlargest(top_n, 'similarity')[['product_id', 'title', 'author', 'discount_price', 'similarity']]
return recommendations
if __name__ == "__main__":
init_db()