File size: 4,001 Bytes
bb78694
 
f17e48b
 
 
bb78694
f17e48b
 
bb78694
f17e48b
bb78694
 
 
 
 
 
 
 
 
 
b5ffc72
bb78694
 
 
 
 
 
 
 
 
 
 
 
8fe9808
b5ffc72
bb78694
 
 
 
 
 
 
 
 
4505c1f
bb78694
4505c1f
bb78694
b5ffc72
bb78694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4505c1f
bb78694
 
 
 
 
8fe9808
bb78694
 
8fe9808
bb78694
 
 
8fe9808
bb78694
 
 
8fe9808
bb78694
 
 
 
 
 
 
 
 
 
8fe9808
4505c1f
bb78694
 
 
b5ffc72
bb78694
 
 
8fe9808
bb78694
 
 
 
 
 
 
 
8fe9808
bb78694
 
 
 
 
b5ffc72
bb78694
 
 
 
b5ffc72
 
bb78694
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# app.py
from datasets import load_dataset
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
import re

# Global configuration
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"  # stronger semantic embedding model
SUMMARIZER_NAME = "facebook/bart-large-cnn"
DATASET_NAME = "bookcorpus"
CACHE_DIR = "./data-cache"  # NOTE(review): defined but never referenced below — confirm it is needed

# Preload heavyweight resources once at import time (downloads on first run)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
summarizer = pipeline("summarization", SUMMARIZER_NAME)

# 加载并预处理书籍数据
def load_books():
    dataset = load_dataset(DATASET_NAME, split='train', streaming=True)
    books = []
    for book in dataset.take(50000):  # 取5万本书
        text = book['text'].strip()
        if len(text) > 500:  # 过滤短文本
            title = re.findall(r'"([^"]*)"', text[:200])  # 尝试提取标题
            books.append({
                "text": text,
                "title": title[0] if title else "Untitled Book"
            })
    return books

# 生成语义嵌入
def get_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = mean_pooling(outputs, inputs['attention_mask'])
    return F.normalize(embeddings, p=2, dim=1)

# 平均池化
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embedding * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# 智能摘要生成
def generate_summary(text):
    inputs = tokenizer(
        "summarize: " + text,
        max_length=1024,
        truncation=True,
        return_tensors="pt"
    )
    summary_ids = summarizer.model.generate(
        inputs.input_ids,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# 核心推荐逻辑
def recommend_books(keywords, top_k=5):
    # 清洗输入
    keywords = re.sub(r'[^\w\s,]', '', keywords).lower()
    keywords = [k.strip() for k in keywords.split(',') if k.strip()]
    
    if len(keywords) < 2:
        return "❗ Please enter at least 2 keywords (e.g. 'fantasy, magic')"
    
    # 获取嵌入
    keyword_emb = get_embeddings([" ".join(keywords)]).mean(dim=0)
    book_embs = get_embeddings([f"{b['title']} {b['text']}" for b in books])
    
    # 计算相似度
    sim_scores = cosine_similarity(keyword_emb.reshape(1,-1), book_embs)[0]
    top_indices = np.argsort(sim_scores)[-top_k:][::-1]
    
    # 生成结果
    results = []
    for idx in top_indices:
        book = books[idx]
        summary = generate_summary(book['text'])
        results.append({
            "title": book['title'],
            "summary": summary,
            "score": f"{sim_scores[idx]:.2f}"
        })
    return results

# Gradio界面
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 智能图书推荐系统")
    
    with gr.Row():
        inputs = gr.Textbox(label="输入关键词(用逗号分隔)", placeholder="例如:sci-fi, time travel")
        outputs = gr.JSON(label="推荐结果")
    
    examples = gr.Examples(
        examples=[
            ["romance, paris"],
            ["mystery, detective"],
            ["science fiction, space opera"]
        ],
        inputs=[inputs]
    )
    
    inputs.submit(
        fn=recommend_books,
        inputs=inputs,
        outputs=outputs
    )

# Initialize corpus at import time (runs before the server launches;
# streaming 50k records may take a while on first run)
print("Loading book data...")
books = load_books()
print(f"Loaded {len(books)} books")

if __name__ == "__main__":
    # Bind to all interfaces on 7860 — the conventional Gradio/Spaces port
    demo.launch(server_name="0.0.0.0", server_port=7860)