sshenai committed on
Commit
b5ffc72
·
verified ·
1 Parent(s): d8c8eca

Create app

Browse files
Files changed (1) hide show
  1. app +91 -0
app ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 导入必要库
2
+ from datasets import load_dataset
3
+ import pandas as pd
4
+ import torch
5
+ from sentence_transformers import SentenceTransformer, util
6
+ from transformers import pipeline
7
+
8
+ # ----------------------
9
+ # 1. 加载数据集
10
+ # ----------------------
11
def load_book_data(limit=1000, max_chars=5000):
    """Load a small demo slice of the ``bookcorpus`` dataset as a DataFrame.

    Args:
        limit: Maximum number of rows read from the ``train`` split.
        max_chars: Descriptions longer than this many characters are truncated.

    Returns:
        A DataFrame with ``title`` and ``description`` columns and a fresh
        0..n-1 index. Rows with missing values are dropped after slicing, so
        the result may contain fewer than ``limit`` rows.
    """
    # Slice the split at load time so only `limit` rows are ever converted to
    # pandas; materialising the whole corpus just to keep its head is wasteful.
    dataset = load_dataset("bookcorpus", split=f"train[:{limit}]")
    books = pd.DataFrame(dataset).rename(columns={"text": "description"})

    # NOTE(review): the public bookcorpus schema appears to expose only a
    # `text` column - confirm. Without this fallback the column selection
    # below raises KeyError; the start of the text stands in for a title.
    if "title" not in books.columns:
        books["title"] = books["description"].str.slice(0, 60)

    # Drop incomplete rows and renumber so index labels match row positions.
    books = books[["title", "description"]].dropna().reset_index(drop=True)

    # Cap description length to keep downstream models within sane input sizes.
    books["description"] = books["description"].str.slice(0, max_chars)
    return books
20
+
21
+ # ----------------------
22
+ # 2. 初始化模型
23
+ # ----------------------
24
def initialize_models():
    """Create the models used by the search-and-summarise pipeline.

    Returns:
        A ``(embedder, summarizer)`` tuple: a ``SentenceTransformer`` built
        from ``sentence-transformers/all-MiniLM-L6-v2`` and a ``summarization``
        pipeline built from ``facebook/bart-large-cnn``.
    """
    # Generation settings handed to the summarisation pipeline at construction.
    summarization_options = {
        "model": "facebook/bart-large-cnn",
        "max_length": 150,
        "min_length": 30,
        "do_sample": False,
    }

    # Semantic-search model first, then the summariser (same order as before).
    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    summarizer = pipeline("summarization", **summarization_options)
    return embedder, summarizer
37
+
38
+ # ----------------------
39
+ # 3. 关键词搜索与推荐
40
+ # ----------------------
41
def search_similar_books(keywords, books, embedder, top_k=5):
    """Return the books whose title + description best match ``keywords``.

    Args:
        keywords: Query text passed to ``embedder.encode``. If the encoder
            returns several embeddings (e.g. for a list of strings), only the
            first one is used for ranking.
        books: DataFrame with ``title`` and ``description`` columns.
        embedder: Object exposing ``encode(texts, convert_to_tensor=True)``
            that returns torch tensors.
        top_k: Maximum number of rows to return.

    Returns:
        A copy of the best-matching rows of ``books`` (original index labels
        kept), ordered from most to least similar, with an added
        ``similarity`` column holding the cosine score of each row.
    """
    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    k = min(top_k, len(books))
    if k <= 0:
        empty = books.iloc[:0].copy()
        empty["similarity"] = []
        return empty

    keyword_embedding = embedder.encode(keywords, convert_to_tensor=True)
    if keyword_embedding.dim() > 1:
        keyword_embedding = keyword_embedding[0]

    # Encode all books in one call so the encoder can batch internally,
    # instead of one encode() round-trip per row.
    texts = [
        f"{title} {desc}"
        for title, desc in zip(books["title"], books["description"])
    ]
    book_embeddings = embedder.encode(texts, convert_to_tensor=True)

    # Cosine similarity of the query against every book: shape (len(books),).
    cos_scores = torch.nn.functional.cosine_similarity(
        book_embeddings, keyword_embedding.unsqueeze(0), dim=1
    )

    best = torch.topk(cos_scores, k=k)
    matches = books.iloc[best.indices.tolist()].copy()
    # Keep the scores with the rows so later stages can report them.
    matches["similarity"] = best.values.tolist()
    return matches
57
+
58
+ # ----------------------
59
+ # 4. 生成摘要并输出
60
+ # ----------------------
61
def generate_book_summaries(books, summarizer):
    """Summarise each book's description and collect display-ready records.

    Args:
        books: DataFrame with ``title`` and ``description`` columns. An
            optional ``similarity`` column supplies each row's score.
        summarizer: Callable invoked as ``summarizer(text, max_length=150)``
            and returning a list whose first item has a ``summary_text`` key.

    Returns:
        A list of dicts with ``title``, ``summary`` and ``similarity`` keys.
        ``similarity`` is the score formatted to two decimals, or ``"N/A"``
        when ``books`` carries no ``similarity`` column.
    """
    # The scores must travel with the rows: no `cos_scores` variable exists in
    # this scope, so looking one up here would raise NameError.
    has_scores = "similarity" in books.columns

    results = []
    for _, row in books.iterrows():
        summary = summarizer(row["description"], max_length=150)[0]["summary_text"]
        if has_scores:
            similarity = "{:.2f}".format(float(row["similarity"]))
        else:
            similarity = "N/A"
        results.append({
            "title": row["title"],
            "summary": summary,
            "similarity": similarity,
        })
    return results
71
+
72
+ # ----------------------
73
+ # 5. 主函数与交互
74
+ # ----------------------
75
if __name__ == "__main__":
    # Example query; replace with real user input as needed.
    user_keywords = "fantasy adventure magic"

    # Load the catalogue first, then both models.
    books = load_book_data()
    embedder, summarizer = initialize_models()

    # Rank the catalogue against the query and summarise the best matches.
    similar_books = search_similar_books(user_keywords, books, embedder)
    summaries = generate_book_summaries(similar_books, summarizer)

    # One print call per book; output is line-for-line the same as printing
    # the three fields separately.
    for i, book in enumerate(summaries, start=1):
        print(
            f"📚 Book {i}: {book['title']}",
            f"🌟 Similarity: {book['similarity']}",
            f"📝 Summary: {book['summary']}\n",
            sep="\n",
        )