Create app
Browse files
app
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 导入必要库
|
| 2 |
+
from datasets import load_dataset
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import torch
|
| 5 |
+
from sentence_transformers import SentenceTransformer, util
|
| 6 |
+
from transformers import pipeline
|
| 7 |
+
|
| 8 |
+
# ----------------------
|
| 9 |
+
# 1. 加载数据集
|
| 10 |
+
# ----------------------
|
| 11 |
+
def load_book_data(max_rows=1000, max_chars=5000):
    """Load a small demo slice of the ``bookcorpus`` dataset as a DataFrame.

    Args:
        max_rows: Maximum number of dataset rows to read (default 1000, which
            keeps the demo fast).
        max_chars: Descriptions longer than this many characters are cut off.

    Returns:
        pandas.DataFrame with ``title`` and ``description`` columns, rows
        containing missing values removed, and a fresh 0..n-1 index. May hold
        fewer than ``max_rows`` rows if some of the rows read had missing
        values.

    Raises:
        KeyError: If the dataset does not provide ``title`` and ``text``.
    """
    # Ask for just the leading slice instead of converting the whole train
    # split to a DataFrame only to discard all but the first rows.
    dataset = load_dataset("bookcorpus", split=f"train[:{max_rows}]")
    frame = pd.DataFrame(dataset)

    # NOTE(review): the published bookcorpus dataset may expose only a `text`
    # column -- confirm. Fail with an explicit message rather than an opaque
    # pandas indexing error.
    missing = [name for name in ("title", "text") if name not in frame.columns]
    if missing:
        raise KeyError(
            f"bookcorpus is missing expected column(s) {missing}; "
            f"available columns: {list(frame.columns)}"
        )

    books = (
        frame[["title", "text"]]
        .rename(columns={"text": "description"})
        .dropna()
        .head(max_rows)
        # Make row labels coincide with row positions after dropna().
        .reset_index(drop=True)
    )
    # assign() returns a new frame, avoiding a write onto a head() slice.
    return books.assign(description=books["description"].str[:max_chars])
|
| 20 |
+
|
| 21 |
+
# ----------------------
|
| 22 |
+
# 2. 初始化模型
|
| 23 |
+
# ----------------------
|
| 24 |
+
def initialize_models():
    """Create the two models used by the demo.

    Returns:
        tuple: ``(embedder, summarizer)`` -- a ``SentenceTransformer`` used
        for semantic search and a ``transformers`` summarization pipeline.
    """
    # Generation settings handed to the summarization pipeline.
    summarization_options = {
        "model": "facebook/bart-large-cnn",
        "max_length": 150,
        "min_length": 30,
        "do_sample": False,
    }
    # Tuple elements are evaluated left to right: embedder first, then the
    # summarization pipeline.
    return (
        SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
        pipeline("summarization", **summarization_options),
    )
|
| 37 |
+
|
| 38 |
+
# ----------------------
|
| 39 |
+
# 3. 关键词搜索与推荐
|
| 40 |
+
# ----------------------
|
| 41 |
+
def search_similar_books(keywords, books, embedder, top_k=5):
    """Return the books most similar to ``keywords`` by cosine similarity.

    Args:
        keywords: Query text handed to ``embedder.encode``. If a list of
            strings is given, only the first query is used for ranking.
        books: DataFrame with ``title`` and ``description`` columns.
        embedder: Object exposing ``encode(texts, convert_to_tensor=True)``
            that returns a tensor (one row per text for list input).
        top_k: Maximum number of rows to return; clamped to ``len(books)``.

    Returns:
        A copy of the best-matching rows of ``books``, ordered from most to
        least similar, with an extra float ``similarity`` column holding each
        row's cosine score.
    """
    if len(books) == 0 or top_k <= 0:
        # Nothing to rank: return an empty frame with the same schema.
        empty = books.iloc[:0].copy()
        empty["similarity"] = pd.Series(dtype="float64")
        return empty

    # Encode all books in a single batched call rather than one by one.
    texts = [
        f"{title} {desc}"
        for title, desc in zip(books["title"], books["description"])
    ]
    book_embeddings = embedder.encode(texts, convert_to_tensor=True)

    keyword_embedding = embedder.encode(keywords, convert_to_tensor=True)
    # Normalise to shape (1, dim); for list input keep only the first query.
    query = keyword_embedding.reshape(-1, keyword_embedding.shape[-1])[:1]

    # Broadcasts the single query row against every book row -> shape (n,).
    cos_scores = torch.nn.functional.cosine_similarity(
        query, book_embeddings, dim=1
    )

    # torch.topk raises if k exceeds the number of candidates.
    best = torch.topk(cos_scores, k=min(top_k, len(texts)))

    result = books.iloc[best.indices.tolist()].copy()
    # Keep the scores with the rows so callers can report them.
    result["similarity"] = best.values.tolist()
    return result
|
| 57 |
+
|
| 58 |
+
# ----------------------
|
| 59 |
+
# 4. 生成摘要并输出
|
| 60 |
+
# ----------------------
|
| 61 |
+
def generate_book_summaries(books, summarizer, cos_scores=None):
    """Summarise each book and collect title, summary and similarity.

    Args:
        books: DataFrame with ``title`` and ``description`` columns, and
            optionally a numeric ``similarity`` column.
        summarizer: Callable that takes a text plus keyword options and
            returns a list whose first item has a ``"summary_text"`` key.
        cos_scores: Optional sequence or tensor of similarity scores indexed
            by the row labels of ``books``. Takes precedence over the
            ``similarity`` column when given.

    Returns:
        list[dict]: One dict per row with keys ``title``, ``summary`` and
        ``similarity``. ``similarity`` is formatted to two decimals, or is
        ``"N/A"`` when no score is available.
    """
    has_similarity_column = "similarity" in books.columns
    results = []
    for idx, row in books.iterrows():
        # Truncate over-long inputs to the model's limit instead of failing.
        summary = summarizer(
            row["description"], max_length=150, truncation=True
        )[0]["summary_text"]

        # The score must come from an argument or the row itself; there is no
        # module-level `cos_scores` to read from.
        if cos_scores is not None:
            score = cos_scores[idx]
        elif has_similarity_column:
            score = row["similarity"]
        else:
            score = None

        results.append({
            "title": row["title"],
            "summary": summary,
            # Optional: similarity score for display.
            "similarity": "N/A" if score is None else "{:.2f}".format(float(score)),
        })
    return results
|
| 71 |
+
|
| 72 |
+
# ----------------------
|
| 73 |
+
# 5. 主函数与交互
|
| 74 |
+
# ----------------------
|
| 75 |
+
if __name__ == "__main__":
    # Build the catalogue and both models before handling the query.
    catalogue = load_book_data()
    embedder, summarizer = initialize_models()

    # Example keywords; replace with real user input as needed.
    user_keywords = "fantasy adventure magic"

    # Rank the catalogue against the query, then summarise the best matches.
    matches = search_similar_books(user_keywords, catalogue, embedder)

    # Print one three-line entry per book, followed by a blank line.
    for rank, entry in enumerate(generate_book_summaries(matches, summarizer), 1):
        print(
            f"📚 Book {rank}: {entry['title']}",
            f"🌟 Similarity: {entry['similarity']}",
            f"📝 Summary: {entry['summary']}\n",
            sep="\n",
        )
|