File size: 4,001 Bytes
bb78694 f17e48b bb78694 f17e48b bb78694 f17e48b bb78694 b5ffc72 bb78694 8fe9808 b5ffc72 bb78694 4505c1f bb78694 4505c1f bb78694 b5ffc72 bb78694 4505c1f bb78694 8fe9808 bb78694 8fe9808 bb78694 8fe9808 bb78694 8fe9808 bb78694 8fe9808 4505c1f bb78694 b5ffc72 bb78694 8fe9808 bb78694 8fe9808 bb78694 b5ffc72 bb78694 b5ffc72 bb78694 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# app.py
from datasets import load_dataset
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
import re
# Global configuration.
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"  # sentence-embedding model
SUMMARIZER_NAME = "facebook/bart-large-cnn"  # summarization model
DATASET_NAME = "bookcorpus"
CACHE_DIR = "./data-cache"  # NOTE(review): defined but never used in this file — confirm intent

# Preload models once at import time so per-request calls don't re-download them.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
summarizer = pipeline("summarization", SUMMARIZER_NAME)
# Load and preprocess the book data.
def load_books():
    """Stream the dataset and collect long passages with a best-effort title.

    Returns:
        list[dict]: entries with keys ``"text"`` (the passage) and
        ``"title"`` (first double-quoted phrase in the opening 200
        characters, or ``"Untitled Book"`` when none is found).
    """
    stream = load_dataset(DATASET_NAME, split='train', streaming=True)
    collected = []
    for record in stream.take(50000):  # cap the stream at 50k records
        passage = record['text'].strip()
        # Skip short passages; only substantial texts are worth recommending.
        if len(passage) <= 500:
            continue
        quoted = re.findall(r'"([^"]*)"', passage[:200])
        collected.append({
            "text": passage,
            "title": quoted[0] if quoted else "Untitled Book",
        })
    return collected
# Generate semantic embeddings.
def get_embeddings(texts):
    """Encode a list of strings into L2-normalized sentence embeddings.

    Args:
        texts: list of strings to embed (padded/truncated to 512 tokens).

    Returns:
        torch.Tensor of shape (len(texts), hidden), unit-normalized per row.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    )
    # No gradients needed for pure inference.
    with torch.no_grad():
        model_out = model(**encoded)
    pooled = mean_pooling(model_out, encoded['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)
# Mask-aware mean pooling.
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over real (non-padding) tokens.

    Fixes a NameError in the original: the sum used ``token_embedding``
    (undefined) instead of ``token_embeddings``.

    Args:
        model_output: transformer output exposing ``last_hidden_state``
            of shape (batch, seq_len, hidden).
        attention_mask: (batch, seq_len) tensor, 1 for real tokens, 0 for padding.

    Returns:
        torch.Tensor of shape (batch, hidden): per-sequence mean embedding.
    """
    token_embeddings = model_output.last_hidden_state
    # Broadcast the mask across the hidden dim so padding contributes zero.
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Clamp guards against division by zero for an all-padding row.
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
# Abstractive summary generation.
def generate_summary(text):
    """Summarize *text* with the BART summarization pipeline.

    Fixes a tokenizer mismatch in the original: it encoded and decoded with
    the module-global sentence-embedding ``tokenizer`` (mpnet) while
    generating with the BART model, so the token ids belonged to a different
    vocabulary. The pipeline's own tokenizer is used for both steps here.

    Args:
        text: source document to summarize (truncated to 1024 tokens).

    Returns:
        str: generated summary, 50-150 tokens, beam search with 4 beams.
    """
    bart_tokenizer = summarizer.tokenizer
    inputs = bart_tokenizer(
        "summarize: " + text,
        max_length=1024,
        truncation=True,
        return_tensors="pt"
    )
    summary_ids = summarizer.model.generate(
        inputs.input_ids,
        max_length=150,
        min_length=50,
        length_penalty=2.0,  # favor longer summaries
        num_beams=4,
        early_stopping=True
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Core recommendation logic.
def recommend_books(keywords, top_k=5):
    """Recommend the *top_k* books most similar to the given keywords.

    Args:
        keywords: comma-separated keyword string (e.g. ``"fantasy, magic"``);
            at least two keywords are required.
        top_k: number of recommendations to return.

    Returns:
        list[dict] with keys "title", "summary", "score" on success, or a
        plain error string when fewer than two keywords are supplied.
    """
    # Sanitize input: keep word chars, whitespace and commas, then split.
    keywords = re.sub(r'[^\w\s,]', '', keywords).lower()
    keywords = [k.strip() for k in keywords.split(',') if k.strip()]
    if len(keywords) < 2:
        return "❗ Please enter at least 2 keywords (e.g. 'fantasy, magic')"

    keyword_emb = get_embeddings([" ".join(keywords)]).mean(dim=0).cpu().numpy()

    # Book embeddings are query-independent; the original re-encoded the
    # entire corpus on every request. Compute once and cache on the
    # function object.
    book_embs = getattr(recommend_books, "_book_embs", None)
    if book_embs is None:
        book_embs = get_embeddings(
            [f"{b['title']} {b['text']}" for b in books]
        ).cpu().numpy()
        recommend_books._book_embs = book_embs

    # Cosine similarity, then take the top_k highest-scoring indices.
    sim_scores = cosine_similarity(keyword_emb.reshape(1, -1), book_embs)[0]
    top_indices = np.argsort(sim_scores)[-top_k:][::-1]

    results = []
    for idx in top_indices:
        book = books[idx]
        results.append({
            "title": book['title'],
            "summary": generate_summary(book['text']),
            "score": f"{sim_scores[idx]:.2f}",
        })
    return results
# Gradio UI: keyword textbox in, JSON list of recommendations out.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 智能图书推荐系统")
    with gr.Row():
        inputs = gr.Textbox(label="输入关键词(用逗号分隔)", placeholder="例如:sci-fi, time travel")
        outputs = gr.JSON(label="推荐结果")
    # Clickable example queries that pre-fill the textbox.
    examples = gr.Examples(
        examples=[
            ["romance, paris"],
            ["mystery, detective"],
            ["science fiction, space opera"]
        ],
        inputs=[inputs]
    )
    # Pressing Enter in the textbox runs the recommender.
    inputs.submit(
        fn=recommend_books,
        inputs=inputs,
        outputs=outputs
    )
# Initialize data at import time so the UI has the corpus before serving.
print("Loading book data...")
books = load_books()
print(f"Loaded {len(books)} books")

if __name__ == "__main__":
    # Bind to all interfaces on Gradio's default port.
    demo.launch(server_name="0.0.0.0", server_port=7860)