File size: 4,182 Bytes
59ea9db 101fcf4 59ea9db 101fcf4 0aee8ad 59ea9db 101fcf4 59ea9db 101fcf4 59ea9db 101fcf4 59ea9db 101fcf4 59ea9db a67409b 0f0afda a67409b 0f0afda 0aee8ad 0f0afda a67409b 0aee8ad 0f0afda 0aee8ad 0f0afda 59ea9db 101fcf4 59ea9db 101fcf4 59ea9db a67409b 16873bc 59ea9db 101fcf4 59ea9db 3783dc0 59ea9db 101fcf4 16873bc 101fcf4 16873bc 101fcf4 16873bc 3783dc0 59ea9db 16873bc 59ea9db 0aee8ad 59ea9db 16873bc 59ea9db |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import os
import torch
import numpy as np
import faiss
import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModel, pipeline
from ebooklib import epub
from bs4 import BeautifulSoup
# ===== Embedding model =====
# Chinese BGE-small sentence-embedding model; both model and tokenizer are
# fetched from the Hugging Face Hub at import time (network side effect).
embed_model = AutoModel.from_pretrained(
    "BAAI/bge-small-zh", trust_remote_code=True
)
embed_tokenizer = AutoTokenizer.from_pretrained(
    "BAAI/bge-small-zh", trust_remote_code=True
)
def embed_text(text):
    """Embed *text* with the BGE model and return the [CLS] vector.

    The input is truncated to 512 tokens; the returned value is a 1-D
    numpy array taken from the first ([CLS]) position of the last
    hidden state.
    """
    encoded = embed_tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512
    )
    with torch.no_grad():
        hidden = embed_model(**encoded).last_hidden_state
    # [CLS] token of the single batch element
    cls_vector = hidden[0, 0, :]
    return cls_vector.numpy()
# ===== Generation model (Qwen 1.8B) =====
# CPU-only text-generation pipeline (device=-1); model weights are
# downloaded from the Hugging Face Hub at import time.
generator = pipeline(
    "text-generation",
    model="Qwen/Qwen1.5-1.8B-Chat",
    device=-1
)
# ===== Global state =====
index = None  # FAISS index, (re)built by load_file()
docs = []  # chunk records: {"text": ..., "source": "chunk_<i>"}
# ===== File parsing =====
def load_file(file_obj):
    """Parse an uploaded PDF/TXT/EPUB file, chunk it, and (re)build the index.

    Side effects: rebinds the module globals ``docs`` (chunk records) and
    ``index`` (FAISS L2 index over the chunk embeddings).

    Returns a single status-message string. (Bug fix: the original returned
    2-tuples ``(msg, None)``, but the Gradio click handler binds exactly one
    output component, so Gradio rejected the extra return value.)
    """
    global index, docs
    docs = []
    text_data = ""
    # Gradio may hand us a file-like object with .name, or a plain path string.
    file_path = file_obj.name if hasattr(file_obj, "name") else file_obj
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            reader = PdfReader(file_path)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:  # extract_text() may return None for image pages
                    text_data += page_text + "\n"
        elif ext == ".txt":
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                text_data = f.read()
        elif ext == ".epub":
            book = epub.read_epub(file_path)
            for item in book.get_items():
                # 9 == ebooklib.ITEM_DOCUMENT (XHTML content documents)
                if item.get_type() == 9:
                    soup = BeautifulSoup(item.get_content(), "html.parser")
                    text_data += soup.get_text() + "\n"
        else:
            return "仅支持 PDF / TXT / EPUB 文件"
    except Exception as e:
        return f"文件解析失败: {str(e)}"
    if not text_data.strip():
        return "未能从文件中提取到文本"
    # Chunking: 350 chars per chunk with 100-char overlap.
    chunk_size = 350
    overlap = 100
    step = chunk_size - overlap  # must stay positive or chunking cannot advance
    chunks = [text_data[start:start + chunk_size]
              for start in range(0, len(text_data), step)]
    docs = [{"text": chunk, "source": f"chunk_{i}"} for i, chunk in enumerate(chunks)]
    # Embed & index. FAISS requires float32 input; cast explicitly.
    doc_embeddings = np.array(
        [embed_text(d["text"]) for d in docs], dtype="float32"
    )
    index = faiss.IndexFlatL2(doc_embeddings.shape[1])
    index.add(doc_embeddings)
    return f"已加载 {len(docs)} 个文本块"
# ===== RAG query =====
def rag_query(query):
    """Answer *query* by retrieving top chunks from the index and generating.

    Returns the model's answer string, or a status message when no
    knowledge base has been built yet.
    """
    if index is None or not docs:
        return "请先上传文件并构建知识库"
    # FAISS expects a (1, dim) float32 query matrix.
    q_emb = embed_text(query).reshape(1, -1).astype("float32")
    # Bug fix: never request more neighbours than there are documents —
    # FAISS pads missing results with index -1, and docs[-1] would then
    # silently return the wrong (last) chunk.
    k = min(8, len(docs))
    D, I = index.search(q_emb, k=k)
    retrieved = [docs[i]["text"] for i in I[0] if i >= 0]
    context = "\n".join([f"[{idx+1}] {txt}" for idx, txt in enumerate(retrieved)])
    prompt = f"""已知信息:
{context}
问题:{query}
请严格按照以下格式输出:
【结论】
用 2-3 句话总结所有引用片段的关键信息,形成一个完整结论。
【详细说明】
整合所有引用片段的细节,分段描述,并在每个关键信息后标注引用编号。
无法回答时直接说“我不知道”。
【引用片段】
逐条列出引用编号及对应的原文。
"""
    # Bug fix: max_length counts PROMPT tokens too — a prompt built from
    # eight 350-char chunks exceeds 800 tokens, leaving no room to generate.
    # max_new_tokens bounds only the generated continuation.
    # return_full_text=False stops the pipeline from echoing the whole
    # prompt back into the answer.
    result = generator(
        prompt, max_new_tokens=800, do_sample=False, return_full_text=False
    )
    answer = result[0]["generated_text"]
    return answer
# ===== Gradio UI =====
# Wires the two handlers to the UI: a file-upload + button row that builds
# the knowledge base, and a question box that triggers retrieval on submit.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 完整性增强版 RAG(PDF/TXT/EPUB 支持 + 结论 + 引用)")
    with gr.Row():
        upload_box = gr.File(label="上传 PDF / TXT / EPUB 文件")
        build_button = gr.Button("构建知识库")
    status_box = gr.Textbox(label="状态")
    question_box = gr.Textbox(label="输入你的问题")
    answer_box = gr.Textbox(label="回答", lines=15)
    build_button.click(load_file, inputs=upload_box, outputs=status_box)
    question_box.submit(rag_query, inputs=question_box, outputs=answer_box)

if __name__ == "__main__":
    demo.launch()
|