wayne0603 committed on
Commit
0aee8ad
·
verified ·
1 Parent(s): f955893

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -7
app.py CHANGED
@@ -5,6 +5,8 @@ import faiss
5
  import gradio as gr
6
  from PyPDF2 import PdfReader
7
  from transformers import AutoTokenizer, AutoModel, pipeline
 
 
8
 
9
  # ===== 嵌入模型 =====
10
  embed_model = AutoModel.from_pretrained(
@@ -47,11 +49,21 @@ def load_file(file_obj):
47
  page_text = page.extract_text()
48
  if page_text:
49
  text_data += page_text + "\n"
 
50
  elif ext == ".txt":
51
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
52
  text_data = f.read()
 
 
 
 
 
 
 
 
53
  else:
54
- return "仅支持 PDF TXT 文件", None
 
55
  except Exception as e:
56
  return f"文件解析失败: {str(e)}", None
57
 
@@ -82,7 +94,7 @@ def rag_query(query):
82
  if index is None or not docs:
83
  return "请先上传文件并构建知识库"
84
  q_emb = embed_text(query).reshape(1, -1)
85
- D, I = index.search(q_emb, k=5) # Top-K=5
86
  retrieved = [docs[i]["text"] for i in I[0]]
87
  context = "\n".join([f"[{idx+1}] {txt}" for idx, txt in enumerate(retrieved)])
88
 
@@ -92,25 +104,25 @@ def rag_query(query):
92
  问题:{query}
93
 
94
  要求:
95
- 1. 仅依据已知信息回答
96
  2. 无法回答时直接说“我不知道”
97
  3. 在回答中标注引用的片段编号
98
  """
99
 
100
- result = generator(prompt, max_length=300, do_sample=False)
101
  answer = result[0]["generated_text"]
102
 
103
  return f"回答:\n{answer}\n\n参考片段:\n{context}"
104
 
105
  # ===== Gradio 界面 =====
106
  with gr.Blocks() as demo:
107
- gr.Markdown("## 📚 强版 RAG(Qwen 1.8B + 引用显示)")
108
  with gr.Row():
109
- file_input = gr.File(label="上传 PDF TXT 文件")
110
  load_btn = gr.Button("构建知识库")
111
  status = gr.Textbox(label="状态")
112
  query_input = gr.Textbox(label="输入你的问题")
113
- answer_output = gr.Textbox(label="回答", lines=10)
114
  load_btn.click(load_file, inputs=file_input, outputs=status)
115
  query_input.submit(rag_query, inputs=query_input, outputs=answer_output)
116
 
 
5
  import gradio as gr
6
  from PyPDF2 import PdfReader
7
  from transformers import AutoTokenizer, AutoModel, pipeline
8
+ from ebooklib import epub
9
+ from bs4 import BeautifulSoup
10
 
11
  # ===== 嵌入模型 =====
12
  embed_model = AutoModel.from_pretrained(
 
49
  page_text = page.extract_text()
50
  if page_text:
51
  text_data += page_text + "\n"
52
+
53
  elif ext == ".txt":
54
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
55
  text_data = f.read()
56
+
57
+ elif ext == ".epub":
58
+ book = epub.read_epub(file_path)
59
+ for item in book.get_items():
60
+ if item.get_type() == 9: # ITEM_DOCUMENT
61
+ soup = BeautifulSoup(item.get_content(), "html.parser")
62
+ text_data += soup.get_text() + "\n"
63
+
64
  else:
65
+ return "仅支持 PDF / TXT / EPUB 文件", None
66
+
67
  except Exception as e:
68
  return f"文件解析失败: {str(e)}", None
69
 
 
94
  if index is None or not docs:
95
  return "请先上传文件并构建知识库"
96
  q_emb = embed_text(query).reshape(1, -1)
97
+ D, I = index.search(q_emb, k=8) # Top-K=8
98
  retrieved = [docs[i]["text"] for i in I[0]]
99
  context = "\n".join([f"[{idx+1}] {txt}" for idx, txt in enumerate(retrieved)])
100
 
 
104
  问题:{query}
105
 
106
  要求:
107
+ 1. 整合所有引用片段的信息回答
108
  2. 无法回答时直接说“我不知道”
109
  3. 在回答中标注引用的片段编号
110
  """
111
 
112
+ result = generator(prompt, max_length=500, do_sample=False)
113
  answer = result[0]["generated_text"]
114
 
115
  return f"回答:\n{answer}\n\n参考片段:\n{context}"
116
 
117
  # ===== Gradio 界面 =====
118
  with gr.Blocks() as demo:
119
+ gr.Markdown("## 📚 完整性增强版 RAG(PDF/TXT/EPUB 支持 + 引用显示)")
120
  with gr.Row():
121
+ file_input = gr.File(label="上传 PDF / TXT / EPUB 文件")
122
  load_btn = gr.Button("构建知识库")
123
  status = gr.Textbox(label="状态")
124
  query_input = gr.Textbox(label="输入你的问题")
125
+ answer_output = gr.Textbox(label="回答", lines=12)
126
  load_btn.click(load_file, inputs=file_input, outputs=status)
127
  query_input.submit(rag_query, inputs=query_input, outputs=answer_output)
128