Spaces:

wayne0603
/

space

Sleeping

App Files Community

wayne0603 committed on Sep 1, 2025

Commit

0aee8ad

verified ·

1 Parent(s): f955893

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -7

app.py CHANGED Viewed

@@ -5,6 +5,8 @@ import faiss
 import gradio as gr
 from PyPDF2 import PdfReader
 from transformers import AutoTokenizer, AutoModel, pipeline
 # ===== 嵌入模型 =====
 embed_model = AutoModel.from_pretrained(
@@ -47,11 +49,21 @@ def load_file(file_obj):
                 page_text = page.extract_text()
                 if page_text:
                     text_data += page_text + "\n"
         elif ext == ".txt":
             with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                 text_data = f.read()
         else:
-            return "仅支持 PDF 或 TXT 文件", None
     except Exception as e:
         return f"文件解析失败: {str(e)}", None
@@ -82,7 +94,7 @@ def rag_query(query):
     if index is None or not docs:
         return "请先上传文件并构建知识库"
     q_emb = embed_text(query).reshape(1, -1)
-    D, I = index.search(q_emb, k=5)  # Top-K=5
     retrieved = [docs[i]["text"] for i in I[0]]
     context = "\n".join([f"[{idx+1}] {txt}" for idx, txt in enumerate(retrieved)])
@@ -92,25 +104,25 @@ def rag_query(query):
 问题：{query}
 要求：
-1. 仅依据已知信息回答
 2. 无法回答时直接说“我不知道”
 3. 在回答中标注引用的片段编号
 """
-    result = generator(prompt, max_length=300, do_sample=False)
     answer = result[0]["generated_text"]
     return f"回答：\n{answer}\n\n参考片段：\n{context}"
 # ===== Gradio 界面 =====
 with gr.Blocks() as demo:
-    gr.Markdown("## 📚 加强版 RAG（Qwen 1.8B + 引用显示）")
     with gr.Row():
-        file_input = gr.File(label="上传 PDF 或 TXT 文件")
         load_btn = gr.Button("构建知识库")
     status = gr.Textbox(label="状态")
     query_input = gr.Textbox(label="输入你的问题")
-    answer_output = gr.Textbox(label="回答", lines=10)
     load_btn.click(load_file, inputs=file_input, outputs=status)
     query_input.submit(rag_query, inputs=query_input, outputs=answer_output)

 import gradio as gr
 from PyPDF2 import PdfReader
 from transformers import AutoTokenizer, AutoModel, pipeline
+from ebooklib import epub
+from bs4 import BeautifulSoup
 # ===== 嵌入模型 =====
 embed_model = AutoModel.from_pretrained(
                 page_text = page.extract_text()
                 if page_text:
                     text_data += page_text + "\n"
         elif ext == ".txt":
             with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                 text_data = f.read()
+        elif ext == ".epub":
+            book = epub.read_epub(file_path)
+            for item in book.get_items():
+                if item.get_type() == 9:  # ITEM_DOCUMENT
+                    soup = BeautifulSoup(item.get_content(), "html.parser")
+                    text_data += soup.get_text() + "\n"
         else:
+            return "仅支持 PDF / TXT / EPUB 文件", None
     except Exception as e:
         return f"文件解析失败: {str(e)}", None
     if index is None or not docs:
         return "请先上传文件并构建知识库"
     q_emb = embed_text(query).reshape(1, -1)
+    D, I = index.search(q_emb, k=8)  # Top-K=8
     retrieved = [docs[i]["text"] for i in I[0]]
     context = "\n".join([f"[{idx+1}] {txt}" for idx, txt in enumerate(retrieved)])
 问题：{query}
 要求：
+1. 整合所有引用片段的信息回答
 2. 无法回答时直接说“我不知道”
 3. 在回答中标注引用的片段编号
 """
+    result = generator(prompt, max_length=500, do_sample=False)
     answer = result[0]["generated_text"]
     return f"回答：\n{answer}\n\n参考片段：\n{context}"
 # ===== Gradio 界面 =====
 with gr.Blocks() as demo:
+    gr.Markdown("## 📚 完整性增强版 RAG（PDF/TXT/EPUB 支持 + 引用显示）")
     with gr.Row():
+        file_input = gr.File(label="上传 PDF / TXT / EPUB 文件")
         load_btn = gr.Button("构建知识库")
     status = gr.Textbox(label="状态")
     query_input = gr.Textbox(label="输入你的问题")
+    answer_output = gr.Textbox(label="回答", lines=12)
     load_btn.click(load_file, inputs=file_input, outputs=status)
     query_input.submit(rag_query, inputs=query_input, outputs=answer_output)