Orcinus_ commited on
Commit
fe68dd4
·
1 Parent(s): 79ee2f8

Add application file

Browse files
Files changed (10) hide show
  1. .env +3 -0
  2. README.md +2 -2
  3. app.py +59 -0
  4. conversation_manager.py +170 -0
  5. get_client.py +42 -0
  6. pdf_files.tar.gz +3 -0
  7. rag_chain.py +174 -0
  8. requirements.txt +7 -0
  9. retriever_builder.py +142 -0
  10. utils.py +95 -0
.env ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ IFLYTEK_SPARK_APP_ID=dc6765b2
2
+ IFLYTEK_SPARK_API_SECRET=YWQ4NWUyYWI3NTdjZTMwNjY4YzJjMGJm
3
+ IFLYTEK_SPARK_API_KEY=81232c7c38284a9e1d4e86026ca69739
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: My Rag App
3
- emoji: 📈
4
  colorFrom: yellow
5
- colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.43.1
8
  app_file: app.py
 
1
  ---
2
  title: My Rag App
3
+ emoji: 💻
4
  colorFrom: yellow
5
+ colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.43.1
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from utils import user_asks
3
+
4
+ # Gradio 界面设置
5
+ with gr.Blocks(title="五险一金规划助手", theme=gr.themes.Soft()) as demo:
6
+ gr.Markdown("## 🧑‍🏫应届生的第一个五险一金社保规划师")
7
+
8
+ with gr.Row():
9
+ with gr.Column(scale=2):
10
+ chatbot = gr.Chatbot(label="对话框", type="messages")
11
+ user_input = gr.Textbox(label="请输入你的问题", placeholder="如:我现在即将入职私企算法工程师岗位,想知道试用期公司会给我交社保和公积金吗?", lines=3)
12
+ # 按钮可保留,但不需要 click 事件
13
+ send_button = gr.Button("发送")
14
+ # stop_button = gr.Button("⬛停止生成", variant="stop")
15
+
16
+ # with gr.Column(scale=1):
17
+ # gr.Markdown("#### 你的意图")
18
+ # user_intent = gr.Radio(["简单科普", "急需帮助", "帮我避坑"], label="当前意图", value="简单科普")
19
+
20
+ with gr.Column(scale=1):
21
+ gr.Markdown("#### 你的基本信息")
22
+ situation = gr.Dropdown(["正在求职", "即将入职", "还在上学", "实习中", "准备升学", "其他"], label="目前状态", value="即将入职")
23
+ job_input = gr.Textbox(label="你的工作/岗位", placeholder="如:程序员、教师、自由职业者等")
24
+ city = gr.Textbox(label="所在城市", placeholder="如:北京、上海")
25
+ age = gr.Textbox(label="年龄", value="25")
26
+ user_goal = gr.Textbox(label="你未来的计划是什么?", placeholder="如:我目前打算先工作,未来可能跳槽。", lines=3)
27
+ other_info = gr.Textbox(label="其他补充信息(可选)")
28
+
29
+
30
+ # 设置输入框和按钮
31
+ click_event = user_input.submit(
32
+ fn=user_asks,
33
+ inputs=[user_input, chatbot, user_goal, job_input, situation,
34
+ city, age, other_info],
35
+ outputs=[chatbot, user_input], # user_input 清空输入框
36
+ queue=True # 启用队列,处理快速输入
37
+ )
38
+
39
+ # 设置发送按钮
40
+ submit_event = send_button.click(
41
+ fn=user_asks,
42
+ inputs=[user_input, chatbot, user_goal, job_input, situation,
43
+ city, age, other_info],
44
+ outputs=[chatbot, user_input],
45
+ queue=True
46
+ )
47
+
48
+ # 设置取消按钮
49
+ # stop_button.click(
50
+ # fn=None,
51
+ # inputs=None,
52
+ # outputs=None,
53
+ # cancels=[submit_event, click_event],
54
+ # queue=False
55
+ # )
56
+
57
+ # 运行 Gradio 应用
58
+ demo.launch(share=True)
59
+
conversation_manager.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import contextlib
3
+ from typing import Optional, Dict, List
4
+ from langchain_core.runnables.config import RunnableConfig
5
+ from langchain.memory import ConversationBufferMemory
6
+ from langchain.schema import HumanMessage
7
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
+ from langchain.retrievers import MultiQueryRetriever
9
+ from rag_chain import build_user_info, build_qa_chain, rerank
10
+ from get_client import get_client
11
+ from langchain.callbacks.base import BaseCallbackHandler
12
+
13
+ class SilentMultiQueryRetriever(MultiQueryRetriever):
14
+ def invoke(self, input: str, config: Optional[RunnableConfig] = None, **kwargs):
15
+ # 静默执行多查询生成,不输出任何内容
16
+ with disable_stdout():
17
+ return super().invoke(input, config, **kwargs)
18
+ @contextlib.contextmanager
19
+ def disable_stdout():
20
+ with open(os.devnull, "w") as f:
21
+ old_stdout = os.dup(1)
22
+ os.dup2(f.fileno(), 1)
23
+ try:
24
+ yield
25
+ finally:
26
+ os.dup2(old_stdout, 1)
27
+
28
+ class StreamingHandler(BaseCallbackHandler):
29
+ def __init__(self):
30
+ self.tokens = []
31
+
32
+ def on_llm_new_token(self, token: str, **kwargs):
33
+ self.tokens.append(token)
34
+
35
+ def get_response(self):
36
+ return "".join(self.tokens)
37
+
38
+ class PlannerAgent:
39
+ def __init__(self, retriever):
40
+ self.llm = get_client()
41
+ self.retriever = retriever
42
+ self.multi_retriever = SilentMultiQueryRetriever.from_llm(
43
+ retriever=self.retriever,
44
+ llm=self.llm)
45
+ self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
46
+ self.tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")
47
+ self.model = AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-base")
48
+ # self._should_stop = True # 新增:中断标志
49
+ # print("PlannerAgent initialized.")
50
+
51
+ # def stop_generation(self):
52
+ # """设置中断标志,用于停止 stream_reply 的生成"""
53
+ # self._should_stop = True
54
+ # print("PlannerAgent: 设置中断标志为 True")
55
+
56
+ # def reset_stop_flag(self):
57
+ # """重置中断标志,以便下次生成能够正常开始"""
58
+ # self._should_stop = False
59
+ # print("PlannerAgent: 重置中断标志为 False")
60
+
61
+ def get_history(self) -> List[Dict[str, str]]:
62
+ result = []
63
+ for msg in self.memory.chat_memory.messages:
64
+ role = "user" if isinstance(msg, HumanMessage) else "assistant"
65
+ result.append({"role": role, "content": msg.content})
66
+ return result
67
+
68
+ def reply(self, user_question: str, user_info_dict: Dict) -> str:
69
+ qa_chain, _ = build_qa_chain(self.llm,
70
+ self.multi_retriever,
71
+ self.tokenizer,
72
+ self.model)
73
+
74
+ user_info_text = build_user_info(user_info_dict)
75
+ docs = self.retriever.get_relevant_documents(user_question)
76
+ reranked_docs = rerank(self.tokenizer, self.model, user_question, docs, 5)
77
+ context = "\n".join([doc.page_content for doc in reranked_docs])
78
+
79
+ self.memory.chat_memory.add_user_message(user_question)
80
+ response = qa_chain.invoke({
81
+ "user_info": user_info_text,
82
+ "context": context
83
+ })
84
+ self.memory.chat_memory.add_ai_message(response)
85
+ return response
86
+
87
+ def stream_reply(self, user_question: str, user_info_dict: Dict) -> str:
88
+ # print(f"\n--- Entering stream_reply ---")
89
+ # print(f"Message received by agent: {user_question}")
90
+ # print(f"User info received by agent: {user_info_dict}")
91
+ # self.reset_stop_flag()
92
+
93
+ qa_chain, _ = build_qa_chain(llm=self.llm,
94
+ multi_retriever=self.multi_retriever,
95
+ model=self.model,
96
+ tokenizer=self.tokenizer)
97
+
98
+ user_info_text = build_user_info(user_info_dict)
99
+ current_chat_history = self.memory.buffer_as_messages
100
+
101
+ full_response_content = "" # 用于累积完整响应以便存入 memory
102
+
103
+ # 使用 qa_chain.stream() 进行流式处理
104
+ try:
105
+ for chunk in qa_chain.stream(
106
+ {
107
+ "chat_history": current_chat_history,
108
+ "user_info": user_info_text,
109
+ "question": user_question
110
+ }
111
+ ):
112
+ if isinstance(chunk, str):
113
+ content_to_yield = chunk
114
+ elif hasattr(chunk, 'content') and isinstance(chunk.content, str):
115
+ content_to_yield = chunk.content
116
+ else:
117
+ continue
118
+
119
+ if content_to_yield:
120
+ full_response_content += content_to_yield
121
+ yield content_to_yield # 每次 yield 实时生成的令牌或小块
122
+
123
+ # 只有在没有被中断的情况下才将完整响应添加到内存
124
+ self.memory.save_context({"question": user_question}, {"answer": full_response_content})
125
+ print(f"--- 响应已保存到内存: {full_response_content[:50]}... ---")
126
+
127
+ except KeyboardInterrupt:
128
+ print("PlannerAgent: 捕获到 KeyboardInterrupt,停止生成且不保存到内存。")
129
+ pass
130
+ except Exception as e:
131
+ print(f"错误:流式处理过程中发生异常:{e}")
132
+ yield f"发生错误:{e}"
133
+ print("--- 发生错误,不保存到内存。 ---")
134
+
135
+ self.memory.chat_memory.add_user_message(user_question)
136
+ self.memory.chat_memory.add_ai_message(full_response_content)
137
+
138
+
139
+ if __name__ == "__main__":
140
+ from get_client import *
141
+ from retriever_builder import process_pdfs_to_chunks, save_embeddings
142
+
143
+ pdf_paths = "/teamspace/studios/this_studio/pdf_files"
144
+ all_chunks = process_pdfs_to_chunks(pdf_paths)
145
+ vectordb = save_embeddings(all_chunks)
146
+ retriever = vectordb.as_retriever(search_kwargs={"k": 4})
147
+ tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")
148
+ model = AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-base")
149
+
150
+ rag_chain, memory = build_qa_chain(
151
+ llm=get_client(),
152
+ multi_retriever=retriever,
153
+ model=model,
154
+ tokenizer=tokenizer
155
+ )
156
+
157
+ test_user_info = {
158
+ "job_info": "软件工程师",
159
+ "situation": "已入职试用期",
160
+ "city": "上海",
161
+ "age": 26,
162
+ }
163
+ test_question = "试用期公司会给我交社保和公积金吗?"
164
+ agent = PlannerAgent(retriever=retriever)
165
+
166
+ # 流式输出测试(如果启用流式 Spark 回调)
167
+ print("\n流式输出测试:")
168
+ for chunk in agent.stream_reply(test_question, test_user_info):
169
+ # print(chunk, end="", flush=True)
170
+ pass
get_client.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import find_dotenv, load_dotenv
3
+ from typing import Optional, Any
4
+ from langchain_community.chat_models import ChatSparkLLM
5
+ from langchain.callbacks.base import BaseCallbackHandler
6
+
7
+ load_dotenv(find_dotenv())
8
+
9
+ class ChunkPrintHandler(BaseCallbackHandler):
10
+ def on_llm_new_token(self, token: str, *, chunk: Optional[Any] = None, **kwargs: Any):
11
+ print(token, end="", flush=True)
12
+ debug = False
13
+ if debug:
14
+ print("\n", kwargs)
15
+
16
+ def get_client():
17
+ app_id = os.getenv("IFLYTEK_SPARK_APP_ID")
18
+ api_key = os.getenv("IFLYTEK_SPARK_API_KEY")
19
+ api_secret = os.getenv("IFLYTEK_SPARK_API_SECRET")
20
+
21
+ if not all([app_id, api_key, api_secret]):
22
+ raise ValueError("请确保环境变量已设置。")
23
+
24
+ llm = ChatSparkLLM(
25
+ model='Spark4.0 Ultra',
26
+ app_id=app_id,
27
+ api_key=api_key,
28
+ api_secret=api_secret,
29
+ spark_api_url="wss://spark-api.xf-yun.com/v4.0/chat",
30
+ spark_llm_domain="4.0Ultra",
31
+ streaming=True,
32
+ callbacks=[ChunkPrintHandler()]
33
+ )
34
+
35
+ return llm
36
+
37
+ if __name__ == "__main__":
38
+ client = get_client()
39
+ print("Client initialized successfully.")
40
+ # 测试模型
41
+ response = client.invoke("你好,介绍一下你自己。")
42
+ print("\nResponse:", response)
pdf_files.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03ec065bb7dd6500e7be3226ca160664039ede192958405ff653c0c05cb8d750
3
+ size 44264675
rag_chain.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableMap
3
+ from langchain.memory import ConversationBufferMemory
4
+ from langchain.chains import RetrievalQA
5
+ from langchain_core.output_parsers import StrOutputParser
6
+ from langchain.prompts import ChatPromptTemplate
7
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
+
9
+
10
+ def rerank(tokenizer, model, query, docs, top_k=5):
11
+ pairs = [(query, doc.page_content) for doc in docs]
12
+ inputs = tokenizer([f"{q} [SEP] {d}" for q, d in pairs], return_tensors='pt', padding=True, truncation=True)
13
+ scores = model(**inputs).logits.view(-1)
14
+ scored_docs = list(zip(scores, docs))
15
+ sorted_scored_docs = sorted(scored_docs, key=lambda item: item[0], reverse=True)
16
+ # ranked_docs = [doc for _, doc in sorted(zip(scores, docs), reverse=True)]
17
+ ranked_docs = [doc for score, doc in sorted_scored_docs]
18
+ return ranked_docs[:top_k]
19
+
20
+ # 获取用户信息
21
+ def build_user_info(user_info: dict) -> str:
22
+ user_text = "用户相关信息如下:"
23
+
24
+ if user_info.get("goal", "未知"):
25
+ user_text += f"\n未来计划:{user_info['goal']}"
26
+ if user_info.get("job_info", "未知"):
27
+ user_text += f"\n从事工作:{user_info['job_info']}"
28
+ if user_info.get("job_type", "未知"):
29
+ user_text += f"\n工作类型:{user_info['job_type']}"
30
+ if user_info.get("age", "未知"):
31
+ user_text += f"\n年龄:{user_info['age']}"
32
+ if user_info.get("situation", "未知"):
33
+ user_text += f"\n目前状态:{user_info['situation']}"
34
+ if user_info.get("city", "未知"):
35
+ user_text += f"\n所在城市:{user_info['city']}"
36
+ if user_info.get("other_info", "无"):
37
+ user_text += f"\n其他补充信息:{user_info['other_info']}"
38
+
39
+ return user_text
40
+
41
+ # 构建支持流式输出的 stuff QA chain
42
+ def build_qa_chain(llm, multi_retriever, model, tokenizer):
43
+ prompt = ChatPromptTemplate.from_messages([
44
+ ("system",
45
+ """
46
+ 你是一个专业的、富有同理心的“应届生社保规划小助手”。
47
+ 你的核心任务是为即将或刚刚踏入社会的中国大学应届毕业生提供清晰、易懂、可执行的“五险一金”规划和建议。
48
+ 你的回答必须始终围绕应届毕业生的视角、需求和痛点展开,将“帮助新人顺利了解、入门、避免踩坑”作为你的第一原则
49
+ 请牢记以下内容:
50
+ - 你的沟通对象是对社保和公积金几乎一无所知的“小白”,比起冗长的政策细节,他们更关心哪些因素会影响到自己的未来之路。。
51
+ - 回答要围绕“帮助新人顺利了解、入门、避免踩坑”,以安心和实用为首要目标。
52
+ - 必须保证回答直接、准确,不要回避用户问题。
53
+
54
+ 【多轮对话规则】
55
+ 1. 如果这是**第一次提问**,可以进行简短的入门科普(“是什么”“有什么用”)。
56
+ 2. 如果用户是**多轮追问**,请避免重复基础科普内容,直接进入个性化回答,围绕新问题展开。
57
+ 3. 在多轮提问时,回答要紧扣用户新的关注点,并在需要时结合之前提供过的信息进行延伸,而不是重复。
58
+ 4. 如果用户的问题只是闲聊或模糊(如“你好”),请不要执行规划步骤,而是礼貌介绍自己,并引导用户提出与社保相关的具体问题。
59
+ """),
60
+ ("placeholder", "{chat_history}"),
61
+ ("human",
62
+ "用户信息(包含了用户的工作、现状、城市和年龄等):\n{user_info}\n\n"
63
+ "相关政策资料:\n{context}\n\n"
64
+ "我的问题是:'{question}'\n\n"
65
+
66
+ "请严格按照以下步骤,为我提供一份专属的社保规划建议:\n\n"
67
+
68
+ "**首先:关于你的问题(必须优先)**\n"
69
+ "- 先用清晰、简洁的语言直接回答用户问题。"
70
+ "- 如果有明确的结论,就直说,不要绕。"
71
+ "- 如果资料不足,要说明限制,并给出下一步可行的方向。\n\n"
72
+
73
+ "**(仅限首次提问时)简明入门科普**\n"
74
+ "- 在第一次提问时,可以补充简短的背景知识,让用户理解“五险一金是什么、有什么用”。"
75
+ "- 在后续多轮追问时,省略科普,只聚焦新问题。\n\n"
76
+
77
+ "**个性化规划与预测**\n"
78
+ "- 根据用户的个人信息和相关政策,提供操作指南:用户需要做什么、准备哪些材料、可能遇到什么情况。"
79
+ "- 回答要明确到步骤和要点。\n\n"
80
+
81
+ "**避坑提醒(重点)**\n"
82
+ "- 主动提示用户在当前情境下可能遇到的常见陷阱,并说明危害 + 对策。"
83
+ "- 避坑要简洁醒目,不要过度展开和重复。\n\n"
84
+
85
+ "**特殊情况处理**\n"
86
+ "- 如果问题涉及实习期、试用期、空窗期、自由职业、考研/考公,请额外提供换位思考后的建议。\n\n"
87
+
88
+ "--- 回答要求 ---\n"
89
+ "- **同理心与易懂性:** 语言必须平和、友好,充满鼓励,让刚毕业的大学生感到安心。\n"
90
+ "- **结构化:** 使用清晰的标题和项目符号,让内容一目了然。\n"
91
+ "- **准确性:** 严格基于提供的“相关政策资料”作答,若资料不足,可以联网搜索,但请诚实说明,并提供信息来源。\n"
92
+ "- **去冗余: ** 请确保回答内容简洁明了,避免重复相同的信息。如果多个文档包含相似内容,请整合这些信息,不要重复表述。\n"
93
+ "- **特殊情况处理:** 如果我的问题只是闲聊或不明确(例如“你好”),请不要执行以上规划步骤,而是礼貌地介绍你自己(应届生社保规划小助手),并引导我提出与社保规划相关的具体问题。\n"
94
+ )
95
+ ])
96
+
97
+
98
+ memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
99
+
100
+ retrieval_pipeline = RunnableMap(
101
+ {
102
+ "question": lambda x: x["question"],
103
+ "user_info": lambda x: x["user_info"],
104
+ "chat_history": lambda x: x["chat_history"],
105
+ }
106
+ )
107
+ debug_step = RunnablePassthrough.assign(
108
+ debug_info=lambda x: print(f"Question type: {type(x['question'])}, content: {x}")
109
+ )
110
+
111
+ retrieval_step = RunnablePassthrough.assign(
112
+ docs=lambda x: multi_retriever.invoke(x["question"])
113
+ )
114
+ rerank_step = RunnablePassthrough.assign(
115
+ reranked_docs=lambda x: rerank(tokenizer, model, x["question"], x["docs"], 5)
116
+ )
117
+ context_step = RunnablePassthrough.assign(
118
+ context=lambda x: "\n".join([doc.page_content for doc in x["reranked_docs"]])
119
+ )
120
+
121
+ chain = (
122
+ retrieval_pipeline
123
+ | retrieval_step
124
+ | rerank_step
125
+ | context_step
126
+ | prompt
127
+ | llm
128
+ | StrOutputParser()
129
+ )
130
+
131
+ return chain, memory
132
+
133
+
134
+ if __name__ == "__main__":
135
+ from rag_chain import build_user_info, build_qa_chain, rerank
136
+ from get_client import get_client
137
+ from retriever_builder import process_pdfs_to_chunks, save_embeddings
138
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
139
+
140
+ llm = get_client()
141
+ pdf_paths = "../pdf_files"
142
+ all_chunks = process_pdfs_to_chunks(pdf_paths)
143
+ vectordb = save_embeddings(all_chunks,
144
+ persist_directory='../data_base/vector_db/chroma',
145
+ overwrite=False)
146
+ retriever = vectordb.as_retriever(search_kwargs={"k": 4})
147
+ tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")
148
+ model = AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-base")
149
+
150
+ rag_chain, memory = build_qa_chain(
151
+ llm=llm,
152
+ multi_retriever=retriever,
153
+ model=model,
154
+ tokenizer=tokenizer
155
+ )
156
+
157
+ test_user_info = {
158
+ "job_info": "软件工程师",
159
+ "situation": "已入职试用期",
160
+ "city": "上海",
161
+ "age": 26,
162
+ }
163
+ test_question = "试用期公司会给我交社保和公积金吗?"
164
+
165
+ print("\n--- 正在执行 RAG 链 ---")
166
+
167
+ result = rag_chain.invoke({
168
+ "question": test_question,
169
+ "user_info": build_user_info(test_user_info),
170
+ "chat_history": [] # 假设是首次提问,聊天记录为空
171
+ })
172
+
173
+ print("\n--- 成功执行!链的最终输出如下 ---")
174
+ print(result)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ langchain
3
+ langchain-community
4
+ langchain-core
5
+ transformers
6
+ torch
7
+ pypdf
retriever_builder.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from typing import List
4
+ from langchain.docstore.document import Document
5
+ from langchain_community.document_loaders import PyPDFLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain_community.embeddings import HuggingFaceEmbeddings
8
+ from langchain_community.vectorstores import Chroma
9
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
10
+ from langchain_community.retrievers import BM25Retriever
11
+ from langchain.retrievers import EnsembleRetriever
12
+
13
+ def build_or_load_vectorstore(paths: list[str], embedding_model, persist_directory):
14
+ all_docs = []
15
+ pdf_paths = [
16
+ os.path.join(paths, fname)
17
+ for fname in os.listdir(paths)
18
+ if fname.lower().endswith(".pdf")
19
+ ]
20
+
21
+ for path in pdf_paths:
22
+ loader = PyPDFLoader(path)
23
+ docs = loader.load()
24
+ all_docs.extend(docs)
25
+
26
+ splitter = RecursiveCharacterTextSplitter(chunk_size=200,
27
+ chunk_overlap=50,
28
+ separators=["\n\n", "(?<=\。 )", "\n", " ", ""],
29
+ strip_whitespace=True
30
+ )
31
+ split_docs = splitter.split_documents(all_docs)
32
+
33
+ vectordb = Chroma.from_documents(documents=split_docs, embedding=embedding_model, persist_directory=persist_directory)
34
+ vectordb.persist()
35
+ return vectordb
36
+
37
+
38
+ def process_pdfs_to_chunks(paths: List[str],
39
+ chunk_size: int = 200,
40
+ chunk_overlap: int = 50) -> List[Document]:
41
+ # 初始化分割器
42
+ splitter = RecursiveCharacterTextSplitter(
43
+ chunk_size=chunk_size,
44
+ chunk_overlap=chunk_overlap,
45
+ separators=["\n\n", "(?<=\。 )", "\n", " ", ""],
46
+ strip_whitespace=True
47
+ )
48
+
49
+ all_chunks = []
50
+ pdf_paths = [
51
+ os.path.join(paths, fname)
52
+ for fname in os.listdir(paths)
53
+ if fname.lower().endswith(".pdf")
54
+ ]
55
+
56
+ for pdf_path in pdf_paths:
57
+ loader = PyPDFLoader(pdf_path)
58
+ pages = loader.load()
59
+
60
+ for page in pages:
61
+ chunks = splitter.split_text(page.page_content)
62
+
63
+ for chunk in chunks:
64
+ # 清洗:按行 strip,然后再合并为一段文本
65
+ cleaned_chunk_text = re.sub(r'\n\s+', '\n', chunk)
66
+ lines = cleaned_chunk_text.split('\n')
67
+ cleaned_chunk = [line.strip() for line in lines]
68
+ final_page_content = ' '.join(cleaned_chunk)
69
+
70
+ if final_page_content.strip(): # 非空才添加
71
+ all_chunks.append(Document(
72
+ page_content=final_page_content,
73
+ metadata={
74
+ "source": pdf_path,
75
+ "page": page.metadata.get("page", None)
76
+ }
77
+ ))
78
+
79
+ return all_chunks
80
+
81
+
82
+ def save_embeddings(all_chunks: List[Document] | None = None,
83
+ persist_directory: str = './data_base/vector_db/chroma',
84
+ overwrite=False):
85
+
86
+ embedding_model = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-zh")
87
+
88
+ if os.path.exists(persist_directory) and not overwrite:
89
+ print("===已存在向量库,跳过保存===")
90
+ return Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
91
+
92
+ vectorstore = Chroma.from_documents(
93
+ documents=all_chunks,
94
+ embedding=embedding_model,
95
+ persist_directory=persist_directory
96
+ )
97
+
98
+ vectorstore.persist()
99
+ print(f"===向量数据库已保存到:{persist_directory}===")
100
+
101
+ return vectorstore
102
+
103
+
104
+ def get_hybrid_retriever(all_chunks: List[Document],
105
+ persist_directory: str = './data_base/vector_db/chroma',
106
+ k: int = 4) -> EnsembleRetriever:
107
+ # 稠密 embedding
108
+ embedding_model = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-zh")
109
+
110
+ # 构建/加载 Chroma
111
+ vectorstore = Chroma.from_documents(
112
+ documents=all_chunks,
113
+ embedding=embedding_model,
114
+ persist_directory=persist_directory
115
+ )
116
+ vectorstore.persist()
117
+
118
+ dense_retriever = vectorstore.as_retriever(search_kwargs={"k": k})
119
+
120
+ # 构建稀疏 BM25
121
+ bm25_retriever = BM25Retriever.from_documents(all_chunks)
122
+ bm25_retriever.k = k
123
+
124
+ # 融合
125
+ hybrid_retriever = EnsembleRetriever(
126
+ retrievers=[dense_retriever, bm25_retriever],
127
+ weights=[0.4, 0.6]
128
+ )
129
+ return hybrid_retriever
130
+
131
+
132
+ if __name__ == "__main__":
133
+ embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-base-zh")
134
+ VECTORDB_DIR = "./data_base/vector_db/chroma/"
135
+
136
+ pdf_paths = "./pdf_files/"
137
+ all_chunks = process_pdfs_to_chunks(pdf_paths)
138
+ vectorstore = save_embeddings(all_chunks)
139
+
140
+ # 获取混合检索器
141
+ hybrid_retriever = get_hybrid_retriever(all_chunks)
142
+ print("===混合检索器已准备就绪===")
utils.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from retriever_builder import process_pdfs_to_chunks, save_embeddings
4
+ from conversation_manager import PlannerAgent
5
+
6
+ # 部署时解压pdf文件
7
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
8
+ os.system(f"tar -xzvf /home/user/app/pdf_files.tar.gz")
9
+
10
+ # 初始化向量数据库
11
+ pdf_paths = "./pdf_files"
12
+ all_chunks = process_pdfs_to_chunks(pdf_paths)
13
+ vectordb = save_embeddings(all_chunks,
14
+ persist_directory='./data_base/vector_db/chroma',
15
+ overwrite=True) # 是否需要复写(是否有新增)
16
+ retriever = vectordb.as_retriever(search_kwargs={"k": 4})
17
+
18
+ # 用于缓存 agent 实例(支持多轮)
19
+ agent = PlannerAgent(retriever=retriever)
20
+
21
+ def classify_job_type(job_name: str) -> str:
22
+ job_name = job_name.lower()
23
+ job_name = job_name.split('(')[0]
24
+
25
+ # 新业态关键词
26
+ new_economy_jobs = ["外卖", "快递", "网约车", "主播", "骑手", "平台", "直播", "自媒体"]
27
+ # 灵活就业关键词
28
+ flexible_jobs = ["自由", "个体户", "兼职", "临时工", "接单", "顾问", "自由职业者"]
29
+ # 城镇职工关键词(白领/技术类等)
30
+ urban_jobs = ["公司", "企业", "工程师", "职员", "护士", "程序员"]
31
+ # 城乡居民关键词
32
+ rural_jobs = ["农民", "养殖户", "渔民", "果农", "农业工人", "林业工人", "乡村医生"]
33
+
34
+ for kw in new_economy_jobs:
35
+ if kw in job_name:
36
+ return "新业态就业"
37
+ for kw in flexible_jobs:
38
+ if kw in job_name:
39
+ return "灵活就业"
40
+ for kw in urban_jobs:
41
+ if kw in job_name:
42
+ return "城镇职工"
43
+ for kw in rural_jobs:
44
+ if kw in job_name:
45
+ return "城乡居民"
46
+ if job_name not in new_economy_jobs and job_name not in flexible_jobs and job_name not in urban_jobs:
47
+ return "其他"
48
+ # 默认值
49
+ return "其他"
50
+
51
+ def user_asks(message, history, *args):
52
+
53
+ user_goal = args[0] if len(args) > 0 else "未知"
54
+ job_input = args[1] if len(args) > 1 else "未知"
55
+ situation = args[2] if len(args) > 2 else "未知"
56
+ city = args[3] if len(args) > 3 else "未知"
57
+ age = args[4] if len(args) > 4 else "未知"
58
+ other_info = args[5] if len(args) > 5 else "未知"
59
+
60
+ # 自动分类
61
+ job_type = classify_job_type(job_input)
62
+
63
+ user_info = {
64
+ "goal": user_goal,
65
+ "job_info": job_input,
66
+ "job_type": job_type,
67
+ "city": city,
68
+ "age": age,
69
+ "situation": situation,
70
+ "other_info": other_info,
71
+ }
72
+
73
+ current_history = history + [{"role": "user", "content": message}, {"role": "assistant", "content": ""}]
74
+
75
+ # 显示用户消息,同时清空输入框
76
+ yield current_history, ""
77
+
78
+ full_response = ""
79
+
80
+ try:
81
+ for chunk in agent.stream_reply(message, user_info):
82
+
83
+ print(f"--- Received chunk from agent: '{chunk}' ---")
84
+ full_response += chunk
85
+ current_history[-1]["content"] = full_response
86
+ yield current_history, ""
87
+ print("--- Agent streaming loop finished ---")
88
+ except Exception as e:
89
+ print(f"!!! ERROR: An exception occurred during agent.stream_reply or its iteration: {e}")
90
+ import traceback
91
+ traceback.print_exc()
92
+ current_history[-1]["content"] = f"抱歉,系统内部发生错误,无法生成回复。错误详情:{e}"
93
+ yield current_history, ""
94
+ print("--- Exiting user_asks ---")
95
+