Spaces:
Paused
Paused
Upload 7 files
Browse files- RAG_Learning_Assistant_with_Streaming.py +515 -0
- knowledge_base.md +340 -0
- query_log.csv +122 -0
- requirements.txt +6 -0
- vector_database.csv +0 -0
- vector_metadata.json +0 -0
- vectorize_knowledge_base.py +584 -0
RAG_Learning_Assistant_with_Streaming.py
ADDED
|
@@ -0,0 +1,515 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from typing import List, Dict, Tuple, Generator, Set
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
from vectorize_knowledge_base import KnowledgeBaseVectorizer
|
| 6 |
+
import json
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class RAGLearningAssistant:
|
| 12 |
+
def __init__(self, api_key: str, model: str = "gpt-4.1-nano-2025-04-14", vector_db_path: str = ""):
    """
    Initialise the RAG learning assistant.

    Builds the OpenAI client and the knowledge-base vectorizer, asks the
    vectorizer to load its vector database, and stores the model settings,
    the three system prompts and an empty conversation history.

    Args:
        api_key: OpenAI API key (required); passed to both the chat client
            and the vectorizer.
        model: Name of the chat model used for the completion calls.
        vector_db_path: Vector database path.
            NOTE(review): currently unused -- the line that joined it into
            ``data_path`` is commented out, so the relative path
            "knowledge_base.md" is always used. Confirm whether that is
            intentional.
    """
    self.client = OpenAI(api_key=api_key)
    self.vectorizer = KnowledgeBaseVectorizer(
        api_key=api_key,
        # Disabled variant that honoured vector_db_path:
        #data_path=os.path.join(vector_db_path, "knowledge_base.md")
        data_path="knowledge_base.md"
    )

    # Load the vector database up front (the original comment describes
    # this as preloading it into the vectorizer's cache).
    print("预加载向量数据库...")
    self.vectorizer.load_vector_database()

    # Model configuration used by generate_response_stream.
    self.model = model
    self.temperature = 0.2
    self.max_tokens = 2000

    # System prompt for answering student questions.
    self.system_prompt = """You are a helpful learning assistant specializing in road engineering.
Students can ask you questions with the following intents:
1. Clarification: Requests to confirm understanding of a concept, parameter, or calculation.
2. Instruction: Seeking step-by-step guidance for tasks or calculations.
3. Explanatory: Asking for the reasoning behind a method, parameter choice, or principle.
4. Information-seeking: Asking for where to find specific information in course materials.

You have access to a knowledge base of course materials. When answering questions:
1. Stick to the provided context from the knowledge base.
2. At the end of your response, provide students the 'title' & 'from' fields of the chunks that were used to answer the question. So that they can refer to the original source.
3. If the knowledge base doesn't contain relevant information, say so. Students can go to the teaching team for further assistance.
"""

    # System prompt for query rewriting. rewrite_query parses the reply by
    # looking for the SUMMARY / REWRITTEN_QUERY labels requested here.
    self.rewrite_prompt = """You are a query rewriting assistant. Your task is to provide a summary of the conversation history and then rewrite user queries based on conversation history to make them more clear and complete.

Please format your response as follows:
SUMMARY: [Brief summary of the conversation context. Include key points, user intent, and any relevant details]
REWRITTEN_QUERY: [The rewritten query that incorporates context]

Rules:
1. If there's relevant context from previous messages, incorporate it into the rewritten query
2. Make implicit references explicit
3. Maintain the original intent while adding clarity
4. If the query is already clear and complete, keep it as is
5. Always provide both SUMMARY and REWRITTEN_QUERY sections"""

    # System prompt for entity extraction; asks for a JSON array of strings.
    self.entity_extraction_prompt = """You are an expert in road engineering. Extract key entities from the given query.
Focus on:
1. Technical terms and jargon specific to road engineering
2. Formulas, equations, or mathematical concepts
3. Parameters, specifications, or measurements
4. Standards, methods, or procedures
5. Materials, equipment, or structures

Return the entities as a JSON array of strings. Only include the most important and specific entities."""

    # Conversation history: alternating {"role", "content"} dicts appended
    # by generate_response_stream after each completed answer.
    self.conversation_history = []
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def rewrite_query(self, query: str) -> Tuple[str, str]:
    """
    Rewrite the query using the conversation history and summarise that history.

    Sends the rewrite prompt, the last six history messages (each cut to
    200 characters) and the current query to the chat model, then parses
    the ``SUMMARY`` / ``REWRITTEN_QUERY`` sections out of the reply.

    Args:
        query: The user's original query.

    Returns:
        ``(summary, rewritten_query)``. The rewritten query falls back to
        ``query`` when the reply has no usable rewrite. If the API call or
        parsing raises, a generic summary (empty when there is no history)
        and the original query are returned instead of propagating.
    """
    # Labels as the model may spell them: "REWRITTEN_QUERY:",
    # "REWRITTEN QUERY:", optionally wrapped in markdown bold. The summary
    # must stop at ANY of these spellings, otherwise it swallows the
    # rewritten query and the rewrite is lost.
    rewritten_label = r'REWRITTEN[_\s]*QUERY\**\s*:'
    summary_pattern = (
        r'SUMMARY\**\s*:\s*\**\s*(.*?)(?=\**\s*' + rewritten_label + r'|$)'
    )
    rewritten_pattern = rewritten_label + r'\s*\**\s*(.*?)$'
    flags = re.DOTALL | re.IGNORECASE

    instruction = "Please provide summary and rewritten query following the specified format:"
    if self.conversation_history:
        # Only the most recent 3 turns (6 messages), truncated to keep the
        # rewrite prompt short.
        context = "Previous conversation:\n"
        for msg in self.conversation_history[-6:]:
            role = "User" if msg["role"] == "user" else "Assistant"
            text = msg["content"]
            if len(text) > 200:
                text = text[:200] + "..."
            context += f"{role}: {text}\n"
        user_content = f"{context}\n\nCurrent query: {query}\n\n{instruction}"
    else:
        # Without history the same output format is still requested.
        user_content = f"Current query: {query}\n\n{instruction}"

    messages = [
        {"role": "system", "content": self.rewrite_prompt},
        {"role": "user", "content": user_content},
    ]

    try:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.3,  # low temperature for consistent formatting
            max_tokens=2000
        )

        # The API may return no text at all; treat that as an empty reply.
        content = (response.choices[0].message.content or "").strip()

        summary = ""
        rewritten = query  # default when nothing usable is parsed

        summary_match = re.search(summary_pattern, content, flags)
        rewritten_match = re.search(rewritten_pattern, content, flags)

        if summary_match:
            summary = summary_match.group(1).strip()
        if rewritten_match:
            rewritten = rewritten_match.group(1).strip()

        # Line-based fallback for replies whose labels lack the colon the
        # regular expressions require (e.g. "SUMMARY - ...").
        if not summary and not rewritten_match:
            section = None
            summary_lines = []
            rewritten_lines = []

            for raw_line in content.split('\n'):
                line = raw_line.strip()
                upper = line.upper()
                if upper.startswith("SUMMARY"):
                    section = "summary"
                    # Case-insensitive label removal (a plain
                    # str.replace("SUMMARY:") misses "Summary:").
                    rest = re.sub(r'^SUMMARY[:\s]*', '', line, flags=re.IGNORECASE).strip()
                    if rest:
                        summary_lines.append(rest)
                elif upper.startswith(("REWRITTEN_QUERY", "REWRITTEN QUERY")):
                    section = "rewritten"
                    rest = re.sub(r'^REWRITTEN[_\s]*QUERY[:\s]*', '', line, flags=re.IGNORECASE).strip()
                    if rest:
                        rewritten_lines.append(rest)
                elif line and section == "summary":
                    summary_lines.append(line)
                elif line and section == "rewritten":
                    rewritten_lines.append(line)

            if summary_lines:
                summary = " ".join(summary_lines)
            if rewritten_lines:
                rewritten = " ".join(rewritten_lines)

        # Still nothing: use a generic summary when there is history.
        if not summary and self.conversation_history:
            summary = "继续之前的讨论"

        # An empty rewrite is useless for retrieval; keep the original.
        if not rewritten:
            rewritten = query

        print(f"Raw query: {query}")
        print(f"Chat history summary: {summary}")
        print(f"Rewrite query: {rewritten}")
        return summary, rewritten

    except Exception as e:
        # Best effort: a failed rewrite must not block answering.
        print(f"查询重写失败: {e}")
        simple_summary = "基于之前的对话内容" if self.conversation_history else ""
        return simple_summary, query
|
| 190 |
+
|
| 191 |
+
def extract_entities(self, original_query: str, summary: str, rewritten_query: str) -> List[str]:
    """
    Extract key entities (terms, formulas, parameters, ...) via the chat model.

    The original query, the history summary and the rewritten query (only
    when it differs from the original) are joined with " | " and sent with
    the entity-extraction prompt. The reply is expected to contain a JSON
    array; it is normalised to a list of unique, non-empty strings.

    Args:
        original_query: The user's original query.
        summary: Summary of the conversation history (may be empty).
        rewritten_query: The rewritten query text.

    Returns:
        A list of entity strings. If the reply contains no JSON array, the
        double-quoted substrings of the reply are used; if there are none,
        or the API call raises, ``simple_entity_extraction`` is applied to
        the combined text.
    """
    text_parts = []
    if original_query:
        text_parts.append(f"Original query: {original_query}")
    if summary:
        text_parts.append(f"Context summary: {summary}")
    if rewritten_query and rewritten_query != original_query:
        text_parts.append(f"Rewritten query: {rewritten_query}")

    combined_text = " | ".join(text_parts)

    messages = [
        {"role": "system", "content": self.entity_extraction_prompt},
        {"role": "user", "content": f"Text to extract entities from: {combined_text}\n\nExtract entities as JSON array:"}
    ]

    try:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.3,
            max_tokens=200
        )

        # The API may return no text; treat that as an empty reply.
        content = (response.choices[0].message.content or "").strip()

        # Candidate JSON texts, most specific first: the shortest bracketed
        # span (handles markdown fences), the widest bracketed span (handles
        # a "]" inside an entity), then the whole reply.
        candidates = [
            match.group()
            for match in (
                re.search(r'\[.*?\]', content, re.DOTALL),
                re.search(r'\[.*\]', content, re.DOTALL),
            )
            if match
        ]
        candidates.append(content)

        parsed = None
        for candidate in candidates:
            try:
                value = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            # Only a JSON array is a valid answer; a dict or scalar would
            # break the callers that iterate and join the entities.
            if isinstance(value, list):
                parsed = value
                break

        if parsed is None:
            # No parsable array: fall back to simple text processing.
            print(f"JSON解析失败，使用备用方法")
            raw_items = re.findall(r'"([^"]+)"', content)
            if not raw_items:
                return self.simple_entity_extraction(combined_text)
        else:
            raw_items = parsed

        # Keep scalar items only, as trimmed strings, without blanks or
        # case-insensitive duplicates, preserving the model's order.
        entities = []
        seen = set()
        for item in raw_items:
            if isinstance(item, bool) or not isinstance(item, (str, int, float)):
                continue
            text = str(item).strip()
            key = text.casefold()
            if text and key not in seen:
                seen.add(key)
                entities.append(text)

        if parsed is not None:
            print(f"Extracted entities: {entities}")
        return entities

    except Exception as e:
        # Best effort: fall back to keyword extraction on any failure.
        print(f"实体提取失败: {e}")
        return self.simple_entity_extraction(combined_text)
|
| 258 |
+
|
| 259 |
+
def simple_entity_extraction(self, query: str, max_entities: int = 5) -> List[str]:
    """
    Keyword-based fallback for entity extraction.

    Splits the text into word tokens (punctuation removed, hyphenated or
    slash-joined terms kept whole), drops stop words case-insensitively and
    returns the remaining terms in order of first appearance.

    A token is kept when it is longer than two characters, or when it has
    at least two characters and is an upper-case acronym or contains a
    digit (e.g. "R2", "AC").

    Args:
        query: Text to extract keywords from.
        max_entities: Maximum number of entities to return (default 5).

    Returns:
        Up to ``max_entities`` unique terms, in a deterministic order.
    """
    # Common English stop words plus the labels extract_entities prepends
    # ("Original query:", "Context summary:", "Rewritten query:").
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                  'of', 'with', 'by', 'from', 'what', 'how', 'when', 'where', 'why',
                  'is', 'are', 'was', 'were', 'been', 'be', 'have', 'has', 'had',
                  'original', 'query', 'context', 'summary', 'rewritten'}

    entities = []
    seen = set()
    # Tokenising with a regex strips punctuation, so "query:" matches the
    # stop word "query" and "subgrade?" becomes "subgrade".
    for token in re.findall(r'\w+(?:[-/]\w+)*', query):
        key = token.casefold()
        if key in stop_words or key in seen:
            continue
        looks_technical = token.isupper() or any(ch.isdigit() for ch in token)
        if len(token) > 2 or (looks_technical and len(token) > 1):
            seen.add(key)
            entities.append(token)

    # A list (not a set) keeps first-seen order, so the truncation is
    # stable across runs.
    return entities[:max(max_entities, 0)]
|
| 285 |
+
|
| 286 |
+
def enhanced_search(self, query: str, top_k: int = 3) -> Tuple[str, str, List[str], List[Tuple[Dict, float, Dict]]]:
    """
    Run the retrieval pipeline: rewrite the query, extract entities, search.

    Args:
        query: The user's original query.
        top_k: Number of results requested from the vectorizer.

    Returns:
        ``(summary, rewritten_query, entities, search_results)``.
    """
    # Step 1: rewrite the query and obtain the history summary.
    summary, rewritten_query = self.rewrite_query(query)

    # Step 2: entities come from the original query, summary and rewrite.
    entities = self.extract_entities(query, summary, rewritten_query)

    # Step 3: entity-based search when entities exist; otherwise search
    # with the whole rewritten query.
    if not entities:
        print("未提取到实体，使用完整查询搜索")
        hits = self.vectorizer.search_similar(
            rewritten_query,
            top_k=top_k,
            title_weight=0.4,
            content_weight=0.3,
            full_weight=0.3
        )
    else:
        hits = self.vectorizer.search_with_entities_optimized(entities, top_k)

    return summary, rewritten_query, entities, hits
|
| 319 |
+
|
| 320 |
+
def format_context(self, search_results: List[Tuple[Dict, float, Dict]]) -> str:
    """
    Render search results as the context block given to the model.

    Args:
        search_results: ``(entry, score, details)`` tuples; each entry must
            have 'title', 'source' and 'content' keys.

    Returns:
        An empty string when there are no results; otherwise a header
        followed by one Title/From/Content section per result, separated
        by "---" lines. The entry id and scores are not included.
    """
    if not search_results:
        return ""

    sections = [
        f"Title: {entry['title']}\n"
        f"From: {entry['source']}\n"
        f"Content: {entry['content']}\n"
        for entry, _score, _details in search_results
    ]
    return "RELEVANT KNOWLEDGE BASE CONTENT:\n" + "\n---\n".join(sections)
|
| 345 |
+
|
| 346 |
+
def build_messages(self, query: str, context: str) -> List[Dict[str, str]]:
    """
    Assemble the chat messages for the answer-generation call.

    Args:
        query: The user's query.
        context: Knowledge-base context text; may be empty.

    Returns:
        The system prompt, then the last 10 history messages (5 turns),
        then one user message. When ``context`` is non-empty that message
        is the context followed by "USER QUESTION: <query>"; otherwise it
        is the bare query.
    """
    if context:
        user_content = f"{context}\n\nUSER QUESTION: {query}"
    else:
        user_content = query

    return [
        {"role": "system", "content": self.system_prompt},
        *self.conversation_history[-10:],
        {"role": "user", "content": user_content},
    ]
|
| 373 |
+
|
| 374 |
+
def generate_response_stream(self, query: str) -> Generator[str, None, None]:
    """
    Answer a query, yielding the response as a stream of text pieces.

    The first piece yielded is an analysis header (query, history summary,
    rewritten query, entities, matched sources). The following pieces are
    the model's answer as it streams in. After the stream ends, the query
    and the full answer are appended to the conversation history.

    Retrieval and message building run before the ``try`` block, so their
    exceptions propagate to the caller. Errors raised while calling or
    consuming the chat stream are yielded as a final error message, and the
    history is left unchanged.

    Args:
        query: The user's query.

    Yields:
        Text fragments of the response.
    """
    # 1. Retrieval: rewrite, entity extraction, vector search.
    print("正在处理查询...")
    summary, rewritten_query, entities, search_results = self.enhanced_search(query)

    # 2. Format the hits as model context.
    context = self.format_context(search_results)

    # 3. The model sees the ORIGINAL query plus the retrieved context.
    messages = self.build_messages(query, context)

    # 4. Streamed generation.
    try:
        stream = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            stream=True
        )

        # Header shown to the user before the answer.
        header = ["\n**Query Analysis:**\n", f"- Query: {query}\n"]
        if summary:
            header.append(f"- Summary of history: {summary}\n")
        if rewritten_query != query:
            header.append(f"- Rewrite query: {rewritten_query}\n")
        # str() guards against non-string entities coming back from the
        # model's JSON, which would make str.join raise TypeError.
        entity_text = ', '.join(str(e) for e in entities) if entities else 'No specific entities extracted'
        header.append(f"- Key entities: {entity_text}\n")

        if search_results:
            header.append("\n**Relevant Sources:**\n")
            for entry, combined_score, _details in search_results:
                # The user-facing list includes the entry id and score.
                header.append(f"- [{entry['id']}] {entry['title']} (Relevance: {combined_score:.3f})\n")
            header.append("\n**Response:**\n")
        else:
            header.append("\n**Response:** (No relevant knowledge base content found, answering based on general knowledge)\n")

        # Cache information (debugging aid).
        cache_info = self.vectorizer.get_cache_info()
        if cache_info['is_cached']:
            header.append(f"The vector db has been cached, containing {cache_info['cache_size']} entries\n\n")

        yield "".join(header)

        # Relay the answer and collect it for the history.
        pieces = []
        for chunk in stream:
            # Some stream chunks carry no choices; indexing them would
            # raise IndexError and abort the answer.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content
            if delta is not None:
                pieces.append(delta)
                yield delta

        # Record the turn only after the stream completed.
        self.conversation_history.append({"role": "user", "content": query})
        self.conversation_history.append({"role": "assistant", "content": "".join(pieces)})

    except Exception as e:
        yield f"\n\n错误：生成响应时出现问题 - {str(e)}"
|
| 446 |
+
|
| 447 |
+
def generate_response(self, query: str) -> str:
    """
    Produce the complete response in one piece (non-streaming).

    Drains ``generate_response_stream`` and concatenates everything it
    yields, so the result includes the analysis header as well as the
    model's answer.

    Args:
        query: The user's query.

    Returns:
        The full response text.
    """
    return "".join(self.generate_response_stream(query))
|
| 461 |
+
|
| 462 |
+
def clear_history(self):
    """Discard all stored conversation turns and print a confirmation."""
    # Rebind to a fresh list rather than clearing in place, so any list
    # object handed out earlier is left untouched.
    self.conversation_history = list()
    print("对话历史已清除")
|
| 466 |
+
|
| 467 |
+
def clear_vector_cache(self):
    """Ask the vectorizer to drop its cache, then print a confirmation."""
    vectorizer = self.vectorizer
    vectorizer.clear_cache()
    print("向量数据库缓存已清除")
|
| 471 |
+
|
| 472 |
+
def reload_vector_database(self):
    """Reload the vector database through the vectorizer with ``force_reload=True``."""
    print("重新加载向量数据库...")
    vectorizer = self.vectorizer
    vectorizer.load_vector_database(force_reload=True)
    print("向量数据库重新加载完成")
|
| 477 |
+
|
| 478 |
+
def get_system_status(self) -> Dict:
    """
    Report the assistant's current state.

    Returns:
        A dict with the model name ('model'), the number of completed
        conversation turns ('conversation_turns', two history messages per
        turn), the vectorizer's cache info ('vector_cache') and the current
        local time in ISO format ('last_update').
    """
    vector_cache = self.vectorizer.get_cache_info()
    completed_turns = len(self.conversation_history) // 2
    status = {
        'model': self.model,
        'conversation_turns': completed_turns,
        'vector_cache': vector_cache,
        'last_update': datetime.now().isoformat(),
    }
    return status
|
| 492 |
+
|
| 493 |
+
def save_conversation(self, filepath: str = None):
    """
    Write the conversation history to a JSON file.

    The file holds a timestamp, the model name, the result of
    ``get_system_status()`` and the full history, written as UTF-8 with
    non-ASCII characters kept as-is.

    Args:
        filepath: Destination path. When ``None``, a name of the form
            ``conversation_<YYYYmmdd_HHMMSS>.json`` in the current working
            directory is used.
    """
    if filepath is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = f"conversation_{stamp}.json"

    # Build the payload before opening the file, so a failure here does
    # not leave an empty file behind.
    payload = {
        "timestamp": datetime.now().isoformat(),
        "model": self.model,
        "system_status": self.get_system_status(),
        "history": self.conversation_history,
    }
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)

    with open(filepath, 'w', encoding='utf-8') as out_file:
        out_file.write(serialized)

    print(f"对话已保存到: {filepath}")
|
| 515 |
+
|
knowledge_base.md
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 00-01-00 What this unit is about
|
| 2 |
+
**source** unit information on Moodle
|
| 3 |
+
**content** This unit introduces the fundamentals of road engineering theory and practice. It examines a number of issues related to the planning, geometric and structural design of roads, with due consideration to relevant economic, social and environmental aspects.
|
| 4 |
+
|
| 5 |
+
# 00-02-00 Expected learning outcomes
|
| 6 |
+
**source** unit information on Moodle
|
| 7 |
+
**content** On successful completion of this unit, you should be able to:
|
| 8 |
+
- 01 Design road geometry with due consideration to horizontal and vertical alignment, cross-section profile, earthworks and cost implications.
|
| 9 |
+
- 02 Assess different components of a road project, within the context of a safe systems approach to design, based on economic, social and environmental considerations.
|
| 10 |
+
- 03 Design flexible pavements giving consideration to economic, social and environmental factors.
|
| 11 |
+
|
| 12 |
+
# 00-03-00 Checklist of important items you should do
|
| 13 |
+
**source** learning on Moodle - getting started
|
| 14 |
+
**content** All listed items are on Moodle.
|
| 15 |
+
- The recommended readings, located in the "Additional information and resources" page (under "Learning" tag).
|
| 16 |
+
- The forums, located in the "Forums" page.
|
| 17 |
+
- Important assessments, located in the "Assessments" page.
|
| 18 |
+
- Link for online workshops, located in the "Getting Started" page.
|
| 19 |
+
- The content for each week, located in each week's pages.
|
| 20 |
+
- Sign up with a group for your project assignment. See the "Week 13 - Project Assignment" page.
|
| 21 |
+
|
| 22 |
+
# 00-04-00 Timetable and location for class
|
| 23 |
+
**source** learning on Moodle - getting started
|
| 24 |
+
**content** - Workshop: Monday 7pm-9pm, Learning and Teaching Building, G81 (main) and G56 (overflow)
|
| 25 |
+
- Practice Session 1, G29/33 B72, Wednesday 10am-12pm
|
| 26 |
+
- Practice Session 2, G29/33 B72, Wednesday 1pm-3pm
|
| 27 |
+
Please attend the practice session allocated in your timetable. Attendance will be marked.
|
| 28 |
+
|
| 29 |
+
# 00-05-00 Online link for class
|
| 30 |
+
**source** learning on Moodle - getting started
|
| 31 |
+
**content** - Workshop: Please click this URL to start or join: https://monash.zoom.us/j/82685794107?pwd=hHFu5jrl5YBAKQS60FU5bsMdBONox5.1. Or, go to https://monash.zoom.us/join and enter meeting ID: 826 8579 4107 and passcode: 209261
|
| 32 |
+
- Practice Session: In-person only.
|
| 33 |
+
|
| 34 |
+
# 00-06-00 Overview for assessment
|
| 35 |
+
**source** learning on Moodle - getting started
|
| 36 |
+
**content** This unit has two general assessment tasks: continuous assessment and final assessment.
|
| 37 |
+
For continuous assessment, there are three components, 60%:
|
| 38 |
+
- Assignment: Road design project, 45%
|
| 39 |
+
- Geometric design (stage 1, stage 2), 10%+25%
|
| 40 |
+
- Pavement design (stage 3), 10%
|
| 41 |
+
- Mid-term quiz, 10%
|
| 42 |
+
- Participation, 5%
|
| 43 |
+
- Attend practice classes, 2% (TA will mark your attendance)
|
| 44 |
+
- Participate in activities, 2% (do pre-class quiz and contribute to your group works)
|
| 45 |
+
- Active engagement, 1% (e.g. effectively help resolve questions/problems in discussion forums)
|
| 46 |
+
|
| 47 |
+
For final assessment, there is a final exam, 40%
|
| 48 |
+
|
| 49 |
+
# 00-07-00 Unit schedule and class info
|
| 50 |
+
**source** learning on Moodle - getting started
|
| 51 |
+
**content** The following table shows the schedule of this unit. Note that the schedule of guest lectures is subject to change.
|
| 52 |
+
| Week | Topic | Notes | Milestones |
|
| 53 |
+
|---|---|---|---|
|
| 54 |
+
| Week 1 | Road System | A guest lecture will be given on Safe System Approach by Mr. Ross Thompson from TraffixGroup | Team be formed by Week 1 |
|
| 55 |
+
| Week 2 | Road planning | | |
|
| 56 |
+
| Week 3 | Road Design Considerations | A guest lecture will be given on "North East Link Project" by colleagues from SMEC. <br>A prac activity will be led by colleagues from SMEC on "Multi-criteria analysis" | Registration close for “Future Road” competition by Traffic Accident Commission, Victoria |
|
| 57 |
+
| Week 4 | Road Geometry Design 1 | | Road Design Project<br>Stage 1 submission DUE |
|
| 58 |
+
| Week 5 | Road Geometry Design 2 | A guest lecture will be given on “geometric design” by Mr. Malcom Mak from Australia Road Research Board (ARRB) | |
|
| 59 |
+
| Week 6 | Road Geometry Design 3 | | |
|
| 60 |
+
| Week 7 | Road Geometry Design 4 | | |
|
| 61 |
+
| Week 8 | Road Geometry Design 5 | A guest lecture and a prac activity will be led by colleagues from SMEC on “detailed geometric design” | |
|
| 62 |
+
| Week 9 | Road Pavement Design 1 | A guest lecture will be given on “pavement materials” by Dr.Tareq and Dr. Shan from Department Transport Planning, Victoria | Road Design Project<br>Stage 2 submission DUE |
|
| 63 |
+
| | Semester Break | | |
|
| 64 |
+
| Week 10 | Road Pavement Design 2 | A guest lecture will be given on weigh-in-motion technology and its application in pavement design | Mid-term Quiz during workshop session |
|
| 65 |
+
| Week 11 | Road Pavement Design 3 | A guest lecture will be given on the Austroads design methods by Prof. Jayantha Kodikara from SPARC Hub Monash University | |
|
| 66 |
+
| Week 12 | Unit Summary | | Road Design Project<br>Stage 3 submission DUE |
|
| 67 |
+
|
| 68 |
+
# 00-08-00 Unit contacts
|
| 69 |
+
**source** learning on Moodle - getting started
|
| 70 |
+
**content** - For technical matters such as knowledge points and practice questions, please use the appropriate forum.
|
| 71 |
+
- For administrative matters such as grouping and Moodle operations, please reach out to Akbar (akbar.rizqiansyah@monash.edu)
|
| 72 |
+
- For “Future Road” competition and mentoring, please contact Nan (nan.zheng@monash.edu)
|
| 73 |
+
- For other inquiries, please contact either Akbar and/or Nan.
|
| 74 |
+
|
| 75 |
+
# 01-01-00 Learning objectives of week 1
|
| 76 |
+
**source** learning on Moodle - week 1 road system
|
| 77 |
+
**content** - Outline some historical developments in road design
|
| 78 |
+
- Understand the role of road design and classification of roads (road and street hierarchies)
|
| 79 |
+
- Discuss road design considerations
|
| 80 |
+
- Understand how the Safe System approach influences road design
|
| 81 |
+
|
| 82 |
+
# 01-02-00 Modern road design
|
| 83 |
+
**source** W 1.1 Introduction to Road Engineering, page 6
|
| 84 |
+
**content** The process of designing and managing the safe and efficient movement of people and goods on the road network. The key points are to design and manage the road network. It is a tradeoff between the needs to develop a network that is safe but also efficient. These can often be competing demands, particularly when we discuss travel speed. We also need to consider the movement of not only people but also a range of goods on the road network. In particular, we need to consider the range of vehicles, from bicycles, motorcycles and private cars through to large vehicles.
|
| 85 |
+
|
| 86 |
+
# 01-03-00 Road design process
|
| 87 |
+
**source** W 1.1 Introduction to Road Engineering, page 7
|
| 88 |
+
**content** Road design is only one component of the transportation engineering system. From diagram [image/01-03-00 Road design process.jpg], the focuses of this unit, mostly, are Geometric and Pavements Design. We will touch a little of Drainage and Structures. We will also briefly consider some of the other elements in this framework.
|
| 89 |
+
|
| 90 |
+
# 01-04-00 Benefits of a good road network
|
| 91 |
+
**source** W 1.1 Introduction to Road Engineering, page 8
|
| 92 |
+
**content** Why do we need to plan our road network? There are many benefits associated with developing a road network that operates efficiently and allows people and goods to move freely:
|
| 93 |
+
- The road network can provide easy access to commercial, retail and recreational activities.
|
| 94 |
+
- This can stimulate economic growth.
|
| 95 |
+
- When designed well, the road network can improve safety, access and efficiency.
|
| 96 |
+
- Thus to improve road freight productivity.
|
| 97 |
+
- And improve public transport operating conditions.
|
| 98 |
+
|
| 99 |
+
# 01-05-00 Road design considerations
|
| 100 |
+
**source** W 1.1 Introduction to Road Engineering, page 9
|
| 101 |
+
**content** In order to develop a good road network, we need to make sure we take into account key road design considerations.
|
| 102 |
+
- Three key things of road design: safety, efficiency and accessibility issues for road users. We need to make tradeoffs between these three sometimes competing needs. For example, it may be more efficient to let vehicles travel at higher speed. This can have safety issues because crashes that occur at higher speed are likely to cause more severe injuries. At the same time, it may not be practical to provide access to adjacent land use if travel speed is set too high.
|
| 103 |
+
- Performance and types of vehicles (traffic). Design not just for cars, but also for large truck and small or more vulnerable road users such as bicycles.
|
| 104 |
+
- Behaviour of road users. To understand human factors and how they need to be considered.
|
| 105 |
+
- Geometry of the road and limitations (vertical and horizontal profile of the road).
|
| 106 |
+
- Impact of road on environment
|
| 107 |
+
- Economics – road cost/efficiency, economic benefits associated with construction and management
|
| 108 |
+
|
| 109 |
+
# 01-06-00 Road context in Road traffic design system
|
| 110 |
+
**source** W 1.2 The Road Traffic Design System, page 5
|
| 111 |
+
**content** A road is not isolated from the rest of the world. Often, road design decisions are influenced by a range of other factors including political, institutional, and planning considerations. These factors need to be considered in almost every road design project.
|
| 112 |
+
When we look at road design, some of the key considerations include:
|
| 113 |
+
- How much is spent
|
| 114 |
+
- What gets done, and where,
|
| 115 |
+
- Who designs and builds the road project.
|
| 116 |
+
|
| 117 |
+
These considerations are determined by:
|
| 118 |
+
- Political process (safety, environmental and social impacts & election)
|
| 119 |
+
- Institutional factors (set up by politicians to build and design roads)
|
| 120 |
+
- Planning process (not designed to give one result that must be followed)
|
| 121 |
+
|
| 122 |
+
These factors may not be very technical. But in practice, senior engineers might spend much of their time working on this.
|
| 123 |
+
|
| 124 |
+
# 01-07-00 Classification of roads
|
| 125 |
+
**source** W 1.2 The Road Traffic Design System, page 6
|
| 126 |
+
**content** Road networks fulfill four basic functions:
|
| 127 |
+
- Caters for moving passenger and freight traffic,
|
| 128 |
+
- Caters for parked vehicles,
|
| 129 |
+
- Caters for pedestrians and non-vehicular traffic and
|
| 130 |
+
- Allows development and access to adjacent land use.
|
| 131 |
+
|
| 132 |
+
These functions are inconsistent and competing. For example, a road suited to carrying large volumes of traffic may not be suited to pedestrians or to providing convenient access. Thus, it is necessary to have various types of road, each serving a predominant function.
|
| 133 |
+
|
| 134 |
+
# 01-08-00 Road functions
|
| 135 |
+
**source** W 1.2 The Road Traffic Design System, page 7
|
| 136 |
+
**content** Roads have two main functions: movement and access.
|
| 137 |
+
Movement involves the movement of people or goods.
|
| 138 |
+
Access sometimes refers to activity that looks for other functions that the road might facilitate such as providing access to shops, houses, parking, or other facilities in the road reserve.
|
| 139 |
+
|
| 140 |
+
# 01-09-00 Road hierarchies and functional classification
|
| 141 |
+
**source** W 1.2 The Road Traffic Design System, page 8-9
|
| 142 |
+
**content** Based on the two basic functions of roads, Movement & Access, the diagram [image/01-09-00 Road hierarchies and functional classification.jpg] shows the classification based on movement vs. access function. Here is a more detailed description of each of the different road types.
|
| 143 |
+
- Freeway: Those roads with full access control and grade-separated intersections, whose primary function is to service large traffic movements.
|
| 144 |
+
- Primary Arterials: Those arterial roads whose function is to form the principal avenue of communication for metropolitan traffic movements not catered for by freeways.
|
| 145 |
+
- Secondary Arterials: Those roads which supplement the primary arterial roads in providing for through traffic movement, to an individually determined limit that is sensitive to both roadway characteristics and abutting land uses.
|
| 146 |
+
- Collector Roads: Those non-arterial roads which distribute traffic between the arterial roads and the local street system, which provide local connection between arterial roads, and which provide access to abutting property.
|
| 147 |
+
- Local Access Streets: Those streets not being arterials or collectors whose main function is to provide access to abutting property.
|
| 148 |
+
|
| 149 |
+
# 01-10-00 Other road classifications
|
| 150 |
+
**source** W 1.2 The Road Traffic Design System, page 11
|
| 151 |
+
**content** Two other considerations for classifying roads:
|
| 152 |
+
- Administration (who is responsible for the road)
|
| 153 |
+
- State Government (major roads), e.g., major roads/freeways in Victoria state
|
| 154 |
+
- Local Government (local roads) funded and maintained by local councils
|
| 155 |
+
- Funding
|
| 156 |
+
- Federal funds portion of roads of “major economic benefit” (major projects, e.g., a new freeway)
|
| 157 |
+
- State covers the rest of funding of major roads
|
| 158 |
+
- Local government / developers fund local roads
|
| 159 |
+
|
| 160 |
+
# 01-11-00 brief introduction of design components
|
| 161 |
+
**source** W 1.2 The Road Traffic Design System, page 13
|
| 162 |
+
**content** There are three components in road design: humans, vehicles, and road itself. The characteristics impact the design. There are two critical issues: Range of characteristics & The operating (traffic) environment.
|
| 163 |
+
Looking at the range of characteristics, road design involves standard input values, such as driver eye height, vehicle performance, speed, and reaction time. These are given in design manuals. However, in reality, none of the characteristics is a single fixed value; all of them are distributed across ranges of values. Clearly, we should not design for the average condition, which would fail half of the population. On the other hand, we typically do not design for 100% of the population, which would cause overdesign. For instance, for driver eye height, it is very uncommon for someone to drive a low sports car at a very high speed and have an incredibly short reaction time and braking distance. So, it is not necessary to use all extreme values. Therefore, use 85th percentile values in general design.
|
| 164 |
+
The other system input is to realise that the road-vehicle-driver system must be capable of operating in a range of environmental conditions. The environment in which traffic movement occurs has an important effect on both the design and operation of the traffic system. It includes a range of physical, traffic, and social environments. The following table shows the factors:
|
| 165 |
+
| Operating environment | Factors |
|
| 166 |
+
|---|---|
|
| 167 |
+
| Physical environment | day and night (light conditions), the terrain, weather conditions, abutting development (construction cost and standard to use) |
|
| 168 |
+
| Traffic environment | traffic flow (daily and hourly), directional split, traffic composition, seasonality of traffic demands (e.g., recreational locations) |
|
| 169 |
+
| Social environment | driver's understanding of regulations (e.g., the system should run under consistent rules), community attitudes to driver responsibility, degree of enforcement by police |
|
| 170 |
+
|
| 171 |
+
# 01-12-00 brief overview of design process
|
| 172 |
+
**source** W 1.2 The Road Traffic Design System, page 17
|
| 173 |
+
**content** The objectives of design process are: Safety, Efficiency, and Amenity. We can call it SEA.
|
| 174 |
+
Safety: involves providing a safe facility for both users and society as a whole. It should always be checked
|
| 175 |
+
Efficiency: related to the cost of providing, maintaining, and using the road. E.g., if we were to design a bypass of a rural town, the efficiency objective must account for the cost of building and maintaining the road, cost savings, travel time, running costs, and the social & environmental costs of the project, so that the project makes efficient use of resources that could otherwise be used elsewhere.
|
| 176 |
+
Amenity: for both road users and non-road users.
|
| 177 |
+
|
| 178 |
+
# 01-13-00 brief overview of design stages
|
| 179 |
+
**source** W 1.2 The Road Traffic Design System, page 18
|
| 180 |
+
**content** Here is a simple structure of a design project.
|
| 181 |
+
- Conceptual design (Phase 0)
|
| 182 |
+
- Low details
|
| 183 |
+
- Large number of alternatives
|
| 184 |
+
- e.g., if we consider a highway bypass of a rural town, alternatives with horizontal alignment and general design parameters such as design speed will be set, and each option will be evaluated at a general level
|
| 185 |
+
- Preliminary design (Phase 1)
|
| 186 |
+
- Smaller number of options (3-6)
|
| 187 |
+
- More detailed level
|
| 188 |
+
- e.g., for the rural bypass, it may include horizontal and vertical profiles, road cross section, and others... Each option will be evaluated to assess economic, social, and environmental viability, and the recommendation may be for a small number of alternatives, or just one, to go to the next stage, the detailed design.
|
| 189 |
+
- Detailed/Final design (Phase 2)
|
| 190 |
+
- Suitable level of design details for construction purposes
|
| 191 |
+
- e.g., for the bypass example, the geometry, cross section, and all, will be finalised. All elements will be ready for construction.
|
| 192 |
+
|
| 193 |
+
# 01-14-00 brief introduction of design elements
|
| 194 |
+
**source** W 1.2 The Road Traffic Design System, page 20
|
| 195 |
+
**content** design elements are the components of the physical road that require design. Here is the list: horizontal & vertical geometry, road reserve, traffic lanes, shoulders, roadside, road furniture, intersections, pavement, drainage.
|
| 196 |
+
|
| 197 |
+
# 01-15-00 National road safety strategy
|
| 198 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 8
|
| 199 |
+
**content** National Road Safety Strategy 2011/2020
|
| 200 |
+
- No person should be killed or seriously injured on Australia’s roads
|
| 201 |
+
- Reduce the annual number of road crash fatalities by at least 30 per cent
|
| 202 |
+
- Reduce the annual number of serious road crash injuries by at least 30 per cent.
|
| 203 |
+
- The strategy embraces the Safe System approach to road safety improvement
|
| 204 |
+
|
| 205 |
+
# 01-16-00 brief introduction of Safe system approach
|
| 206 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 9
|
| 207 |
+
**content** - Derived from Sweden’s ‘Vision Zero’ and the Dutch ‘Sustainable Safety’ approaches
|
| 208 |
+
- Crashes will continue to occur in spite of best efforts to prevent them (human error)
|
| 209 |
+
- Aims to eliminate fatal crashes and reduce serious injury crashes
|
| 210 |
+
- Safety interventions based on the best available scientific evidence of what is effective
|
| 211 |
+
- A shared responsibility:
|
| 212 |
+
- Road users responsible for complying with system rules
|
| 213 |
+
- System designers and operators responsible for the road-transport system
|
| 214 |
+
|
| 215 |
+
# 01-17-00 Safe system principles
|
| 216 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 10
|
| 217 |
+
**content** When crashes do occur, crash energies need to be reduced to levels that minimize the probability of death and serious injury
|
| 218 |
+
- Provision of a safe, crashworthy system that is forgiving of human error and accommodates vulnerability to serious injury
|
| 219 |
+
- Recognition that all aspects of the transport system work together
|
| 220 |
+
- Manage the road infrastructure, speeds, vehicles and road users and the interactions between these components to achieve safe mobility
|
| 221 |
+
|
| 222 |
+
# 01-18-00 key ideals of the Safe System approach
|
| 223 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 11
|
| 224 |
+
**content** The Safe System approach aims to create a forgiving road system based on four key ideals:
|
| 225 |
+
- People make mistakes – and some crashes are inevitable.
|
| 226 |
+
- People are vulnerable – Our bodies have a limited ability to withstand crash forces without being seriously injured or killed.
|
| 227 |
+
- We need to share responsibility – System designers and people who use the roads must all share responsibility for creating a road system where crash forces do not result in death or serious injury.
|
| 228 |
+
- We need to strengthen all parts of the system – We need to improve the safety of all parts of the system – roads and roadsides, speeds, vehicles, and road use so that if one part fails, other parts will still protect the people involved.
|
| 229 |
+
|
| 230 |
+
# 01-19-00 Components of the Safe System
|
| 231 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 16
|
| 232 |
+
**content** As shown in the diagram [image/01-19-00 Components of the Safe System.jpg], there are four components of the "SAFE SYSTEM":
|
| 233 |
+
- Safe roads – that are predictable and forgiving of mistakes. They are self-explaining in that their design encourages safe travel speeds (road factor).
|
| 234 |
+
- Safe speeds – travel speeds suit the function and level of safety of the road. People understand and comply with the speed limits and drive to the conditions (speed factor).
|
| 235 |
+
- Safe vehicles – that prevent crashes and protect road users, including pedestrians and cyclists, in the event of a crash (vehicle factor).
|
| 236 |
+
- Safe road use – road users that are skilled and competent, alert and unimpaired. They comply with road rules, take steps to improve safety, and demand and expect safety improvements (driver or rider factor).
|
| 237 |
+
|
| 238 |
+
# 01-20-00 Design safe system must consider vulnerable road users
|
| 239 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 17
|
| 240 |
+
**content** A transport system which aspires to achieve Safe System objectives cannot ignore: The vulnerability of some road user groups (a major determinant of injury outcome)
|
| 241 |
+
Any safe system needs to be designed to protect our most vulnerable: Pedestrians, Cyclists, Motorcyclists, Children, The elderly.
|
| 242 |
+
|
| 243 |
+
# 01-21-00 Speed factor
|
| 244 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 19 & 20
|
| 245 |
+
**content** In a 60km/h zone, travelling at:
|
| 246 |
+
- 65 km/h, you are twice as likely to have a serious crash than if you drive at 60 km/h
|
| 247 |
+
- 70 km/h, four times
|
| 248 |
+
- 75 km/h, ten times
|
| 249 |
+
- 80 km/h, 32 times
|
| 250 |
+
Vehicle speed is the crucial safety issue for vulnerable road users: Speed is an even more powerful determinant of injury severity in the event of a collision.
|
| 251 |
+
|
| 252 |
+
# 01-22-00 Speed management
|
| 253 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 21
|
| 254 |
+
**content** Historically, the emphasis has been on mobility
|
| 255 |
+
- that is, we strive for the highest mobility until safety is compromised “too much” (may impact the level of road)
|
| 256 |
+
- current speed limits and road design tend to favour high mobility often at the expense of safety
|
| 257 |
+
- is this the balance we want?
|
| 258 |
+
In general, Australian urban speed limits are 10 km/h higher than in other Western countries
|
| 259 |
+
- However, speeds should be at levels that do not exceed human tolerance to violent forces
|
| 260 |
+
|
| 261 |
+
# 01-23-00 Speed choice factors
|
| 262 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 22
|
| 263 |
+
**content** Factors that impact the choice of driver speed include: Driver factors, road & traffic factors, speed zone, crash & injury risk, police enforcement, education & publicity & promotion, and vehicle factors.
|
| 264 |
+
|
| 265 |
+
# 01-24-00 Speed limits
|
| 266 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 23
|
| 267 |
+
**content** - Speed limits, arguably, have the single largest influence on driver/rider choice of speed.
|
| 268 |
+
- Studies consistently show that crash incidence or crash severity decline whenever speed limits have been reduced
|
| 269 |
+
- After 50 km/h introduced in local streets in Victoria, pedestrian casualty crashes fell by 20% and pedestrian serious casualty crashes by 45%
|
| 270 |
+
- Freeway speed limit reduced from 110km/h to 100km/h (Victoria, 1989) and resulted in a 19% drop in casualty crash rate on freeways
|
| 271 |
+
- Negligible time savings by driving faster
|
| 272 |
+
|
| 273 |
+
# 01-25-00 Driver or Rider factors
|
| 274 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 24
|
| 275 |
+
**content** Drivers and riders:
|
| 276 |
+
- Are human and, therefore, make errors
|
| 277 |
+
- Consistently over-estimate what is a “safe” speed (poor risk perception)
|
| 278 |
+
- Are influenced by perceptions about risks of being “caught”
|
| 279 |
+
- Are concerned more about crash than injury risk
|
| 280 |
+
- Drive faster due to others around them – thus, a contagious effect
|
| 281 |
+
- Young drivers (less skilled) with young passengers travel faster than when alone
|
| 282 |
+
- Knowledge and attitudes about speed only partly determine behaviour – other motives are often at play
|
| 283 |
+
|
| 284 |
+
# 01-26-00 The licensing system: entry to exit
|
| 285 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 25
|
| 286 |
+
**content** Licensing systems are a key foundation of Australia's road transport system for managing driver/rider factors
|
| 287 |
+
Licensing systems can:
|
| 288 |
+
- Establish/assess entry level ability/skill
|
| 289 |
+
- Monitor maintenance of those skills
|
| 290 |
+
- Decide on continued fitness to drive
|
| 291 |
+
- Promote overall safe mobility of drivers
|
| 292 |
+
- Regulate who can drive
|
| 293 |
+
- Manage illegal driving (combined with enforcement, demerit and sanctions)
|
| 294 |
+
|
| 295 |
+
# 01-27-00 Vehicle factors
|
| 296 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 26
|
| 297 |
+
**content** Even the best vehicles cannot protect their occupants in many common crash types.
|
| 298 |
+
“Safe” impact speeds generally fall below 30 to 50 km/h for:
|
| 299 |
+
- side-impacts at intersections
|
| 300 |
+
- collisions with trees and poles
|
| 301 |
+
- pedestrians and other vulnerable road users
|
| 302 |
+
- crashes between incompatible vehicle types
|
| 303 |
+
Low short-term potential for raising the level of “safe” impact speeds through improvements in crashworthiness.
|
| 304 |
+
It is probable that drivers over-estimate the protective capabilities of their vehicles.
|
| 305 |
+
|
| 306 |
+
# 01-28-00 Threshold impact speeds (human tolerance to violence)
|
| 307 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 27
|
| 308 |
+
**content** Vehicles cannot protect us in high-speed crashes. The following table provides the threshold impact speeds (human tolerance to violence) under different conditions:
|
| 309 |
+
| Scenarios | Speed range |
|
| 310 |
+
|---|---|
|
| 311 |
+
| car/pedestrian | 20-30 km/h |
|
| 312 |
+
| car/motorcyclist | 20-30 km/h |
|
| 313 |
+
| car/tree or pole | 30-40 km/h |
|
| 314 |
+
| car/car (side-impact) | 50 km/h |
|
| 315 |
+
| car/car (head-on) | 70 km/h |
|
| 316 |
+
|
| 317 |
+
Estimated that if all drivers were in safer vehicles, we could achieve a 40% reduction in fatal crashes.
|
| 318 |
+
|
| 319 |
+
# 01-29-00 Road factors
|
| 320 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 30 & 31
|
| 321 |
+
**content** Road features play a vital role in a driver/rider’s speed choice:
|
| 322 |
+
- Roadsides and the presence of trees, poles, etc.
|
| 323 |
+
- Intersections, and their design and operational features
|
| 324 |
+
- Alignment of roads and their design speeds
|
| 325 |
+
- Road surface condition
|
| 326 |
+
- "Road appearance" conditions: road width, number of lanes, roadside activity, divided/undivided, alignment (vertical and horizontal)
|
| 327 |
+
|
| 328 |
+
We can also involve the Traffic conditions (e.g., presence of other road users, general congestion or lack of congestion).
|
| 329 |
+
|
| 330 |
+
# 01-30-00 Roads and Roadsides
|
| 331 |
+
**source** W 1.3 The Safe System Approach to Road Safety, page 32
|
| 332 |
+
**content** Ways of addressing high-priority crash types (high-priority crash types typically involve incidents with a high risk of serious injury or death. Examples include fatal pedestrian crashes, single-vehicle frontal crashes with objects, and front-to-front or front-to-side collisions):
|
| 333 |
+
- Intersection design:
|
| 334 |
+
- Adequate sight distance and speed reduction
|
| 335 |
+
- Well designed roundabouts: low entry angle, good sight zone
|
| 336 |
+
- Separation and safer roadsides:
|
| 337 |
+
- Separate motorcycle/bicycle lanes, footpaths
|
| 338 |
+
- Use of good barrier systems
|
| 339 |
+
- Major grade separation of railway crossings on arterials
|
| 340 |
+
|
query_log.csv
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
id,timestamp,search_info,query_and_response,thumb_feedback
|
| 2 |
+
1,2025-06-16 06:03:20,"
|
| 3 |
+
**Query Analysis:**
|
| 4 |
+
- Query: what is safe system
|
| 5 |
+
- Summary of history: The user is asking for the definition or explanation of the term ""safe system,"" but has not provided additional context or details about the specific field or application they are referring to.
|
| 6 |
+
- Rewrite query: What is a ""safe system,"" and can you explain its meaning or significance in a general or specific context?
|
| 7 |
+
- Key entities: safe system
|
| 8 |
+
|
| 9 |
+
**Relevant Sources:**
|
| 10 |
+
- [01-17-00] Safe system principles (Relevance: 0.652)
|
| 11 |
+
- [01-18-00] key ideals of the Safe System approach (Relevance: 0.595)
|
| 12 |
+
- [01-19-00] Components of the Safe System (Relevance: 0.575)
|
| 13 |
+
|
| 14 |
+
**Response:**
|
| 15 |
+
The vector db has been cached, containing 39 entries
|
| 16 |
+
|
| 17 |
+
","Query: what is safe system
|
| 18 |
+
Response: The Safe System is an approach to road safety that aims to create a forgiving and resilient road environment, recognizing that crashes can happen but should result in minimal harm. It involves managing all aspects of the transport system—roads, speeds, vehicles, and road users—so that if one component fails, others will still protect people from death or serious injury. The approach emphasizes that crash energies need to be reduced to safe levels, and it promotes a system where human error is accommodated and vulnerability to injury is minimized.
|
| 19 |
+
|
| 20 |
+
Key principles include:
|
| 21 |
+
- Designing roads that are predictable and forgiving of mistakes.
|
| 22 |
+
- Ensuring travel speeds are appropriate for safety.
|
| 23 |
+
- Using vehicles that prevent crashes and protect occupants.
|
| 24 |
+
- Promoting skilled, alert, and responsible road users.
|
| 25 |
+
|
| 26 |
+
The overall goal is to share responsibility among system designers and users to achieve safer mobility for everyone.
|
| 27 |
+
|
| 28 |
+
**Sources:**
|
| 29 |
+
- ""W 1.3 The Safe System Approach to Road Safety,"" pages 10, 11, and 16.
|
| 30 |
+
|
| 31 |
+
**Title:** Safe system principles
|
| 32 |
+
**From:** W 1.3 The Safe System Approach to Road Safety, page 10",
|
| 33 |
+
1,2025-06-16 06:12:07,"
|
| 34 |
+
**Query Analysis:**
|
| 35 |
+
- Query: list the guest lectures
|
| 36 |
+
- Summary of history: The user has asked to list the guest lectures, but the request is brief and lacks details such as the context (e.g., event, course, conference) or specific information about which guest lectures they are referring to.
|
| 37 |
+
- Rewrite query: Please provide a list of the guest lectures for the upcoming university seminar series.
|
| 38 |
+
- Key entities: guest lectures, university seminar series
|
| 39 |
+
|
| 40 |
+
**Relevant Sources:**
|
| 41 |
+
- [00-07-00] Unit schedule and class info (Relevance: 0.363)
|
| 42 |
+
- [00-04-00] Timetable and location for class (Relevance: 0.356)
|
| 43 |
+
- [00-05-00] Online link for class (Relevance: 0.328)
|
| 44 |
+
|
| 45 |
+
**Response:**
|
| 46 |
+
The vector db has been cached, containing 39 entries
|
| 47 |
+
|
| 48 |
+
","Query: list the guest lectures
|
| 49 |
+
Response: The guest lectures for this unit are as follows:
|
| 50 |
+
|
| 51 |
+
1. Week 1: Safe System Approach by Mr. Ross Thompson from TraffixGroup.
|
| 52 |
+
2. Week 3: ""North East Link Project"" by colleagues from SMEC.
|
| 53 |
+
3. Week 5: ""Geometric Design"" by Mr. Malcom Mak from Australia Road Research Board (ARRB).
|
| 54 |
+
4. Week 8: ""Detailed Geometric Design"" by colleagues from SMEC.
|
| 55 |
+
5. Week 10: Weigh-in-Motion Technology and its application in pavement design (guest lecturer not specified).
|
| 56 |
+
6. Week 11: Austroads Design Methods by Prof. Jayantha Kodikara from SPARC Hub Monash University.
|
| 57 |
+
|
| 58 |
+
Please note that the schedule of guest lectures is subject to change.
|
| 59 |
+
|
| 60 |
+
**Sources:**
|
| 61 |
+
- Title: Unit schedule and class info
|
| 62 |
+
- From: learning on Moodle - getting started",
|
| 63 |
+
1,2025-06-16 06:12:53,"
|
| 64 |
+
**Query Analysis:**
|
| 65 |
+
- Query: where should i take class
|
| 66 |
+
- Summary of history: The user previously asked for a list of guest lectures, which included topics and speakers for specific weeks. Now, they are asking about the location or venue where they should attend a class, likely related to the lectures or the course in general.
|
| 67 |
+
- Rewrite query: Where should I attend the class related to these guest lectures?
|
| 68 |
+
- Key entities: class, guest lectures, topics, speakers, weeks, location, venue, course
|
| 69 |
+
|
| 70 |
+
**Relevant Sources:**
|
| 71 |
+
- [01-31-00] test (Relevance: 0.389)
|
| 72 |
+
- [00-08-00] Unit contacts (Relevance: 0.349)
|
| 73 |
+
- [00-05-00] Online link for class (Relevance: 0.328)
|
| 74 |
+
- [00-07-00] Unit schedule and class info (Relevance: 0.327)
|
| 75 |
+
- [00-03-00] Checklist of important items you should do (Relevance: 0.309)
|
| 76 |
+
- [00-04-00] Timetable and location for class (Relevance: 0.296)
|
| 77 |
+
- [01-08-00] Road functions (Relevance: 0.291)
|
| 78 |
+
- [01-29-00] Road factors (Relevance: 0.250)
|
| 79 |
+
- [01-14-00] brief introduction of design elements (Relevance: 0.243)
|
| 80 |
+
|
| 81 |
+
**Response:**
|
| 82 |
+
The vector db has been cached, containing 39 entries
|
| 83 |
+
|
| 84 |
+
","Query: where should i take class
|
| 85 |
+
Response: You should attend the workshop class in the Learning and Teaching Building, G81 (main) and G56 (overflow). The class is scheduled for Monday from 7 pm to 9 pm. Additionally, there are practice sessions scheduled in G29/33 B72 on Wednesdays at 10 am-12 pm and 1 pm-3 pm, which you should attend according to your timetable.
|
| 86 |
+
|
| 87 |
+
**Sources:**
|
| 88 |
+
- Title: Timetable and location for class
|
| 89 |
+
- From: learning on Moodle - getting started",
|
| 90 |
+
1,2025-06-16 06:27:47,"
|
| 91 |
+
**Query Analysis:**
|
| 92 |
+
- Query: list the guest lecturers
|
| 93 |
+
- Summary of history: The user has asked for a list of guest lecturers, but the context of which event, course, or institution is not specified. They likely want a comprehensive or specific list related to a particular program or organization.
|
| 94 |
+
- Rewrite query: Please provide a list of guest lecturers for the [specific event, course, or institution], or clarify which guest lecturers you are referring to.
|
| 95 |
+
- Key entities: guest lecturers, event, course, institution
|
| 96 |
+
|
| 97 |
+
**Relevant Sources:**
|
| 98 |
+
- [01-31-00] test (Relevance: 0.447)
|
| 99 |
+
- [00-07-00] Unit schedule and class info (Relevance: 0.357)
|
| 100 |
+
- [00-05-00] Online link for class (Relevance: 0.354)
|
| 101 |
+
- [00-04-00] Timetable and location for class (Relevance: 0.335)
|
| 102 |
+
- [00-08-00] Unit contacts (Relevance: 0.327)
|
| 103 |
+
- [00-01-00] What this unit is about (Relevance: 0.200)
|
| 104 |
+
|
| 105 |
+
**Response:**
|
| 106 |
+
The vector db has been cached, containing 39 entries
|
| 107 |
+
|
| 108 |
+
","Query: list the guest lecturers
|
| 109 |
+
Response: The guest lecturers for this unit are:
|
| 110 |
+
|
| 111 |
+
1. Mr. Ross Thompson from TraffixGroup, who will give a lecture on the Safe System Approach in Week 1.
|
| 112 |
+
2. Colleagues from SMEC, who will give a lecture on ""North East Link Project"" in Week 3 and lead a practical activity on ""Multi-criteria analysis.""
|
| 113 |
+
3. Mr. Malcolm Mak from Australia Road Research Board (ARRB), who will lecture on “geometric design” in Week 5.
|
| 114 |
+
4. Colleagues from SMEC, who will lead a guest lecture and practical activity on “detailed geometric design” in Week 8.
|
| 115 |
+
5. Dr. Tareq and Dr. Shan from the Department of Transport Planning, Victoria, who will lecture on “pavement materials” in Week 9.
|
| 116 |
+
6. Prof. Jayantha Kodikara from SPARC Hub Monash University, who will give a lecture on “Austroads design methods” in Week 11.
|
| 117 |
+
|
| 118 |
+
Title: Unit schedule and class info
|
| 119 |
+
From: learning on Moodle - getting started
|
| 120 |
+
Content: The schedule of guest lectures is mentioned in the weekly topics.
|
| 121 |
+
|
| 122 |
+
",thumbs_up
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==5.34.0
|
| 2 |
+
openai==1.86.0
|
| 3 |
+
pandas==2.2.3
|
| 4 |
+
numpy==2.2.3
|
| 5 |
+
huggingface-hub==0.33.0
|
| 6 |
+
scipy==1.15.2
|
vector_database.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
vector_metadata.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
vectorize_knowledge_base.py
ADDED
|
@@ -0,0 +1,584 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import json
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from typing import List, Dict, Tuple, Optional
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
import csv
|
| 10 |
+
|
| 11 |
+
class KnowledgeBaseVectorizer:
    """Build, persist and query an embedding index for a markdown knowledge base.

    Each knowledge-base entry is embedded three ways (title only, content
    only, and title + content).  The vectors are stored as
    ``<type>_dim_<j>`` columns of a CSV file next to a JSON metadata file, and
    are cached in memory (raw and L2-normalised) after the first load so that
    repeated searches do not re-read the CSV.
    """

    # Embedding families stored per entry; each maps to "<name>_dim_<j>" columns.
    VECTOR_TYPES = ('title', 'content', 'full')

    # Length of the zero vector substituted when an embedding request fails
    # and no successfully fetched embedding is available to infer the size.
    DEFAULT_EMBEDDING_DIM = 1536

    def __init__(self, api_key: str, data_path: str = ""):
        """
        Initialise the vectorizer.

        Args:
            api_key: OpenAI API key.
            data_path: Path to the knowledge_base.md file.
        """
        self.client = OpenAI(api_key=api_key)
        self.data_path = data_path
        self.embedding_model = "text-embedding-3-small"
        # NOTE: the index files are resolved relative to the current working
        # directory, not relative to data_path.
        self.vector_db_path = "vector_database.csv"
        self.metadata_path = "vector_metadata.json"

        # In-memory cache state.
        self._cached_df = None
        self._cached_metadata = None
        self._cached_embeddings = {}  # vector type -> {'raw', 'normalized'}
        self._last_load_time = None

    def parse_knowledge_base(self) -> List[Dict]:
        """
        Parse the knowledge-base markdown file into entry dictionaries.

        An entry has the shape ``# xx-xx-xx title``, ``**source** ...``,
        ``**content** ...`` and extends to the next ``# xx-xx-xx`` heading or
        the end of the file.  Blank lines inside the content are dropped and
        trailing whitespace is stripped, so markdown tables stay intact.

        Returns:
            A list of dicts with keys ``id``, ``title``, ``source``,
            ``content`` and ``full_text`` (title + content, used for
            embedding).  Empty when the file is missing or nothing matches.
        """
        entries = []

        try:
            with open(self.data_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except FileNotFoundError:
            print(f"错误:找不到文件 {self.data_path}")
            return entries

        # heading id, title, source line, then content lazily up to the next
        # entry heading (or end of input).
        pattern = r'#\s+(\d{2}-\d{2}-\d{2})\s+([^\n]+)\s+\*\*source\*\*\s+([^\n]+)\s+\*\*content\*\*\s+(.*?)(?=\n#\s+\d{2}-\d{2}-\d{2}|$)'

        for entry_id, title, source, body in re.findall(pattern, content, re.DOTALL):
            # Keep every non-blank line (table rows included), right-stripped.
            cleaned_content = '\n'.join(
                line.rstrip() for line in body.strip().split('\n') if line.strip()
            )
            title = title.strip()
            entries.append({
                'id': entry_id.strip(),
                'title': title,
                'source': source.strip(),
                'content': cleaned_content,
                'full_text': f"{title} {cleaned_content}"  # text used for the "full" embedding
            })

        print(f"成功解析 {len(entries)} 个数据条目")

        # Debug summary of the first few entries.
        if entries:
            print("前3个条目的内容长度:")
            for entry in entries[:3]:
                content_lines = entry['content'].count('\n') + 1
                has_table = '|' in entry['content']
                print(f" 条目 {entry['id']}: {len(entry['content'])} 字符, {content_lines} 行, 包含表格: {has_table}")

        return entries

    def get_embedding(self, text: str) -> List[float]:
        """
        Fetch the embedding of a single text from the OpenAI API.

        Args:
            text: Text to embed.

        Returns:
            The embedding vector, or an empty list when the request fails
            (the error is printed, not raised).
        """
        try:
            response = self.client.embeddings.create(
                input=text,
                model=self.embedding_model
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"获取向量时出错: {e}")
            return []

    def batch_get_embeddings(self, texts: List[str], batch_size: int = 10) -> List[List[float]]:
        """
        Fetch embeddings for many texts, ``batch_size`` texts per request.

        If a batch request fails, its texts are retried one by one; a text
        that still fails is represented by an all-zero vector so the result
        always lines up with ``texts``.

        Args:
            texts: Texts to embed.
            batch_size: Number of texts per API request (must be >= 1).

        Returns:
            One embedding per input text, in input order.

        Raises:
            ValueError: If ``texts`` is non-empty and ``batch_size`` < 1.
        """
        if not texts:
            return []
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        embeddings = []
        total_batches = (len(texts) + batch_size - 1) // batch_size

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            print(f"处理批次 {i//batch_size + 1}/{(len(texts) + batch_size - 1)//batch_size}")

            try:
                response = self.client.embeddings.create(
                    input=batch,
                    model=self.embedding_model
                )
                embeddings.extend(item.embedding for item in response.data)
            except Exception as e:
                print(f"批次处理出错: {e}")
                # Batch failed: fall back to one request per text.
                for text in batch:
                    embedding = self.get_embedding(text)
                    if not embedding:
                        # Keep positions aligned; size the placeholder like
                        # the vectors already fetched when there are any.
                        dim = len(embeddings[0]) if embeddings else self.DEFAULT_EMBEDDING_DIM
                        embedding = [0] * dim
                    embeddings.append(embedding)

        assert total_batches >= 1  # texts is non-empty here
        return embeddings

    def create_vector_database(self):
        """
        Build the vector database from the knowledge base and save it.

        Embeds title, content and full text of every parsed entry, writes
        one CSV row per entry (metadata columns followed by the three groups
        of ``*_dim_<j>`` columns), writes the JSON metadata file, and clears
        the in-memory cache.  Does nothing if no entries are parsed.
        """
        print("开始创建向量数据库...")

        # 1. Parse the knowledge base.
        entries = self.parse_knowledge_base()
        if not entries:
            print("没有找到任何数据条目")
            return

        # 2. Embed the three text variants.
        print("开始向量化标题...")
        title_embeddings = self.batch_get_embeddings([entry['title'] for entry in entries])

        print("开始向量化内容...")
        content_embeddings = self.batch_get_embeddings([entry['content'] for entry in entries])

        print("开始向量化完整文本...")
        full_embeddings = self.batch_get_embeddings([entry['full_text'] for entry in entries])

        # 3. Assemble one flat row per entry.
        print("创建向量数据库DataFrame...")
        rows = []
        for i, entry in enumerate(entries):
            row = {
                'index': i,
                'id': entry['id'],
                'title': entry['title'],
                'source': entry['source'],
                'content': entry['content'],
                'full_text': entry['full_text']
            }
            row.update({f'title_dim_{j}': val for j, val in enumerate(title_embeddings[i])})
            row.update({f'content_dim_{j}': val for j, val in enumerate(content_embeddings[i])})
            row.update({f'full_dim_{j}': val for j, val in enumerate(full_embeddings[i])})
            rows.append(row)

        df = pd.DataFrame(rows)

        # 4. Persist the vectors.
        print("保存向量数据库到CSV...")
        df.to_csv(self.vector_db_path, index=False, encoding='utf-8')

        # 5. Persist human-readable metadata.
        embedding_dimensions = len(title_embeddings[0]) if title_embeddings else 0
        metadata = {
            'embedding_model': self.embedding_model,
            'created_at': datetime.now().isoformat(),
            'num_entries': len(entries),
            'embedding_dimensions': embedding_dimensions,
            'vector_types': list(self.VECTOR_TYPES),
            'columns': list(df.columns),
            'entries_summary': [
                {
                    'id': entry['id'],
                    'title': entry['title'],
                    'source': entry['source']
                } for entry in entries
            ]
        }

        with open(self.metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        print("向量数据库创建完成!")
        print(f"向量数据库保存在: {self.vector_db_path}")
        print(f"元数据保存在: {self.metadata_path}")
        print(f"总共处理了 {len(entries)} 个条目")
        print(f"每个向量的维度: {len(title_embeddings[0]) if title_embeddings else 0}")

        # Drop the cache so the next load picks up the new files.
        self.clear_cache()

    def clear_cache(self):
        """Drop the cached DataFrame, metadata and embedding matrices."""
        self._cached_df = None
        self._cached_metadata = None
        self._cached_embeddings = {}
        self._last_load_time = None
        print("向量数据库缓存已清除")

    def _restore_cache_state(self, state):
        """Put back a cache snapshot taken before a failed (re)load."""
        (self._cached_df, self._cached_metadata,
         self._cached_embeddings, self._last_load_time) = state

    def load_vector_database(self, force_reload: bool = False) -> Tuple[Optional[pd.DataFrame], Optional[Dict]]:
        """
        Load the vector database from disk, using the in-memory cache.

        Args:
            force_reload: Re-read the files even when a cached copy exists.

        Returns:
            ``(DataFrame, metadata dict)``, or ``(None, None)`` when loading
            fails (the error is printed).  On failure the cache is left
            exactly as it was before the call.
        """
        if (not force_reload and self._cached_df is not None
                and self._cached_metadata is not None):
            return self._cached_df, self._cached_metadata

        snapshot = (self._cached_df, self._cached_metadata,
                    self._cached_embeddings, self._last_load_time)
        try:
            df = pd.read_csv(self.vector_db_path, encoding='utf-8')

            with open(self.metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            self._cached_df = df
            self._cached_metadata = metadata
            # Matrices derived from a previous load are stale now; without
            # this reset a forced reload would keep serving the old vectors.
            self._cached_embeddings = {}
            self._preload_embeddings()
            self._last_load_time = datetime.now()

            print(f"成功加载向量数据库,包含 {len(df)} 个条目")
            return df, metadata
        except FileNotFoundError as e:
            self._restore_cache_state(snapshot)
            print(f"错误:找不到文件 - {e}")
            return None, None
        except Exception as e:
            # Never leave a DataFrame cached without its matrices.
            self._restore_cache_state(snapshot)
            print(f"加载向量数据库时出错: {e}")
            return None, None

    def _preload_embeddings(self):
        """Cache raw and L2-normalised matrices for every vector type."""
        if self._cached_df is None:
            return

        vector_types = list(self.VECTOR_TYPES)
        for vector_type in vector_types:
            if vector_type in self._cached_embeddings:
                continue
            embeddings = self.get_embeddings_from_df(self._cached_df, vector_type)
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            # All-zero rows (failed embeddings) would divide by zero and turn
            # into NaN; dividing by 1 keeps them zero, i.e. similarity 0.
            safe_norms = np.where(norms == 0, 1.0, norms)
            self._cached_embeddings[vector_type] = {
                'raw': embeddings,
                'normalized': embeddings / safe_norms
            }

        print(f"预加载了 {len(vector_types)} 种类型的向量矩阵")

    def get_embeddings_from_df(self, df: pd.DataFrame, vector_type: str = 'full') -> np.ndarray:
        """
        Extract one embedding matrix from the database DataFrame.

        Args:
            df: DataFrame holding the ``*_dim_<j>`` columns.
            vector_type: ``'title'``, ``'content'`` or ``'full'``; any other
                value is treated as ``'full'``.

        Returns:
            A 2-D array with one row per entry, columns in DataFrame order.
        """
        prefix = f'{vector_type}_dim_' if vector_type in ('title', 'content') else 'full_dim_'
        embedding_cols = [col for col in df.columns if col.startswith(prefix)]
        return df[embedding_cols].values

    def batch_search_similar(self, queries: List[str], top_k: int = 5,
                             title_weight: float = 0.4,
                             content_weight: float = 0.3,
                             full_weight: float = 0.3) -> List[List[Tuple[Dict, float, Dict]]]:
        """
        Search the database for several queries with a single database load.

        The score of an entry is the weighted sum of the cosine similarities
        between the query and the entry's title, content and full-text
        vectors.  The weights are rescaled to sum to 1.

        Args:
            queries: Query texts.
            top_k: Number of best entries returned per query.
            title_weight: Weight of the title similarity.
            content_weight: Weight of the content similarity.
            full_weight: Weight of the full-text similarity.

        Returns:
            One list per query of ``(entry, combined_score, details)``
            tuples, best first.  A query's list is empty when the database
            cannot be loaded or no usable query vector was obtained.

        Raises:
            ValueError: If the weights do not sum to a positive number.
        """
        if not queries:
            return []

        total_weight = title_weight + content_weight + full_weight
        if total_weight <= 0:
            raise ValueError("title_weight + content_weight + full_weight must be positive")
        title_weight /= total_weight
        content_weight /= total_weight
        full_weight /= total_weight

        # Load once for the whole batch.
        df, _ = self.load_vector_database()
        if df is None:
            return [[] for _ in queries]

        print(f"批量生成 {len(queries)} 个查询的向量...")
        query_embeddings = self.batch_get_embeddings(queries, batch_size=min(10, len(queries)))

        if len(query_embeddings) != len(queries):
            print("查询向量生成失败")
            return [[] for _ in queries]

        title_embeddings_norm = self._cached_embeddings['title']['normalized']
        content_embeddings_norm = self._cached_embeddings['content']['normalized']
        full_embeddings_norm = self._cached_embeddings['full']['normalized']

        all_results = []

        for i, (query, query_embedding) in enumerate(zip(queries, query_embeddings)):
            if not query_embedding:
                all_results.append([])
                continue

            query_vec = np.asarray(query_embedding, dtype=float)
            query_norm = np.linalg.norm(query_vec)
            if query_norm == 0:
                # Zero placeholder from a failed embedding request: there is
                # no direction to compare against, so report no matches.
                all_results.append([])
                continue
            query_vec_norm = query_vec / query_norm

            # Cosine similarities (both sides are unit length).
            title_similarities = np.dot(title_embeddings_norm, query_vec_norm)
            content_similarities = np.dot(content_embeddings_norm, query_vec_norm)
            full_similarities = np.dot(full_embeddings_norm, query_vec_norm)

            combined_similarities = (
                title_weight * title_similarities +
                content_weight * content_similarities +
                full_weight * full_similarities
            )

            top_indices = np.argsort(combined_similarities)[::-1][:top_k]

            query_results = []
            for idx in top_indices:
                row = df.iloc[idx]
                entry = {
                    'id': row['id'],
                    'title': row['title'],
                    'source': row['source'],
                    'content': row['content']
                }
                # Per-component breakdown of the score.
                similarity_details = {
                    'combined': float(combined_similarities[idx]),
                    'title': float(title_similarities[idx]),
                    'content': float(content_similarities[idx]),
                    'full': float(full_similarities[idx])
                }
                query_results.append((entry, float(combined_similarities[idx]), similarity_details))

            all_results.append(query_results)
            print(f"完成查询 {i+1}/{len(queries)}: '{query[:50]}...'")

        return all_results

    def search_similar(self, query: str, top_k: int = 5,
                       title_weight: float = 0.4,
                       content_weight: float = 0.3,
                       full_weight: float = 0.3) -> List[Tuple[Dict, float, Dict]]:
        """
        Search the database for a single query.

        Thin wrapper around ``batch_search_similar`` with a one-element batch.

        Args:
            query: Query text.
            top_k: Number of best entries to return.
            title_weight: Weight of the title similarity.
            content_weight: Weight of the content similarity.
            full_weight: Weight of the full-text similarity.

        Returns:
            ``(entry, combined_score, details)`` tuples, best first.
        """
        results = self.batch_search_similar([query], top_k, title_weight, content_weight, full_weight)
        return results[0] if results else []

    def search_with_entities_optimized(self, entities: List[str], top_k: int = 3) -> List[Tuple[Dict, float, Dict]]:
        """
        Search the database with a list of entities in one batch.

        Args:
            entities: Entity strings, each used as its own query.
            top_k: Number of results requested per entity.

        Returns:
            The merged results, one tuple per distinct entry id carrying the
            highest score that entry reached for any entity, sorted by score
            in descending order.
        """
        if not entities:
            return []

        batch_results = self.batch_search_similar(
            entities,
            top_k=top_k,
            title_weight=0.5,  # entity lookups lean more on the title
            content_weight=0.3,
            full_weight=0.2
        )

        # De-duplicate by id, keeping the best-scoring hit so that the final
        # ranking does not depend on the order of the entities.
        best_by_id = {}
        for entity_results in batch_results:
            for entry, score, details in entity_results:
                current = best_by_id.get(entry['id'])
                if current is None or score > current[1]:
                    best_by_id[entry['id']] = (entry, score, details)

        return sorted(best_by_id.values(), key=lambda item: item[1], reverse=True)

    def add_new_entry(self, id: str, title: str, source: str, content: str):
        """
        Embed one new entry and append it to the vector database.

        The CSV and the metadata file are both rewritten; a metadata file is
        created when none could be loaded so the database stays loadable.
        Nothing is written if any of the three embeddings cannot be fetched.

        Args:
            id: Entry id.
            title: Entry title.
            source: Entry source.
            content: Entry content.
        """
        df, metadata = self.load_vector_database()

        if df is None:
            print("向量数据库不存在,将创建新的数据库")
            df = pd.DataFrame()

        full_text = f"{title} {content}"

        print(f"正在为新条目 {id} 生成向量...")
        title_embedding = self.get_embedding(title)
        content_embedding = self.get_embedding(content)
        full_embedding = self.get_embedding(full_text)

        if not all([title_embedding, content_embedding, full_embedding]):
            print("无法生成向量")
            return

        new_entry = {
            'index': len(df),
            'id': id,
            'title': title,
            'source': source,
            'content': content,
            'full_text': full_text
        }
        new_entry.update({f'title_dim_{j}': val for j, val in enumerate(title_embedding)})
        new_entry.update({f'content_dim_{j}': val for j, val in enumerate(content_embedding)})
        new_entry.update({f'full_dim_{j}': val for j, val in enumerate(full_embedding)})

        new_df = pd.DataFrame([new_entry])
        # Concatenating onto an empty frame only triggers pandas warnings.
        df = new_df if df.empty else pd.concat([df, new_df], ignore_index=True)

        df.to_csv(self.vector_db_path, index=False, encoding='utf-8')

        now = datetime.now().isoformat()
        if not metadata:
            # No usable metadata: create it, otherwise the CSV written above
            # could never be loaded again.
            metadata = {
                'embedding_model': self.embedding_model,
                'created_at': now,
                'embedding_dimensions': len(title_embedding),
                'vector_types': list(self.VECTOR_TYPES),
                'entries_summary': []
            }
        metadata['num_entries'] = len(df)
        metadata['updated_at'] = now
        metadata['columns'] = list(df.columns)
        metadata.setdefault('entries_summary', []).append(
            {'id': id, 'title': title, 'source': source}
        )
        with open(self.metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        # Drop the cache so the next load sees the new entry.
        self.clear_cache()

        print(f"成功添加新条目 {id}")

    def export_to_readable_format(self, output_path: Optional[str] = None):
        """
        Export the database without the embedding columns.

        Args:
            output_path: Destination CSV path.  Defaults to
                ``vector_database_readable.csv`` in the directory of
                ``data_path``.
        """
        df, _ = self.load_vector_database()
        if df is None:
            return

        if output_path is None:
            output_path = os.path.join(
                os.path.dirname(self.data_path),
                "vector_database_readable.csv"
            )

        # Keep only the non-vector columns.
        vector_prefixes = ('title_dim_', 'content_dim_', 'full_dim_')
        non_vector_cols = [col for col in df.columns if not col.startswith(vector_prefixes)]
        readable_df = df[non_vector_cols]

        readable_df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"可读格式的数据库已保存到: {output_path}")

    def get_cache_info(self) -> Dict:
        """
        Describe the current cache state.

        Returns:
            Dict with ``is_cached``, ``cache_size`` (number of cached rows),
            ``cached_embeddings`` (cached vector types) and
            ``last_load_time`` (ISO string or ``None``).
        """
        cached = self._cached_df is not None
        return {
            'is_cached': cached,
            'cache_size': len(self._cached_df) if cached else 0,
            'cached_embeddings': list(self._cached_embeddings.keys()),
            'last_load_time': self._last_load_time.isoformat() if self._last_load_time else None
        }
|