linxinhua committed
Commit 64af45d · verified · 1 Parent(s): 21bff06

Update vectorize_knowledge_base.py via admin tool

Files changed (1):
  1. vectorize_knowledge_base.py +63 -132
vectorize_knowledge_base.py CHANGED
@@ -9,21 +9,28 @@ from datetime import datetime
 import csv
 
 class KnowledgeBaseVectorizer:
-    def __init__(self, api_key: str, data_path: str = ""):
+    def __init__(self, api_key: str, data_path: str = "", vector_db_dir: str = ""):
         """
-        Initialize the vectorizer
+        Initialize the vectorizer (adapted for the student Space)
 
         Args:
             api_key: OpenAI API key
-            data_path: path to the knowledge_base.md file
+            data_path: path to the knowledge_base.md file (if empty, use the file in vector_db_dir)
+            vector_db_dir: directory holding the vector database (usually the local directory of the data-storage repo)
         """
         self.client = OpenAI(api_key=api_key)
-        self.data_path = data_path
         self.embedding_model = "text-embedding-3-small"
-        #self.vector_db_path = os.path.join(os.path.dirname(data_path), "vector_database.csv")
-        #self.metadata_path = os.path.join(os.path.dirname(data_path), "vector_metadata.json")
-        self.vector_db_path = "vector_database.csv"
-        self.metadata_path = "vector_metadata.json"
+
+        # If vector_db_dir is given, prefer the files in that directory
+        if vector_db_dir:
+            self.data_path = os.path.join(vector_db_dir, "knowledge_base.md")
+            self.vector_db_path = os.path.join(vector_db_dir, "vector_database.csv")
+            self.metadata_path = os.path.join(vector_db_dir, "vector_metadata.json")
+        else:
+            # Keep the original behavior for backward compatibility
+            self.data_path = data_path if data_path else "knowledge_base.md"
+            self.vector_db_path = "vector_database.csv"
+            self.metadata_path = "vector_metadata.json"
 
         # Cache-related attributes
         self._cached_df = None
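
Note: the path-resolution change above is the heart of this commit — when vector_db_dir is supplied, all three artifacts (knowledge_base.md, vector_database.csv, vector_metadata.json) are resolved inside that directory instead of the working directory. A minimal usage sketch, assuming the module is importable under its file name and using hypothetical paths:

    import os
    from vectorize_knowledge_base import KnowledgeBaseVectorizer

    # Hypothetical values; substitute your own key and directory.
    vectorizer = KnowledgeBaseVectorizer(
        api_key=os.environ["OPENAI_API_KEY"],
        vector_db_dir="/data/kb_repo",
    )
    # Resolved paths:
    #   /data/kb_repo/knowledge_base.md
    #   /data/kb_repo/vector_database.csv
    #   /data/kb_repo/vector_metadata.json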
@@ -31,6 +38,11 @@ class KnowledgeBaseVectorizer:
         self._cached_embeddings = {}  # cache the vector matrices by type
         self._last_load_time = None
 
+        print(f"[KnowledgeBaseVectorizer] Initialized with:")
+        print(f"  - Knowledge base: {self.data_path}")
+        print(f"  - Vector database: {self.vector_db_path}")
+        print(f"  - Metadata: {self.metadata_path}")
+
     def parse_knowledge_base(self) -> List[Dict]:
         """
         Parse the knowledge_base.md file and extract all data entries
@@ -44,8 +56,12 @@
         try:
             with open(self.data_path, 'r', encoding='utf-8') as f:
                 content = f.read()
+            print(f"[parse_knowledge_base] Successfully read file: {self.data_path}")
         except FileNotFoundError:
-            print(f"Error: file not found {self.data_path}")
+            print(f"[parse_knowledge_base] Error: File not found - {self.data_path}")
+            return entries
+        except Exception as e:
+            print(f"[parse_knowledge_base] Error reading file: {e}")
             return entries
 
         # Improved matching strategy: use a more precise regular expression
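
Note: the "more precise regular expression" itself sits outside this hunk, so it is not shown in the diff. For orientation only, a sketch of the general shape such a parser takes, assuming entries are introduced by level-2 markdown headings (the heading convention is an assumption, not taken from this file):

    import re

    def parse_entries(content: str) -> list[dict]:
        # Assumed format: "## <id> <title>" followed by the entry body.
        pattern = r"^## (\S+) (.+?)$\n(.*?)(?=^## |\Z)"
        entries = []
        for m in re.finditer(pattern, content, re.MULTILINE | re.DOTALL):
            entry_id, title, body = m.groups()
            entries.append({"id": entry_id, "title": title, "content": body.strip()})
        return entries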
@@ -79,15 +95,15 @@
             }
             entries.append(entry)
 
-        print(f"Successfully parsed {len(entries)} data entries")
+        print(f"[parse_knowledge_base] Successfully parsed {len(entries)} entries")
 
         # Print some debugging information
         if entries:
-            print("Content lengths of the first 3 entries:")
+            print("[parse_knowledge_base] First 3 entries info:")
             for i, entry in enumerate(entries[:3]):
                 content_lines = entry['content'].count('\n') + 1
                 has_table = '|' in entry['content']
-                print(f"  Entry {entry['id']}: {len(entry['content'])} characters, {content_lines} lines, contains table: {has_table}")
+                print(f"  Entry {entry['id']}: {len(entry['content'])} chars, {content_lines} lines, has table: {has_table}")
 
         return entries
 
@@ -108,7 +124,7 @@
             )
             return response.data[0].embedding
         except Exception as e:
-            print(f"Error while getting embedding: {e}")
+            print(f"[get_embedding] Error: {e}")
             return []
 
     def batch_get_embeddings(self, texts: List[str], batch_size: int = 10) -> List[List[float]]:
@@ -126,7 +142,7 @@
 
         for i in range(0, len(texts), batch_size):
             batch = texts[i:i + batch_size]
-            print(f"Processing batch {i//batch_size + 1}/{(len(texts) + batch_size - 1)//batch_size}")
+            print(f"[batch_get_embeddings] Processing batch {i//batch_size + 1}/{(len(texts) + batch_size - 1)//batch_size}")
 
             try:
                 response = self.client.embeddings.create(
@@ -136,7 +152,7 @@
                 batch_embeddings = [item.embedding for item in response.data]
                 embeddings.extend(batch_embeddings)
             except Exception as e:
-                print(f"Batch processing error: {e}")
+                print(f"[batch_get_embeddings] Batch error: {e}")
                 # If the batch fails, fall back to processing items one by one
                 for text in batch:
                     embedding = self.get_embedding(text)
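
Note: the batch/fallback pattern in the two hunks above reads well in isolation — embed batch_size texts per API call, and if a whole batch fails, retry its items one at a time so a single bad input does not sink the rest. A condensed standalone sketch of the same pattern (function and variable names here are illustrative):

    from openai import OpenAI

    def embed_all(client: OpenAI, texts: list[str], model: str,
                  batch_size: int = 10) -> list[list[float]]:
        embeddings: list[list[float]] = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            try:
                response = client.embeddings.create(model=model, input=batch)
                embeddings.extend(item.embedding for item in response.data)
            except Exception:
                # Fall back to single requests; a failure now costs one item, not the batch.
                for text in batch:
                    try:
                        single = client.embeddings.create(model=model, input=text)
                        embeddings.append(single.data[0].embedding)
                    except Exception:
                        embeddings.append([])  # keep indexes aligned with texts
        return embeddings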
@@ -149,12 +165,12 @@
         Create the vector database and save it as a CSV file
         Supports vectorizing titles and contents separately
         """
-        print("Starting to create the vector database...")
+        print("[create_vector_database] Starting to create vector database...")
 
         # 1. Parse the knowledge base
         entries = self.parse_knowledge_base()
         if not entries:
-            print("No data entries found")
+            print("[create_vector_database] No entries found")
             return
 
         # 2. Prepare the texts to vectorize
@@ -163,17 +179,17 @@
         full_texts = [entry['full_text'] for entry in entries]
 
         # 3. Batch-fetch the embeddings
-        print("Vectorizing titles...")
+        print("[create_vector_database] Vectorizing titles...")
         title_embeddings = self.batch_get_embeddings(titles)
 
-        print("Vectorizing contents...")
+        print("[create_vector_database] Vectorizing contents...")
         content_embeddings = self.batch_get_embeddings(contents)
 
-        print("Vectorizing full texts...")
+        print("[create_vector_database] Vectorizing full texts...")
         full_embeddings = self.batch_get_embeddings(full_texts)
 
         # 4. Create a DataFrame to store the data
-        print("Creating the vector database DataFrame...")
+        print("[create_vector_database] Creating DataFrame...")
 
         # Prepare the data rows
         rows = []
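
Note: the DataFrame assembled here stores every embedding component as its own column (title_dim_0 ... title_dim_N, and likewise content_dim_* and full_dim_*; N is 1535 for text-embedding-3-small), which is what lets the database round-trip through a plain CSV file. A sketch of how one row is built under that layout (the helper name is hypothetical):

    import pandas as pd

    def build_row(index: int, entry: dict, title_vec, content_vec, full_vec) -> dict:
        row = {"index": index, **entry}  # id, title, source, content, full_text
        row.update({f"title_dim_{j}": v for j, v in enumerate(title_vec)})
        row.update({f"content_dim_{j}": v for j, v in enumerate(content_vec)})
        row.update({f"full_dim_{j}": v for j, v in enumerate(full_vec)})
        return row

    # rows = [build_row(i, e, t, c, f) for ...]; pd.DataFrame(rows).to_csv(...)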
@@ -205,7 +221,7 @@
         df = pd.DataFrame(rows)
 
         # 5. Save as a CSV file
-        print("Saving the vector database to CSV...")
+        print(f"[create_vector_database] Saving to {self.vector_db_path}...")
         df.to_csv(self.vector_db_path, index=False, encoding='utf-8')
 
         # 6. Save the metadata (JSON format, easy to inspect)
@@ -228,11 +244,11 @@
         with open(self.metadata_path, 'w', encoding='utf-8') as f:
             json.dump(metadata, f, ensure_ascii=False, indent=2)
 
-        print(f"Vector database creation complete!")
-        print(f"Vector database saved at: {self.vector_db_path}")
-        print(f"Metadata saved at: {self.metadata_path}")
-        print(f"Processed {len(entries)} entries in total")
-        print(f"Dimension of each vector: {len(title_embeddings[0]) if title_embeddings else 0}")
+        print(f"[create_vector_database] Vector database created successfully!")
+        print(f"  - Vector database saved to: {self.vector_db_path}")
+        print(f"  - Metadata saved to: {self.metadata_path}")
+        print(f"  - Processed {len(entries)} entries")
+        print(f"  - Vector dimensions: {len(title_embeddings[0]) if title_embeddings else 0}")
 
         # Clear the cache so it gets reloaded
         self.clear_cache()
@@ -243,7 +259,7 @@
         self._cached_metadata = None
         self._cached_embeddings = {}
         self._last_load_time = None
-        print("Vector database cache cleared")
+        print("[clear_cache] Vector database cache cleared")
 
     def load_vector_database(self, force_reload: bool = False) -> Tuple[Optional[pd.DataFrame], Optional[Dict]]:
         """
@@ -261,9 +277,11 @@
 
         try:
             # Load the CSV file
+            print(f"[load_vector_database] Loading from {self.vector_db_path}")
             df = pd.read_csv(self.vector_db_path, encoding='utf-8')
 
             # Load the metadata
+            print(f"[load_vector_database] Loading metadata from {self.metadata_path}")
             with open(self.metadata_path, 'r', encoding='utf-8') as f:
                 metadata = json.load(f)
 
@@ -275,13 +293,13 @@
             # Preload the vector matrices into the cache
             self._preload_embeddings()
 
-            print(f"Successfully loaded the vector database with {len(df)} entries")
+            print(f"[load_vector_database] Successfully loaded vector database with {len(df)} entries")
             return df, metadata
         except FileNotFoundError as e:
-            print(f"Error: file not found - {e}")
+            print(f"[load_vector_database] Error: File not found - {e}")
             return None, None
         except Exception as e:
-            print(f"Error while loading the vector database: {e}")
+            print(f"[load_vector_database] Error loading vector database: {e}")
             return None, None
 
     def _preload_embeddings(self):
@@ -300,7 +318,7 @@
                 'normalized': embeddings_norm
             }
 
-        print(f"Preloaded {len(vector_types)} types of vector matrices")
+        print(f"[_preload_embeddings] Preloaded {len(vector_types)} types of vector matrices")
 
     def get_embeddings_from_df(self, df: pd.DataFrame, vector_type: str = 'full') -> np.ndarray:
         """
@@ -353,11 +371,11 @@
             return [[] for _ in queries]
 
         # Batch-generate the query vectors
-        print(f"Generating vectors for {len(queries)} queries in batch...")
+        print(f"[batch_search_similar] Generating vectors for {len(queries)} queries...")
         query_embeddings = self.batch_get_embeddings(queries, batch_size=min(10, len(queries)))
 
         if len(query_embeddings) != len(queries):
-            print("Query vector generation failed")
+            print("[batch_search_similar] Query vector generation failed")
             return [[] for _ in queries]
 
         # Fetch the cached normalized vector matrices
@@ -413,7 +431,7 @@
                 query_results.append((entry, float(combined_similarities[idx]), similarity_details))
 
             all_results.append(query_results)
-            print(f"Completed query {i+1}/{len(queries)}: '{query[:50]}...'")
+            print(f"[batch_search_similar] Completed query {i+1}/{len(queries)}: '{query[:50]}...'")
 
         return all_results
 
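
Note: batch_search_similar blends three similarity signals (title, content, full text) per entry. With the normalized matrices above, the combined score and top-k selection reduce to a weighted sum and an argsort; a sketch (variable names are illustrative, and the weights shown are the entity-search values from this commit):

    import numpy as np

    def combine_and_rank(sim_title, sim_content, sim_full,
                         title_weight=0.3, content_weight=0.5,
                         full_weight=0.2, top_k=5):
        combined = (title_weight * sim_title
                    + content_weight * sim_content
                    + full_weight * sim_full)
        top_idx = np.argsort(combined)[::-1][:top_k]  # best first
        return top_idx, combined[top_idx]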
@@ -439,7 +457,7 @@
         results = self.batch_search_similar([query], top_k, title_weight, content_weight, full_weight)
         return results[0] if results else []
 
-    def search_with_entities_optimized(self, entities: List[str], top_k: int = 3) -> List[Tuple[Dict, float, Dict]]:
+    def search_with_entities_optimized(self, entities: List[str], top_k: int = 5) -> List[Tuple[Dict, float, Dict]]:
         """
         Optimized version: search the knowledge base with a list of entities, loading the vector database only once
 
@@ -457,8 +475,8 @@
         batch_results = self.batch_search_similar(
             entities,
             top_k=top_k,
-            title_weight=0.5,  # for entity search, weight titles more heavily
-            content_weight=0.3,
+            title_weight=0.3,  # for entity search, weight content more heavily
+            content_weight=0.5,
             full_weight=0.2
         )
 
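
Note: the weight swap in the hunk above is a behavioral change — entity search previously emphasized titles (title 0.5, content 0.3) and now emphasizes content (title 0.3, content 0.5). A hypothetical call showing the new defaults in use (entity strings are illustrative):

    entities = ["GDP", "population census"]
    results = vectorizer.search_with_entities_optimized(entities, top_k=5)
    for entry, score, details in results:
        print(entry["id"], round(score, 3), details)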
@@ -477,98 +495,6 @@
         sorted_results = sorted(all_results, key=lambda x: x[1], reverse=True)
         return sorted_results
 
-    def add_new_entry(self, id: str, title: str, source: str, content: str):
-        """
-        Add a new entry to the vector database
-
-        Args:
-            id: entry ID
-            title: title
-            source: source
-            content: content
-        """
-        # Load the existing database
-        df, metadata = self.load_vector_database()
-
-        if df is None:
-            print("Vector database does not exist; a new one will be created")
-            df = pd.DataFrame()
-
-        # Create the new entry
-        full_text = f"{title} {content}"
-
-        # Get the three types of embeddings
-        print(f"Generating embeddings for new entry {id}...")
-        title_embedding = self.get_embedding(title)
-        content_embedding = self.get_embedding(content)
-        full_embedding = self.get_embedding(full_text)
-
-        if not all([title_embedding, content_embedding, full_embedding]):
-            print("Failed to generate embeddings")
-            return
-
-        # Create the new entry
-        new_entry = {
-            'index': len(df),
-            'id': id,
-            'title': title,
-            'source': source,
-            'content': content,
-            'full_text': full_text
-        }
-
-        # Add the vector dimension columns
-        for j, val in enumerate(title_embedding):
-            new_entry[f'title_dim_{j}'] = val
-        for j, val in enumerate(content_embedding):
-            new_entry[f'content_dim_{j}'] = val
-        for j, val in enumerate(full_embedding):
-            new_entry[f'full_dim_{j}'] = val
-
-        # Append to the DataFrame
-        new_df = pd.DataFrame([new_entry])
-        df = pd.concat([df, new_df], ignore_index=True)
-
-        # Save the updated database
-        df.to_csv(self.vector_db_path, index=False, encoding='utf-8')
-
-        # Update the metadata
-        if metadata:
-            metadata['num_entries'] = len(df)
-            metadata['updated_at'] = datetime.now().isoformat()
-            with open(self.metadata_path, 'w', encoding='utf-8') as f:
-                json.dump(metadata, f, ensure_ascii=False, indent=2)
-
-        # Clear the cache so it gets reloaded
-        self.clear_cache()
-
-        print(f"Successfully added new entry {id}")
-
-    def export_to_readable_format(self, output_path: str = None):
-        """
-        Export the vector database to a more readable format (without the vector dimension columns)
-
-        Args:
-            output_path: output file path
-        """
-        df, _ = self.load_vector_database()
-        if df is None:
-            return
-
-        if output_path is None:
-            output_path = os.path.join(
-                os.path.dirname(self.data_path),
-                "vector_database_readable.csv"
-            )
-
-        # Keep only the non-vector columns
-        non_vector_cols = [col for col in df.columns if not any(col.startswith(prefix) for prefix in ['title_dim_', 'content_dim_', 'full_dim_'])]
-        readable_df = df[non_vector_cols]
-
-        # Save
-        readable_df.to_csv(output_path, index=False, encoding='utf-8')
-        print(f"Readable-format database saved to: {output_path}")
-
     def get_cache_info(self) -> Dict:
         """
         Get cache status information
@@ -580,5 +506,10 @@ class KnowledgeBaseVectorizer:
             'is_cached': self._cached_df is not None,
             'cache_size': len(self._cached_df) if self._cached_df is not None else 0,
             'cached_embeddings': list(self._cached_embeddings.keys()),
-            'last_load_time': self._last_load_time.isoformat() if self._last_load_time else None
-        }
+            'last_load_time': self._last_load_time.isoformat() if self._last_load_time else None,
+            'data_paths': {
+                'knowledge_base': self.data_path,
+                'vector_database': self.vector_db_path,
+                'metadata': self.metadata_path
+            }
+        }