VietCat committed on
Commit
880062b
·
1 Parent(s): c3199eb

add chunker

Browse files
Files changed (2) hide show
  1. app/law_document_chunker.py +22 -38
  2. app/supabase_db.py +37 -4
app/law_document_chunker.py CHANGED
@@ -261,12 +261,12 @@ class LawDocumentChunker:
261
  logger.info(f"[CHUNKER] Created {len(chunks)} chunks from document")
262
  return chunks
263
 
264
- async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> List[Dict]:
265
- """Tạo embeddings cho các chunks."""
266
- logger.info(f"[CHUNKER] Creating embeddings for {len(chunks)} chunks")
267
 
268
- chunk_data = []
269
- for chunk in chunks:
270
  try:
271
  # Tạo embedding
272
  embedding = await self.embedding_client.create_embedding(chunk.content)
@@ -285,35 +285,26 @@ class LawDocumentChunker:
285
  'sub_clause_letter': chunk.sub_clause_letter,
286
  'context_summary': chunk.context_summary
287
  }
288
- chunk_data.append(chunk_dict)
289
 
290
- logger.debug(f"[CHUNKER] Created embedding for chunk {chunk.id[:8]}...")
 
 
 
 
 
 
291
 
292
  except Exception as e:
293
- logger.error(f"[CHUNKER] Error creating embedding for chunk {chunk.id}: {e}")
294
  continue
295
 
296
- logger.info(f"[CHUNKER] Successfully created embeddings for {len(chunk_data)} chunks")
297
- return chunk_data
298
 
299
  async def _store_chunks_to_supabase(self, chunk_data: List[Dict]) -> bool:
300
- """Lưu chunks vào Supabase."""
301
- try:
302
- logger.info(f"[CHUNKER] Storing {len(chunk_data)} chunks to Supabase")
303
-
304
- # Lưu từng chunk
305
- for chunk in chunk_data:
306
- success = self.supabase_client.store_document_chunk(chunk)
307
- if not success:
308
- logger.error(f"[CHUNKER] Failed to store chunk {chunk['id']}")
309
- return False
310
-
311
- logger.info(f"[CHUNKER] Successfully stored all chunks to Supabase")
312
- return True
313
-
314
- except Exception as e:
315
- logger.error(f"[CHUNKER] Error storing chunks to Supabase: {e}")
316
- return False
317
 
318
  async def process_law_document(self, file_path: str, document_id: int) -> bool:
319
  """
@@ -351,21 +342,14 @@ class LawDocumentChunker:
351
  return False
352
 
353
  # 6. Tạo embeddings
354
- chunk_data = await self._create_embeddings_for_chunks(chunks)
355
 
356
- if not chunk_data:
357
  logger.error(f"[CHUNKER] No embeddings created for document {document_id}")
358
  return False
359
 
360
- # 7. Lưu vào Supabase
361
- success = await self._store_chunks_to_supabase(chunk_data)
362
-
363
- if success:
364
- logger.info(f"[CHUNKER] Successfully processed document {document_id} with {len(chunk_data)} chunks")
365
- else:
366
- logger.error(f"[CHUNKER] Failed to store chunks for document {document_id}")
367
-
368
- return success
369
 
370
  except Exception as e:
371
  logger.error(f"[CHUNKER] Error processing document {document_id}: {e}")
 
261
  logger.info(f"[CHUNKER] Created {len(chunks)} chunks from document")
262
  return chunks
263
 
264
+ async def _create_embeddings_for_chunks(self, chunks: List[ChunkMetadata]) -> int:
265
+ """Tạo embeddings cho các chunks và lưu ngay lập tức vào Supabase."""
266
+ logger.info(f"[CHUNKER] Creating embeddings and storing {len(chunks)} chunks")
267
 
268
+ success_count = 0
269
+ for i, chunk in enumerate(chunks, 1):
270
  try:
271
  # Tạo embedding
272
  embedding = await self.embedding_client.create_embedding(chunk.content)
 
285
  'sub_clause_letter': chunk.sub_clause_letter,
286
  'context_summary': chunk.context_summary
287
  }
 
288
 
289
+ # Lưu ngay lập tức vào Supabase
290
+ success = self.supabase_client.store_document_chunk(chunk_dict)
291
+ if success:
292
+ success_count += 1
293
+ logger.info(f"[CHUNKER] Stored chunk {i}/{len(chunks)}: {chunk.id[:8]}...")
294
+ else:
295
+ logger.error(f"[CHUNKER] Failed to store chunk {chunk.id}")
296
 
297
  except Exception as e:
298
+ logger.error(f"[CHUNKER] Error processing chunk {chunk.id}: {e}")
299
  continue
300
 
301
+ logger.info(f"[CHUNKER] Successfully processed {success_count}/{len(chunks)} chunks")
302
+ return success_count
303
 
304
  async def _store_chunks_to_supabase(self, chunk_data: List[Dict]) -> bool:
305
+ """Legacy method - không còn sử dụng."""
306
+ logger.warning("[CHUNKER] _store_chunks_to_supabase is deprecated, use _create_embeddings_for_chunks instead")
307
+ return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
  async def process_law_document(self, file_path: str, document_id: int) -> bool:
310
  """
 
342
  return False
343
 
344
  # 6. Tạo embeddings
345
+ success_count = await self._create_embeddings_for_chunks(chunks)
346
 
347
+ if success_count == 0:
348
  logger.error(f"[CHUNKER] No embeddings created for document {document_id}")
349
  return False
350
 
351
+ logger.info(f"[CHUNKER] Successfully processed document {document_id} with {success_count} chunks")
352
+ return True
 
 
 
 
 
 
 
353
 
354
  except Exception as e:
355
  logger.error(f"[CHUNKER] Error processing document {document_id}: {e}")
app/supabase_db.py CHANGED
@@ -86,13 +86,45 @@ class SupabaseClient:
86
  Output: bool (True nếu thành công, False nếu lỗi)
87
  """
88
  try:
89
- response = self.client.table('document_chunks').insert(chunk_data).execute()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  if response.data:
92
- logger.info(f"Successfully stored chunk {chunk_data.get('id', 'unknown')}")
93
  return True
94
  else:
95
- logger.error(f"Failed to store chunk {chunk_data.get('id', 'unknown')}")
96
  return False
97
 
98
  except Exception as e:
@@ -106,7 +138,8 @@ class SupabaseClient:
106
  Output: bool (True nếu thành công, False nếu lỗi)
107
  """
108
  try:
109
- response = self.client.table('document_chunks').delete().neq('id', '').execute()
 
110
  logger.info(f"Successfully deleted all document chunks")
111
  return True
112
  except Exception as e:
 
86
  Output: bool (True nếu thành công, False nếu lỗi)
87
  """
88
  try:
89
+ # Xử lý các giá trị null/empty cho integer fields
90
+ processed_data = chunk_data.copy()
91
+
92
+ # Xử lý article_number - chỉ gửi nếu có giá trị hợp lệ
93
+ if 'article_number' in processed_data:
94
+ if processed_data['article_number'] is None or processed_data['article_number'] == "":
95
+ processed_data['article_number'] = None
96
+ elif isinstance(processed_data['article_number'], str):
97
+ try:
98
+ processed_data['article_number'] = int(processed_data['article_number'])
99
+ except (ValueError, TypeError):
100
+ processed_data['article_number'] = None
101
+
102
+ # Xử lý vanbanid - đảm bảo là integer
103
+ if 'vanbanid' in processed_data:
104
+ if isinstance(processed_data['vanbanid'], str):
105
+ try:
106
+ processed_data['vanbanid'] = int(processed_data['vanbanid'])
107
+ except (ValueError, TypeError):
108
+ logger.error(f"Invalid vanbanid: {processed_data['vanbanid']}")
109
+ return False
110
+
111
+ # Xử lý các trường text - chuyển empty string thành None
112
+ text_fields = ['document_title', 'article_title', 'clause_number', 'sub_clause_letter', 'context_summary']
113
+ for field in text_fields:
114
+ if field in processed_data and processed_data[field] == "":
115
+ processed_data[field] = None
116
+
117
+ # Xử lý cha field - chuyển empty string thành None
118
+ if 'cha' in processed_data and processed_data['cha'] == "":
119
+ processed_data['cha'] = None
120
+
121
+ response = self.client.table('document_chunks').insert(processed_data).execute()
122
 
123
  if response.data:
124
+ logger.info(f"Successfully stored chunk {processed_data.get('id', 'unknown')}")
125
  return True
126
  else:
127
+ logger.error(f"Failed to store chunk {processed_data.get('id', 'unknown')}")
128
  return False
129
 
130
  except Exception as e:
 
138
  Output: bool (True nếu thành công, False nếu lỗi)
139
  """
140
  try:
141
+ # Xóa tất cả records trong bảng
142
+ response = self.client.table('document_chunks').delete().execute()
143
  logger.info(f"Successfully deleted all document chunks")
144
  return True
145
  except Exception as e: