VietCat committed on
Commit
86342e6
·
1 Parent(s): 2630d5b

add data viewer

Browse files
Files changed (2) hide show
  1. app/law_document_chunker.py +18 -3
  2. app/supabase_db.py +8 -19
app/law_document_chunker.py CHANGED
@@ -286,7 +286,17 @@ class LawDocumentChunker:
286
  chunk_stack.append((metadata.id, current_level, current_level_value))
287
  logger.debug(f"[CHUNKER] Created final chunk: {metadata.id[:8]}... Level: {current_level}, Parent: {current_parent}")
288
 
289
- logger.info(f"[CHUNKER] Created {len(chunks)} chunks from document")
 
 
 
 
 
 
 
 
 
 
290
  return chunks
291
 
292
  def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str]]],
@@ -308,6 +318,8 @@ class LawDocumentChunker:
308
  logger.info(f"[CHUNKER] Creating embeddings and storing {len(chunks)} chunks")
309
 
310
  success_count = 0
 
 
311
  for i, chunk in enumerate(chunks, 1):
312
  try:
313
  # Tạo embedding - TẠM THỜI COMMENT LẠI ĐỂ TEST
@@ -333,15 +345,18 @@ class LawDocumentChunker:
333
  success = self.supabase_client.store_document_chunk(chunk_dict)
334
  if success:
335
  success_count += 1
336
- logger.info(f"[CHUNKER] Stored chunk {i}/{len(chunks)}: {chunk.id[:8]}...")
 
337
  else:
 
338
  logger.error(f"[CHUNKER] Failed to store chunk {chunk.id}")
339
 
340
  except Exception as e:
 
341
  logger.error(f"[CHUNKER] Error processing chunk {chunk.id}: {e}")
342
  continue
343
 
344
- logger.info(f"[CHUNKER] Successfully processed {success_count}/{len(chunks)} chunks")
345
  return success_count
346
 
347
  async def _store_chunks_to_supabase(self, chunk_data: List[Dict]) -> bool:
 
286
  chunk_stack.append((metadata.id, current_level, current_level_value))
287
  logger.debug(f"[CHUNKER] Created final chunk: {metadata.id[:8]}... Level: {current_level}, Parent: {current_parent}")
288
 
289
+ # Debug: Kiểm tra kết quả
290
+ root_count = sum(1 for chunk in chunks if chunk.cha is None)
291
+ logger.info(f"[CHUNKER] Created {len(chunks)} chunks, {root_count} root chunks")
292
+
293
+ # Debug: Log chi tiết từng chunk
294
+ for i, chunk in enumerate(chunks[:10]): # Log 10 chunks đầu tiên
295
+ logger.debug(f"[CHUNKER] Chunk {i+1}: {chunk.content[:100]}... -> Parent: {chunk.cha}")
296
+
297
+ if len(chunks) > 10:
298
+ logger.debug(f"[CHUNKER] ... and {len(chunks) - 10} more chunks")
299
+
300
  return chunks
301
 
302
  def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str]]],
 
318
  logger.info(f"[CHUNKER] Creating embeddings and storing {len(chunks)} chunks")
319
 
320
  success_count = 0
321
+ failed_count = 0
322
+
323
  for i, chunk in enumerate(chunks, 1):
324
  try:
325
  # Tạo embedding - TẠM THỜI COMMENT LẠI ĐỂ TEST
 
345
  success = self.supabase_client.store_document_chunk(chunk_dict)
346
  if success:
347
  success_count += 1
348
+ if i % 100 == 0: # Log mỗi 100 chunks
349
+ logger.info(f"[CHUNKER] Stored chunk {i}/{len(chunks)}: {chunk.id[:8]}...")
350
  else:
351
+ failed_count += 1
352
  logger.error(f"[CHUNKER] Failed to store chunk {chunk.id}")
353
 
354
  except Exception as e:
355
+ failed_count += 1
356
  logger.error(f"[CHUNKER] Error processing chunk {chunk.id}: {e}")
357
  continue
358
 
359
+ logger.info(f"[CHUNKER] Successfully processed {success_count}/{len(chunks)} chunks, {failed_count} failed")
360
  return success_count
361
 
362
  async def _store_chunks_to_supabase(self, chunk_data: List[Dict]) -> bool:
app/supabase_db.py CHANGED
@@ -192,26 +192,15 @@ class SupabaseClient:
192
  try:
193
  logger.info("[SUPABASE] Fetching all document chunks")
194
 
195
- all_chunks = []
196
- page_size = 1000
197
- offset = 0
198
 
199
- while True:
200
- # Lấy từng page
201
- response = self.client.table('document_chunks').select('*').range(offset, offset + page_size - 1).execute()
202
-
203
- if not response.data:
204
- break
205
-
206
- all_chunks.extend(response.data)
207
- offset += page_size
208
-
209
- # Nếu số records ít hơn page_size, đã hết
210
- if len(response.data) < page_size:
211
- break
212
-
213
- logger.info(f"[SUPABASE] Successfully fetched {len(all_chunks)} document chunks")
214
- return all_chunks
215
 
216
  except Exception as e:
217
  logger.error(f"[SUPABASE] Error fetching document chunks: {e}")
 
192
  try:
193
  logger.info("[SUPABASE] Fetching all document chunks")
194
 
195
+ # Thử lấy tất cả records một lần
196
+ response = self.client.table('document_chunks').select('*').execute()
 
197
 
198
+ if response.data:
199
+ logger.info(f"[SUPABASE] Successfully fetched {len(response.data)} document chunks")
200
+ return response.data
201
+ else:
202
+ logger.warning("[SUPABASE] No document chunks found")
203
+ return []
 
 
 
 
 
 
 
 
 
 
204
 
205
  except Exception as e:
206
  logger.error(f"[SUPABASE] Error fetching document chunks: {e}")