add data viewer
Browse files- app/law_document_chunker.py +18 -3
- app/supabase_db.py +8 -19
app/law_document_chunker.py
CHANGED
|
@@ -286,7 +286,17 @@ class LawDocumentChunker:
|
|
| 286 |
chunk_stack.append((metadata.id, current_level, current_level_value))
|
| 287 |
logger.debug(f"[CHUNKER] Created final chunk: {metadata.id[:8]}... Level: {current_level}, Parent: {current_parent}")
|
| 288 |
|
| 289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
return chunks
|
| 291 |
|
| 292 |
def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str]]],
|
|
@@ -308,6 +318,8 @@ class LawDocumentChunker:
|
|
| 308 |
logger.info(f"[CHUNKER] Creating embeddings and storing {len(chunks)} chunks")
|
| 309 |
|
| 310 |
success_count = 0
|
|
|
|
|
|
|
| 311 |
for i, chunk in enumerate(chunks, 1):
|
| 312 |
try:
|
| 313 |
# Tạo embedding - TẠM THỜI COMMENT LẠI ĐỂ TEST
|
|
@@ -333,15 +345,18 @@ class LawDocumentChunker:
|
|
| 333 |
success = self.supabase_client.store_document_chunk(chunk_dict)
|
| 334 |
if success:
|
| 335 |
success_count += 1
|
| 336 |
-
|
|
|
|
| 337 |
else:
|
|
|
|
| 338 |
logger.error(f"[CHUNKER] Failed to store chunk {chunk.id}")
|
| 339 |
|
| 340 |
except Exception as e:
|
|
|
|
| 341 |
logger.error(f"[CHUNKER] Error processing chunk {chunk.id}: {e}")
|
| 342 |
continue
|
| 343 |
|
| 344 |
-
logger.info(f"[CHUNKER] Successfully processed {success_count}/{len(chunks)} chunks")
|
| 345 |
return success_count
|
| 346 |
|
| 347 |
async def _store_chunks_to_supabase(self, chunk_data: List[Dict]) -> bool:
|
|
|
|
| 286 |
chunk_stack.append((metadata.id, current_level, current_level_value))
|
| 287 |
logger.debug(f"[CHUNKER] Created final chunk: {metadata.id[:8]}... Level: {current_level}, Parent: {current_parent}")
|
| 288 |
|
| 289 |
+
# Debug: Kiểm tra kết quả
|
| 290 |
+
root_count = sum(1 for chunk in chunks if chunk.cha is None)
|
| 291 |
+
logger.info(f"[CHUNKER] Created {len(chunks)} chunks, {root_count} root chunks")
|
| 292 |
+
|
| 293 |
+
# Debug: Log chi tiết từng chunk
|
| 294 |
+
for i, chunk in enumerate(chunks[:10]): # Log 10 chunks đầu tiên
|
| 295 |
+
logger.debug(f"[CHUNKER] Chunk {i+1}: {chunk.content[:100]}... -> Parent: {chunk.cha}")
|
| 296 |
+
|
| 297 |
+
if len(chunks) > 10:
|
| 298 |
+
logger.debug(f"[CHUNKER] ... and {len(chunks) - 10} more chunks")
|
| 299 |
+
|
| 300 |
return chunks
|
| 301 |
|
| 302 |
def _find_parent_for_level(self, chunk_stack: List[Tuple[str, str, Optional[str]]],
|
|
|
|
| 318 |
logger.info(f"[CHUNKER] Creating embeddings and storing {len(chunks)} chunks")
|
| 319 |
|
| 320 |
success_count = 0
|
| 321 |
+
failed_count = 0
|
| 322 |
+
|
| 323 |
for i, chunk in enumerate(chunks, 1):
|
| 324 |
try:
|
| 325 |
# Tạo embedding - TẠM THỜI COMMENT LẠI ĐỂ TEST
|
|
|
|
| 345 |
success = self.supabase_client.store_document_chunk(chunk_dict)
|
| 346 |
if success:
|
| 347 |
success_count += 1
|
| 348 |
+
if i % 100 == 0: # Log mỗi 100 chunks
|
| 349 |
+
logger.info(f"[CHUNKER] Stored chunk {i}/{len(chunks)}: {chunk.id[:8]}...")
|
| 350 |
else:
|
| 351 |
+
failed_count += 1
|
| 352 |
logger.error(f"[CHUNKER] Failed to store chunk {chunk.id}")
|
| 353 |
|
| 354 |
except Exception as e:
|
| 355 |
+
failed_count += 1
|
| 356 |
logger.error(f"[CHUNKER] Error processing chunk {chunk.id}: {e}")
|
| 357 |
continue
|
| 358 |
|
| 359 |
+
logger.info(f"[CHUNKER] Successfully processed {success_count}/{len(chunks)} chunks, {failed_count} failed")
|
| 360 |
return success_count
|
| 361 |
|
| 362 |
async def _store_chunks_to_supabase(self, chunk_data: List[Dict]) -> bool:
|
app/supabase_db.py
CHANGED
|
@@ -192,26 +192,15 @@ class SupabaseClient:
|
|
| 192 |
try:
|
| 193 |
logger.info("[SUPABASE] Fetching all document chunks")
|
| 194 |
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
offset = 0
|
| 198 |
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
response
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
all_chunks.extend(response.data)
|
| 207 |
-
offset += page_size
|
| 208 |
-
|
| 209 |
-
# Nếu số records ít hơn page_size, đã hết
|
| 210 |
-
if len(response.data) < page_size:
|
| 211 |
-
break
|
| 212 |
-
|
| 213 |
-
logger.info(f"[SUPABASE] Successfully fetched {len(all_chunks)} document chunks")
|
| 214 |
-
return all_chunks
|
| 215 |
|
| 216 |
except Exception as e:
|
| 217 |
logger.error(f"[SUPABASE] Error fetching document chunks: {e}")
|
|
|
|
| 192 |
try:
|
| 193 |
logger.info("[SUPABASE] Fetching all document chunks")
|
| 194 |
|
| 195 |
+
# Thử lấy tất cả records một lần
|
| 196 |
+
response = self.client.table('document_chunks').select('*').execute()
|
|
|
|
| 197 |
|
| 198 |
+
if response.data:
|
| 199 |
+
logger.info(f"[SUPABASE] Successfully fetched {len(response.data)} document chunks")
|
| 200 |
+
return response.data
|
| 201 |
+
else:
|
| 202 |
+
logger.warning("[SUPABASE] No document chunks found")
|
| 203 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
except Exception as e:
|
| 206 |
logger.error(f"[SUPABASE] Error fetching document chunks: {e}")
|