Fix table queries
Files changed:

- core/embeddings/chunk.py (+120 -6)
- core/embeddings/retrival.py (+18 -3)
- core/embeddings/vector_store.py (+103 -11)
- scripts/test_single_file.py (+94 -0)
- test/chunk_results.md (+0 -0)
- test/test_small_to_big.py (+191 -0)
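This commit switches table handling to a Small-to-Big pattern. In core/embeddings/chunk.py, each sufficiently large markdown table now produces two nodes: a parent node holding the raw table (stored but never embedded) and a summary node holding a short LLM-generated Vietnamese summary (embedded for search); the prompt asks, in Vietnamese, for a 2-4 sentence summary of what the table regulates, its main columns, and notable figures. core/embeddings/vector_store.py persists the parents to a parent_nodes.json file next to the ChromaDB data, and core/embeddings/retrival.py swaps a summary hit back to its raw parent table at query time. A minimal sketch of the pattern, with hypothetical in-memory stand-ins for the repo's real embedder and Chroma store (only the control flow mirrors the diff below):

```python
parent_store: dict = {}  # parent_id -> raw table text (big chunk, never embedded)
small_index: list = []   # (text to embed, metadata) pairs (small chunks)

def ingest_table(table_text: str, summary: str) -> None:
    parent_id = f"tbl-{len(parent_store)}"
    parent_store[parent_id] = table_text  # big: stored only, skipped at embed time
    small_index.append((summary, {"is_table_summary": True, "parent_id": parent_id}))

def resolve_hit(text: str, meta: dict) -> str:
    # At query time a summary hit is swapped for its raw parent table.
    if meta.get("is_table_summary") and meta.get("parent_id") in parent_store:
        return parent_store[meta["parent_id"]]
    return text

ingest_table("| Skill | Score |\n| Listening | 350 |", "TOEIC listening score conversion table.")
text, meta = small_index[0]
print(resolve_hit(text, meta))  # prints the raw table, not the summary
```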
core/embeddings/chunk.py
CHANGED

@@ -1,8 +1,11 @@
 from __future__ import annotations
+import os
 import re
+import uuid
 from pathlib import Path
-from typing import List, Tuple, Dict, Any
+from typing import List, Tuple, Dict, Any, Optional
 import yaml
+from openai import OpenAI
 from llama_index.core import Document
 from llama_index.core.node_parser import MarkdownNodeParser, SentenceSplitter
 from llama_index.core.schema import BaseNode, TextNode

@@ -13,6 +16,12 @@ CHUNK_OVERLAP = 150
 MIN_CHUNK_SIZE = 200
 TABLE_ROWS_PER_CHUNK = 15
 
+# Small-to-Big Config
+ENABLE_TABLE_SUMMARY = True
+MIN_TABLE_ROWS_FOR_SUMMARY = 5  # Only summarize tables with >= 5 rows
+SUMMARY_MODEL = "nex-agi/DeepSeek-V3.1-Nex-N1"
+SILICONFLOW_BASE_URL = "https://api.siliconflow.com/v1"
+
 # Regex
 COURSE_PATTERN = re.compile(r"Học\s*phần\s+(.+?)\s*\(\s*m[ãa]\s+([^\)]+)\)", re.I | re.DOTALL)
 TABLE_PLACEHOLDER = re.compile(r"__TBL_(\d+)__")

@@ -94,6 +103,101 @@ def _split_table(header: str, rows: List[str], max_rows: int = TABLE_ROWS_PER_CHUNK):
 
     return [header + "\n".join(r) for r in chunks]
 
+_summary_client: Optional[OpenAI] = None
+
+def _get_summary_client() -> Optional[OpenAI]:
+    global _summary_client
+    if _summary_client is not None:
+        return _summary_client
+
+    api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
+    if not api_key:
+        print("SILICONFLOW_API_KEY not set. Table summarization disabled.")
+        return None
+
+    _summary_client = OpenAI(api_key=api_key, base_url=SILICONFLOW_BASE_URL)
+    return _summary_client
+
+
+def _summarize_table(table_text: str, context_hint: str = "") -> Optional[str]:
+    if not ENABLE_TABLE_SUMMARY:
+        return None
+
+    client = _get_summary_client()
+    if client is None:
+        return None
+
+    prompt = f"""Tóm tắt ngắn gọn nội dung bảng sau trong 2-4 câu bằng tiếng Việt.
+Ghi rõ:
+- Bảng này liệt kê/quy định về cái gì
+- Các cột chính trong bảng
+- Thông tin quan trọng (nếu có số liệu cụ thể thì nêu ví dụ)
+
+{f"Ngữ cảnh: {context_hint}" if context_hint else ""}
+
+Bảng:
+{table_text[:3000]}
+"""
+
+    try:
+        response = client.chat.completions.create(
+            model=SUMMARY_MODEL,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.3,
+            max_tokens=1000,
+        )
+        summary = response.choices[0].message.content or ""
+        return summary.strip() if summary.strip() else None
+    except Exception as e:
+        print(f" Table summarization failed: {e}")
+        return None
+
+
+def _create_table_nodes(
+    table_text: str,
+    metadata: dict,
+    context_hint: str = ""
+) -> List[TextNode]:
+    # Count rows to decide if we should summarize
+    row_count = table_text.count("\n")
+
+    if row_count < MIN_TABLE_ROWS_FOR_SUMMARY:
+        # Table too small, just return as-is
+        return [TextNode(text=table_text, metadata={**metadata, "is_table": True})]
+
+    # Try to generate summary
+    summary = _summarize_table(table_text, context_hint)
+
+    if summary is None:
+        # Summarization failed, return raw table
+        return [TextNode(text=table_text, metadata={**metadata, "is_table": True})]
+
+    # Create parent node (raw table - will NOT be embedded)
+    parent_id = str(uuid.uuid4())
+    parent_node = TextNode(
+        text=table_text,
+        metadata={
+            **metadata,
+            "is_table": True,
+            "is_parent": True,  # Flag to skip embedding
+            "node_id": parent_id,
+        }
+    )
+    parent_node.id_ = parent_id
+
+    # Create summary node (will be embedded for search)
+    summary_node = TextNode(
+        text=summary,
+        metadata={
+            **metadata,
+            "is_table_summary": True,
+            "parent_id": parent_id,  # Link to parent
+        }
+    )
+
+    print(f"Created summary for table ({row_count} rows)")
+    return [parent_node, summary_node]
+
 
 def _enrich_metadata(node: BaseNode, source_path: Path | None) -> None:
     if source_path:

@@ -158,13 +262,23 @@ def chunk_markdown(text: str, source_path: str | Path | None = None) -> List[BaseNode]
         if (before := content[last_end:match.start()].strip()) and len(before) >= MIN_CHUNK_SIZE:
             nodes.extend(_chunk_text(before, meta) if len(before) > CHUNK_SIZE else [TextNode(text=before, metadata=meta.copy())])
 
-        # Table chunks
+        # Table chunks - using Small-to-Big pattern
         if (idx := int(match.group(1))) < len(tables):
             header, rows = tables[idx]
-            … (four removed lines, not rendered in this diff view)
+            table_chunks = _split_table(header, rows)
+
+            # Get context hint from header path
+            context_hint = meta.get("Header 1", "") or meta.get("section", "")
+
+            for i, chunk in enumerate(table_chunks):
+                chunk_meta = {**meta}
+                if len(table_chunks) > 1:
+                    chunk_meta["table_part"] = f"{i+1}/{len(table_chunks)}"
+
+                # Create parent + summary nodes if applicable
+                table_nodes = _create_table_nodes(chunk, chunk_meta, context_hint)
+                nodes.extend(table_nodes)
+
         last_end = match.end()
 
         # Text after table
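A quick way to sanity-check the new chunker output is to verify the parent/summary links, roughly as below (the input path is hypothetical; `chunk_markdown_file` is the entry point the test scripts in this commit use):

```python
from core.embeddings.chunk import chunk_markdown_file

nodes = chunk_markdown_file("data/some_doc.md")  # hypothetical input path
parents = {n.id_ for n in nodes if n.metadata.get("is_parent")}
summaries = [n for n in nodes if n.metadata.get("is_table_summary")]

# Every summary node should link back to a stored raw-table parent.
for s in summaries:
    assert s.metadata["parent_id"] in parents
```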
core/embeddings/retrival.py
CHANGED

@@ -174,10 +174,25 @@ class Retriever:
         return self._reranker is not None
 
     def _to_result(self, doc: Document, rank: int, **extra) -> Dict[str, Any]:
+        metadata = doc.metadata or {}
+        content = doc.page_content
+
+        # Small-to-Big: If this is a summary node, swap with parent (raw table)
+        if metadata.get("is_table_summary") and metadata.get("parent_id"):
+            parent = self._vector_db.get_parent_node(metadata["parent_id"])
+            if parent:
+                content = parent.get("content", content)
+                # Merge metadata, keeping summary info for debugging
+                metadata = {
+                    **parent.get("metadata", {}),
+                    "original_summary": doc.page_content[:200],
+                    "swapped_from_summary": True,
+                }
+
         return {
-            "id": …
-            "content": …
-            "metadata": …
+            "id": metadata.get("id"),
+            "content": content,
+            "metadata": metadata,
             "final_rank": rank,
             **extra,
         }
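For reference, a swapped result from `_to_result` would look roughly like this (values are invented; the keys are the ones the code above sets):

```python
result = {
    "id": None,  # parent metadata may not carry an "id"
    "content": "| Kỹ năng | Điểm | Bậc |\n| Nghe | 350 | 3 |",  # raw parent table
    "metadata": {
        "is_table": True,
        "is_parent": True,
        "original_summary": "Bảng quy đổi điểm TOEIC sang bậc năng lực ...",
        "swapped_from_summary": True,
    },
    "final_rank": 1,
}
```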
core/embeddings/vector_store.py
CHANGED

@@ -30,6 +30,11 @@ class ChromaVectorDB:
         self.embedder = embedder
         self.config = config or ChromaConfig()
         self._hasher = HashProcessor(verbose=False)
+
+        # Storage for parent nodes (not embedded, used for Small-to-Big retrieval)
+        # Persist to JSON file in same directory as ChromaDB
+        self._parent_nodes_path = Path(self.config.persist_dir) / "parent_nodes.json"
+        self._parent_nodes: Dict[str, Dict[str, Any]] = self._load_parent_nodes()
 
         self._vs = Chroma(
             collection_name=self.config.collection_name,

@@ -37,6 +42,28 @@
             persist_directory=self.config.persist_dir,
         )
         logger.info(f"ChromaVectorDB initialized: {self.config.collection_name}")
+
+    def _load_parent_nodes(self) -> Dict[str, Dict[str, Any]]:
+        """Load parent nodes from JSON file if exists."""
+        if self._parent_nodes_path.exists():
+            try:
+                with open(self._parent_nodes_path, 'r', encoding='utf-8') as f:
+                    data = json.load(f)
+                logger.info(f"Loaded {len(data)} parent nodes from {self._parent_nodes_path}")
+                return data
+            except Exception as e:
+                logger.warning(f"Failed to load parent nodes: {e}")
+        return {}
+
+    def _save_parent_nodes(self) -> None:
+        """Save parent nodes to JSON file."""
+        try:
+            self._parent_nodes_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(self._parent_nodes_path, 'w', encoding='utf-8') as f:
+                json.dump(self._parent_nodes, f, ensure_ascii=False, indent=2)
+            logger.info(f"Saved {len(self._parent_nodes)} parent nodes to {self._parent_nodes_path}")
+        except Exception as e:
+            logger.warning(f"Failed to save parent nodes: {e}")
 
     @property
     def collection(self):

@@ -113,13 +140,42 @@
         if ids is not None and len(ids) != len(docs):
             raise ValueError("ids length must match docs length")
 
-
+        # Separate parent nodes (not embedded) from regular nodes
+        regular_docs = []
+        regular_ids = []
+        parent_count = 0
+
+        for i, d in enumerate(docs):
+            normalized = self._normalize_doc(d)
+            md = normalized.get("metadata", {}) or {}
+            doc_id = ids[i] if ids else self._doc_id(d)
+
+            if md.get("is_parent"):
+                # Store parent node separately (for Small-to-Big retrieval)
+                parent_id = md.get("node_id", doc_id)
+                self._parent_nodes[parent_id] = {
+                    "id": parent_id,
+                    "content": normalized.get("content", ""),
+                    "metadata": md,
+                }
+                parent_count += 1
+            else:
+                regular_docs.append(d)
+                regular_ids.append(doc_id)
+
+        if parent_count > 0:
+            logger.info(f"Stored {parent_count} parent nodes (not embedded)")
+            self._save_parent_nodes()  # Persist to disk
+
+        if not regular_docs:
+            return parent_count
+
         bs = max(1, batch_size)
         total = 0
 
-        for start in range(0, len(…
-            batch = …
-            batch_ids = …
+        for start in range(0, len(regular_docs), bs):
+            batch = regular_docs[start : start + bs]
+            batch_ids = regular_ids[start : start + bs]
             lc_docs = self._to_documents(batch, batch_ids)
 
             try:

@@ -131,7 +187,7 @@
             total += len(batch)
 
         logger.info(f"Added {total} documents to vector store")
-        return total
+        return total + parent_count
 
     def upsert_documents(
         self,

@@ -146,17 +202,46 @@
         if ids is not None and len(ids) != len(docs):
             raise ValueError("ids length must match docs length")
 
-
+        # Separate parent nodes (not embedded) from regular nodes
+        regular_docs = []
+        regular_ids = []
+        parent_count = 0
+
+        for i, d in enumerate(docs):
+            normalized = self._normalize_doc(d)
+            md = normalized.get("metadata", {}) or {}
+            doc_id = ids[i] if ids else self._doc_id(d)
+
+            if md.get("is_parent"):
+                # Store parent node separately (for Small-to-Big retrieval)
+                parent_id = md.get("node_id", doc_id)
+                self._parent_nodes[parent_id] = {
+                    "id": parent_id,
+                    "content": normalized.get("content", ""),
+                    "metadata": md,
+                }
+                parent_count += 1
+            else:
+                regular_docs.append(d)
+                regular_ids.append(doc_id)
+
+        if parent_count > 0:
+            logger.info(f"Stored {parent_count} parent nodes (not embedded)")
+            self._save_parent_nodes()  # Persist to disk
+
+        if not regular_docs:
+            return parent_count
+
         bs = max(1, batch_size)
         col = self.collection
 
         if col is None:
-            return self.add_documents(…
+            return self.add_documents(regular_docs, ids=regular_ids, batch_size=bs) + parent_count
 
         total = 0
-        for start in range(0, len(…
-            batch = …
-            batch_ids = …
+        for start in range(0, len(regular_docs), bs):
+            batch = regular_docs[start : start + bs]
+            batch_ids = regular_ids[start : start + bs]
             lc_docs = self._to_documents(batch, batch_ids)
             texts = [d.page_content for d in lc_docs]
             metas = [d.metadata for d in lc_docs]

@@ -165,7 +250,7 @@
             total += len(batch)
 
         logger.info(f"Upserted {total} documents to vector store")
-        return total
+        return total + parent_count
 
     def count(self) -> int:
         col = self.collection

@@ -198,3 +283,10 @@
         col.delete(ids=list(ids))
         logger.info(f"Deleted {len(ids)} documents from vector store")
         return len(ids)
+
+    def get_parent_node(self, parent_id: str) -> Optional[Dict[str, Any]]:
+        return self._parent_nodes.get(parent_id)
+
+    @property
+    def parent_nodes(self) -> Dict[str, Dict[str, Any]]:
+        return self._parent_nodes
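The parent store is a plain JSON mapping persisted next to the Chroma data. A standalone sketch of the round-trip that `_save_parent_nodes` and `_load_parent_nodes` rely on (path and ID are hypothetical):

```python
import json
from pathlib import Path

path = Path("data/chroma_test/parent_nodes.json")  # hypothetical location
parent_nodes = {
    "tbl-0": {"id": "tbl-0", "content": "| ... |", "metadata": {"is_parent": True}},
}

# Save (mirrors _save_parent_nodes): UTF-8 JSON, non-ASCII kept readable
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(parent_nodes, ensure_ascii=False, indent=2), encoding="utf-8")

# Load (mirrors _load_parent_nodes)
loaded = json.loads(path.read_text(encoding="utf-8"))
assert loaded == parent_nodes
```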
scripts/test_single_file.py
ADDED

@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+"""Test Small-to-Big with a single file."""
+import sys
+from pathlib import Path
+from dotenv import find_dotenv, load_dotenv
+
+load_dotenv(find_dotenv(usecwd=True))
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+from core.embeddings.chunk import chunk_markdown_file
+from core.embeddings.embedding_model import EmbeddingConfig, QwenEmbeddings
+from core.embeddings.vector_store import ChromaConfig, ChromaVectorDB
+
+# Test with a single file that contains many tables
+TEST_FILE = REPO_ROOT / "data/data_process/quyet_dinh/tieng_anh/06_ Quy định ngoại ngữ từ K70_chính quy_final.md"
+
+
+def main():
+    print("=" * 60)
+    print("TEST SMALL-TO-BIG (1 file)")
+    print("=" * 60)
+
+    # 1. Chunk file
+    print(f"\n[1/4] Chunking: {TEST_FILE.name}")
+    nodes = chunk_markdown_file(TEST_FILE)
+
+    parent_nodes = [n for n in nodes if n.metadata.get("is_parent")]
+    summary_nodes = [n for n in nodes if n.metadata.get("is_table_summary")]
+    other_nodes = [n for n in nodes if not n.metadata.get("is_parent")]
+
+    print(f" Total nodes: {len(nodes)}")
+    print(f" - Parent nodes (NOT embedded): {len(parent_nodes)}")
+    print(f" - Summary nodes: {len(summary_nodes)}")
+    print(f" - Other nodes (text + small tables): {len(other_nodes) - len(summary_nodes)}")
+
+    # 2. Init DB (with a temporary persist_dir)
+    print("\n[2/4] Initializing test DB...")
+    emb_cfg = EmbeddingConfig()
+    emb = QwenEmbeddings(emb_cfg)
+
+    # Use a temporary folder so the main DB is not affected
+    test_persist = str(REPO_ROOT / "data" / "chroma_test")
+    db_cfg = ChromaConfig(persist_dir=test_persist, collection_name="test_s2b")
+    db = ChromaVectorDB(embedder=emb, config=db_cfg)
+    print(f" Persist dir: {test_persist}")
+
+    # 3. Upsert
+    print("\n[3/4] Upserting documents...")
+    count = db.upsert_documents(nodes)
+    print(f" Upserted: {count}")
+    print(f" ChromaDB count: {db.count()}")
+    print(f" Parent nodes stored: {len(db.parent_nodes)}")
+
+    # Check the JSON file
+    json_path = Path(test_persist) / "parent_nodes.json"
+    if json_path.exists():
+        print(f" ✅ parent_nodes.json exists ({json_path.stat().st_size} bytes)")
+    else:
+        print(f" ❌ parent_nodes.json NOT found!")
+
+    # 4. Test retrieval
+    print("\n[4/4] Testing retrieval...")
+    from core.embeddings.retrival import Retriever, RetrievalMode
+
+    retriever = Retriever(vector_db=db, use_reranker=False)
+
+    test_query = "TOEIC Nghe 350 điểm tương đương bậc mấy?"
+    print(f" Query: {test_query}")
+
+    results = retriever.vector_search(test_query, k=3)
+
+    for i, r in enumerate(results, 1):
+        meta = r.get("metadata", {})
+        content = r.get("content", "")[:200]
+
+        print(f"\n [{i}]")
+        print(f" is_table_summary: {meta.get('is_table_summary', False)}")
+        print(f" swapped_from_summary: {meta.get('swapped_from_summary', False)}")
+        print(f" source: {meta.get('source_file', 'N/A')}")
+        print(f" content: {content}...")
+
+    print("\n" + "=" * 60)
+    print("TEST COMPLETE")
+    print("=" * 60)
+
+    # Cleanup prompt
+    print(f"\nTo clean up test data: rm -rf {test_persist}")
+
+
+if __name__ == "__main__":
+    main()
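Run it as `python scripts/test_single_file.py` from the repo root, with SILICONFLOW_API_KEY set in the environment or a .env file; without the key, summarization is skipped and tables are chunked and embedded raw, so the script still runs but reports zero parent and summary nodes.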
test/chunk_results.md
ADDED

The diff for this file is too large to render; see the raw diff.
test/test_small_to_big.py
ADDED

@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+"""Test Small-to-Big table summarization."""
+from __future__ import annotations
+import sys
+from pathlib import Path
+from datetime import datetime
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(REPO_ROOT))
+
+from dotenv import load_dotenv
+load_dotenv()
+
+from core.embeddings.chunk import chunk_markdown_file
+
+def test_chunk_with_summary():
+    """Test chunking a file with tables to verify summary generation."""
+
+    # Use the K70 English requirements file which has many tables
+    test_file = REPO_ROOT / "data/data_process/quyet_dinh/tieng_anh/06_ Quy định ngoại ngữ từ K70_chính quy_final.md"
+
+    if not test_file.exists():
+        print(f"❌ Test file not found: {test_file}")
+        return
+
+    print(f"📄 Processing: {test_file.name}")
+    print("=" * 60)
+
+    nodes = chunk_markdown_file(test_file)
+
+    print(f"\n📊 Total nodes: {len(nodes)}")
+
+    # Count different types
+    parent_nodes = [n for n in nodes if n.metadata.get("is_parent")]
+    summary_nodes = [n for n in nodes if n.metadata.get("is_table_summary")]
+    table_nodes = [n for n in nodes if n.metadata.get("is_table") and not n.metadata.get("is_parent")]
+    text_nodes = [n for n in nodes if not n.metadata.get("is_table") and not n.metadata.get("is_table_summary")]
+
+    print(f" - Parent nodes (raw tables, NOT embedded): {len(parent_nodes)}")
+    print(f" - Summary nodes (embedded for search): {len(summary_nodes)}")
+    print(f" - Small table nodes (embedded directly): {len(table_nodes)}")
+    print(f" - Text nodes: {len(text_nodes)}")
+
+    # Debug: Show sample metadata
+    if nodes:
+        print("\n🔍 Sample metadata from first node:")
+        sample = nodes[0].metadata
+        for k, v in sample.items():
+            print(f" - {k}: {v}")
+
+    # Export to markdown
+    output_file = REPO_ROOT / "test" / "chunk_results.md"
+    export_to_markdown(nodes, output_file, test_file.name)
+    print(f"\n📝 Exported detailed results to: {output_file}")
+
+
+def export_to_markdown(nodes, output_path: Path, source_name: str):
+    """Export all chunks to a markdown file for review."""
+
+    lines = [
+        f"# Chunk Results: {source_name}",
+        f"",
+        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+        f"",
+        f"## Summary",
+        f"",
+        f"| Type | Count |",
+        f"|------|-------|",
+    ]
+
+    # Count types
+    parent_nodes = [n for n in nodes if n.metadata.get("is_parent")]
+    summary_nodes = [n for n in nodes if n.metadata.get("is_table_summary")]
+    table_nodes = [n for n in nodes if n.metadata.get("is_table") and not n.metadata.get("is_parent")]
+    text_nodes = [n for n in nodes if not n.metadata.get("is_table") and not n.metadata.get("is_table_summary")]
+
+    lines.extend([
+        f"| Parent nodes (raw tables, NOT embedded) | {len(parent_nodes)} |",
+        f"| Summary nodes (embedded for search) | {len(summary_nodes)} |",
+        f"| Small table nodes (embedded directly) | {len(table_nodes)} |",
+        f"| Text nodes | {len(text_nodes)} |",
+        f"| **Total** | **{len(nodes)}** |",
+        f"",
+        f"---",
+        f"",
+    ])
+
+    # Group: Summary nodes with their parents
+    lines.append("## 📝 Summary Nodes (with Parent Tables)")
+    lines.append("")
+
+    parent_map = {n.metadata.get("node_id"): n for n in parent_nodes}
+
+    for i, node in enumerate(summary_nodes, 1):
+        parent_id = node.metadata.get("parent_id", "")
+        parent = parent_map.get(parent_id)
+        meta = node.metadata
+
+        lines.append(f"### Summary #{i}")
+        lines.append(f"")
+        lines.append(f"**Metadata:**")
+        lines.append(f"- is_table_summary: True")
+        lines.append(f"- parent_id: `{parent_id}`")
+        if meta.get("source_file"):
+            lines.append(f"- source_file: {meta.get('source_file')}")
+        if meta.get("applicable_cohorts"):
+            lines.append(f"- applicable_cohorts: {meta.get('applicable_cohorts')}")
+        lines.append(f"")
+        lines.append(f"**Summary Text (embedded for search):**")
+        lines.append(f"")
+        lines.append(f"> {node.get_content()}")
+        lines.append(f"")
+
+        if parent:
+            lines.append(f"**Parent Table (raw, NOT embedded):**")
+            lines.append(f"")
+            lines.append(f"```markdown")
+            lines.append(parent.get_content())
+            lines.append(f"```")
+            lines.append(f"")
+
+        lines.append(f"---")
+        lines.append(f"")
+
+    # Small tables (embedded directly)
+    if table_nodes:
+        lines.append("## 📋 Small Tables (embedded directly)")
+        lines.append("")
+
+        for i, node in enumerate(table_nodes, 1):
+            meta = node.metadata
+            lines.append(f"### Small Table #{i}")
+            lines.append(f"")
+            lines.append(f"**Metadata:**")
+            lines.append(f"- is_table: True")
+            if meta.get("table_part"):
+                lines.append(f"- table_part: {meta.get('table_part')}")
+            if meta.get("source_file"):
+                lines.append(f"- source_file: {meta.get('source_file')}")
+            if meta.get("applicable_cohorts"):
+                lines.append(f"- applicable_cohorts: {meta.get('applicable_cohorts')}")
+            if meta.get("chunk_index") is not None:
+                lines.append(f"- chunk_index: {meta.get('chunk_index')}")
+            lines.append(f"")
+            lines.append(f"```markdown")
+            lines.append(node.get_content())
+            lines.append(f"```")
+            lines.append(f"")
+            lines.append(f"---")
+            lines.append(f"")
+
+    # Text nodes
+    lines.append("## 📄 Text Nodes")
+    lines.append("")
+
+    for i, node in enumerate(text_nodes, 1):
+        content = node.get_content()
+        meta = node.metadata
+
+        lines.append(f"### Text #{i}")
+        lines.append(f"")
+        lines.append(f"**Metadata:**")
+        if meta.get("document_type"):
+            lines.append(f"- document_type: {meta.get('document_type')}")
+        if meta.get("title"):
+            lines.append(f"- title: {meta.get('title')}")
+        if meta.get("applicable_cohorts"):
+            lines.append(f"- applicable_cohorts: {meta.get('applicable_cohorts')}")
+        if meta.get("source_file"):
+            lines.append(f"- source_file: {meta.get('source_file')}")
+        if meta.get("header_path"):
+            lines.append(f"- header_path: {meta.get('header_path')}")
+        if meta.get("Header 1"):
+            lines.append(f"- Header 1: {meta.get('Header 1')}")
+        if meta.get("chunk_index") is not None:
+            lines.append(f"- chunk_index: {meta.get('chunk_index')}")
+        lines.append(f"")
+
+        lines.append(f"**Content:**")
+        lines.append(f"")
+        lines.append(content)
+        lines.append(f"")
+        lines.append(f"---")
+        lines.append(f"")
+
+    # Write to file
+    output_path.write_text("\n".join(lines), encoding="utf-8")
+
+
+if __name__ == "__main__":
+    test_chunk_with_summary()