Spaces:
Sleeping
Sleeping
| """ | |
| Clean full report text and chunk documents using multiple strategies. | |
| A. Fixed-size character splitting | |
| B. Recursive character splitting | |
| C. Semantic chunking | |
| D. Parent-child chunking | |
| """ | |
| import json | |
| import os | |
| import re | |
| import pandas as pd | |
| from langchain_experimental.text_splitter import SemanticChunker | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_text_splitters import ( | |
| CharacterTextSplitter, | |
| MarkdownHeaderTextSplitter, | |
| RecursiveCharacterTextSplitter, | |
| ) | |
# Project root: the directory three levels above this file.
BASE_DIR = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)
)

# All inputs and outputs of this script live under <root>/data/processed.
_PROCESSED_DIR = os.path.join(BASE_DIR, "data", "processed")

SAMPLE_PATH = os.path.join(_PROCESSED_DIR, "sampled_reports.csv")
OUT_FIXED_PATH = os.path.join(_PROCESSED_DIR, "chunks_fixed.json")
OUT_REC_PATH = os.path.join(_PROCESSED_DIR, "chunks_recursive.json")
OUT_SEM_PATH = os.path.join(_PROCESSED_DIR, "chunks_semantic.json")
OUT_PARENT_PATH = os.path.join(_PROCESSED_DIR, "chunks_parent.json")
def clean_report(text):
    """Normalize raw report text before chunking.

    Non-string input yields an empty string. For strings, "Page N of M"
    markers are removed (case-insensitively), every run of whitespace is
    collapsed to a single space, and the result is trimmed.
    """
    if isinstance(text, str):
        without_page_marks = re.sub(r"Page \d+\s*of\s*\d+", "", text, flags=re.IGNORECASE)
        return re.sub(r"\s+", " ", without_page_marks).strip()
    return ""
def build_metadata(row, chunk_idx, strategy):
    """Build the metadata dict attached to one chunk of a report row.

    ``NtsbNo`` and ``EventDate`` are read with ``row[...]`` and must be
    present; the remaining columns are read with ``row.get`` and default to an
    empty string. All values are stringified. The chunk id has the form
    ``<NtsbNo>_<strategy>_<chunk_idx zero-padded to 3 digits>``.
    """
    ntsb_no = row["NtsbNo"]
    record = {
        "chunk_id": f"{ntsb_no}_{strategy}_{chunk_idx:03d}",
        "ntsb_no": str(ntsb_no),
        "event_date": str(row["EventDate"]),
    }
    optional_columns = (
        ("state", "State"),
        ("make", "Make"),
        ("model", "Model"),
        ("phase_of_flight", "BroadPhaseofFlight"),
        ("weather", "WeatherCondition"),
    )
    for key, column in optional_columns:
        record[key] = str(row.get(column, ""))
    return record
def chunk_fixed(df):
    """Strategy A: baseline fixed-size character splitting.

    Each row's ``rep_text`` is cleaned with ``clean_report`` and cut by a
    ``CharacterTextSplitter`` (chunk_size=1500, chunk_overlap=200, empty
    separator). Every chunk is prefixed with a short accident header and
    combined with the row metadata from ``build_metadata``.

    Args:
        df: Object exposing ``iterrows()`` whose rows provide ``rep_text``,
            ``NtsbNo`` and ``EventDate``; ``Make`` and ``Model`` are optional.

    Returns:
        A list of chunk dicts (metadata fields plus ``text``).
    """
    splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=200, separator="")
    chunks = []
    for _, row in df.iterrows():
        text = clean_report(row["rep_text"])
        # Stringify before slicing: a non-string EventDate (e.g. NaN or a
        # timestamp object) is not subscriptable and would raise TypeError.
        event_day = str(row.get("EventDate", ""))[:10]
        header = (
            f"Accident {row['NtsbNo']} ({row.get('Make', '')} {row.get('Model', '')}, "
            f"{event_day}): "
        )
        for i, chunk_text in enumerate(splitter.split_text(text)):
            chunk_data = build_metadata(row, i, "fixed")
            chunk_data["text"] = header + chunk_text
            chunks.append(chunk_data)
    return chunks
def chunk_recursive(df):
    """Strategy B: baseline recursive character splitting.

    Each row's ``rep_text`` is cleaned with ``clean_report`` and split by a
    ``RecursiveCharacterTextSplitter`` (chunk_size=1500, chunk_overlap=200).
    Every chunk is prefixed with a short accident header and combined with the
    row metadata from ``build_metadata``.

    Args:
        df: Object exposing ``iterrows()`` whose rows provide ``rep_text``,
            ``NtsbNo`` and ``EventDate``; ``Make`` and ``Model`` are optional.

    Returns:
        A list of chunk dicts (metadata fields plus ``text``).
    """
    # NOTE: clean_report collapses all whitespace to single spaces, so the
    # newline separators below cannot match on cleaned text; in practice the
    # splitter falls through to ". " and " ".
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=200,
        separators=["\n\n", "\n", ". ", " "],
    )
    chunks = []
    for _, row in df.iterrows():
        text = clean_report(row["rep_text"])
        # Stringify before slicing: a non-string EventDate (e.g. NaN or a
        # timestamp object) is not subscriptable and would raise TypeError.
        event_day = str(row.get("EventDate", ""))[:10]
        header = (
            f"Accident {row['NtsbNo']} ({row.get('Make', '')} {row.get('Model', '')}, "
            f"{event_day}): "
        )
        for i, chunk_text in enumerate(splitter.split_text(text)):
            chunk_data = build_metadata(row, i, "rec")
            chunk_data["text"] = header + chunk_text
            chunks.append(chunk_data)
    return chunks
def chunk_semantic(df):
    """Strategy C: semantic chunking using embedding breakpoints.

    Cleaned texts shorter than 100 characters are kept as a single chunk.
    Longer texts go through ``SemanticChunker``; if that raises, a warning is
    printed and the text is split with a recursive character splitter
    (chunk_size=1500, chunk_overlap=200) instead.

    Args:
        df: Object exposing ``iterrows()`` and ``len()`` whose rows provide
            ``rep_text``, ``NtsbNo`` and ``EventDate``; ``Make`` and ``Model``
            are optional.

    Returns:
        A list of chunk dicts (metadata fields plus ``text``).
    """
    # Reuse the module's cached embedding model rather than loading a second
    # copy of the same "all-MiniLM-L6-v2" model.
    embeddings = _get_hf_embeddings()
    semantic_chunker = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
    # Built once; only used when the semantic splitter fails on a report.
    fallback_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)

    total = len(df)
    chunks = []
    # Count rows by position: the index label yielded by iterrows() need not be
    # an integer, nor contiguous, so it cannot drive the progress report.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        text = clean_report(row["rep_text"])
        # Stringify before slicing: a non-string EventDate (e.g. NaN or a
        # timestamp object) is not subscriptable and would raise TypeError.
        event_day = str(row.get("EventDate", ""))[:10]
        header = (
            f"Accident {row['NtsbNo']} ({row.get('Make', '')} {row.get('Model', '')}, "
            f"{event_day}): "
        )

        if len(text) < 100:
            doc_chunks = [text]
        else:
            try:
                doc_chunks = semantic_chunker.split_text(text)
            except Exception as e:
                print(
                    f"Warning: Semantic split failed for {row['NtsbNo']}, "
                    f"falling back to recursive. Error: {e}"
                )
                doc_chunks = fallback_splitter.split_text(text)

        for i, chunk_text in enumerate(doc_chunks):
            chunk_data = build_metadata(row, i, "sem")
            chunk_data["text"] = header + chunk_text
            chunks.append(chunk_data)

        if position % 10 == 0:
            print(f" Processed {position}/{total} reports semantically...")
    return chunks
def chunk_markdown_section_aware(md_file_path: str):
    """Section-aware chunking for markdown reports with metadata attached.

    The file is split on level-2 (``## ``) headings. Text before the first
    heading becomes an "Introduction/Header" section. Each non-empty section is
    split with a recursive character splitter (chunk_size=1500,
    chunk_overlap=200) and every chunk carries report metadata extracted by
    regex from the header text plus the first section.

    NOTE: this module defines ``chunk_markdown_section_aware`` again further
    down, as a wrapper around ``chunk_markdown_md_recursive``; that later
    definition replaces this one when the module is imported.

    Args:
        md_file_path: Path to a UTF-8 markdown report.

    Returns:
        A list of chunk dicts with ``chunk_id``, ``report_id``,
        ``section_title``, ``text`` and the extracted metadata fields.
    """
    with open(md_file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # With one capture group, re.split yields [preamble, heading, body, ...].
    sections = re.split(r"\n(##\s+.*?)\n", content)
    preamble = sections[0] if sections and sections[0].strip() else ""

    parsed_sections = []
    if preamble:
        parsed_sections.append({"title": "Introduction/Header", "content": preamble.strip()})
    for i in range(1, len(sections), 2):
        title = sections[i].replace("##", "").strip()
        body = sections[i + 1].strip() if i + 1 < len(sections) else ""
        if body:
            parsed_sections.append({"title": title, "content": body})

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=200,
        separators=["\n\n", "\n", ". ", " "],
    )
    report_id = os.path.basename(md_file_path).replace(".md", "")

    # Metadata comes from the markdown itself (preamble + first section body).
    # The two parts are joined with a newline so a pattern cannot match across
    # the seam between them.
    first_body = sections[2] if len(sections) > 2 else ""
    first_section_text = "\n".join(part for part in (preamble, first_body) if part)
    # Shared helper: same patterns and same result keys as the other markdown
    # chunkers in this file.
    metadata = _extract_md_report_metadata(first_section_text, report_id)

    chunks = []
    for sec_idx, section in enumerate(parsed_sections):
        # Skip sections made only of punctuation/underscores (e.g. rules).
        if not section["content"] or re.match(r"^[\W_]+$", section["content"]):
            continue
        for chunk_idx, chunk_text in enumerate(splitter.split_text(section["content"])):
            base_chunk = {
                "chunk_id": f"{report_id}_sec{sec_idx:02d}_{chunk_idx:03d}",
                "report_id": report_id,
                "section_title": section["title"],
                "text": f"Section: {section['title']}\n{chunk_text}",
            }
            base_chunk.update(metadata)
            chunks.append(base_chunk)
    return chunks
def chunk_markdown_recursive(md_file_path: str):
    """Split a whole markdown file recursively, ignoring section boundaries.

    The file content is passed unchanged to a recursive character splitter
    (chunk_size=1500, chunk_overlap=200). Each chunk dict holds ``chunk_id``,
    ``report_id`` (file name with ".md" removed) and ``text``.
    """
    with open(md_file_path, "r", encoding="utf-8") as handle:
        markdown_text = handle.read()

    report_id = os.path.basename(md_file_path).replace(".md", "")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=200,
        separators=["\n\n", "\n", ". ", " "],
    )
    return [
        {
            "chunk_id": f"{report_id}_rec_{position:03d}",
            "report_id": report_id,
            "text": piece,
        }
        for position, piece in enumerate(text_splitter.split_text(markdown_text))
    ]
# Report identifiers such as "NTSB/AAR-00/01" or a "DCA..." accident number.
_MD_NTSB_NO_RE = re.compile(r"(NTSB/\w+-\d+/\d+|DCA\d+\w+\d+)")

# "<manufacturer> <model>", e.g. "Boeing 747-300". The leading \b keeps a
# manufacturer name from matching inside a longer word.
_MD_AIRCRAFT_RE = re.compile(
    r"\b(Boeing|Airbus|Cessna|Piper|Beechcraft|Embraer|Bombardier|McDonnell Douglas|Douglas)"
    r"\s+(\w+[\-\w]*)",
    re.IGNORECASE,
)

# "August 6, 1997" style or ISO "2022-09-04" style dates.
_MD_DATE_RE = re.compile(
    r"(January|February|March|April|May|June|July|August|September|October|November|December)"
    r"\s+\d+,?\s+\d{4}|\d{4}-\d{2}-\d{2}"
)

_MD_STATE_NAMES = (
    "Guam", "Hawaii", "Alaska", "California", "Texas", "Florida", "New York",
    "Colorado", "Washington", "Oregon", "Arizona", "Nevada", "Utah", "Wyoming",
    "Montana", "Idaho", "North Dakota", "South Dakota", "Nebraska", "Kansas",
    "Oklahoma", "Minnesota", "Wisconsin", "Michigan", "Illinois", "Indiana",
    "Ohio", "Pennsylvania", "Vermont", "New Hampshire", "Maine",
    "Massachusetts", "Rhode Island", "Connecticut", "New Jersey", "Delaware",
    "Maryland", "Virginia", "West Virginia", "North Carolina",
    "South Carolina", "Georgia", "Alabama", "Mississippi", "Louisiana",
    "Arkansas", "Missouri", "Iowa", "Tennessee", "Kentucky",
    "District of Columbia", "Puerto Rico", "Virgin Islands", "American Samoa",
    # Previously missing from the list, so New Mexico reports got "unknown".
    "New Mexico",
)

# Word boundaries on both sides: with IGNORECASE, an unanchored start would
# match e.g. "maine" inside "Germaine".
# NOTE(review): "Washington" also matches "Washington, D.C." - confirm whether
# that is acceptable for the reports being processed.
_MD_STATE_RE = re.compile(r"\b(?:" + "|".join(_MD_STATE_NAMES) + r")\b", re.IGNORECASE)


def _extract_md_report_metadata(content: str, report_id: str) -> dict:
    """Extract coarse report metadata directly from markdown content.

    Each field is taken from the first regex match found in ``content``.

    Args:
        content: Markdown text to scan.
        report_id: Fallback used for ``ntsb_no`` when no identifier is found.

    Returns:
        Dict with ``ntsb_no``, ``event_date``, ``make``, ``model``,
        ``phase_of_flight``, ``weather`` and ``state``. Fields without a match
        are "unknown" (``ntsb_no`` falls back to ``report_id``);
        ``phase_of_flight`` and ``weather`` are always "unknown".
    """
    ntsb_match = _MD_NTSB_NO_RE.search(content)
    aircraft_match = _MD_AIRCRAFT_RE.search(content)
    date_match = _MD_DATE_RE.search(content)
    state_match = _MD_STATE_RE.search(content)

    return {
        "ntsb_no": ntsb_match.group(1) if ntsb_match else report_id,
        "event_date": date_match.group(0) if date_match else "unknown",
        "make": aircraft_match.group(1) if aircraft_match else "unknown",
        "model": aircraft_match.group(2) if aircraft_match else "unknown",
        "phase_of_flight": "unknown",
        "weather": "unknown",
        "state": state_match.group(0) if state_match else "unknown",
    }
def _token_window_chunks(text: str, chunk_tokens: int = 192, overlap_tokens: int = 32) -> list[str]:
    """Cut text into overlapping windows of whitespace-separated words.

    Windows hold up to ``chunk_tokens`` words and start every
    ``chunk_tokens - overlap_tokens`` words (at least 1). Iteration stops with
    the first window that reaches the end of the text. Text without any words
    gives an empty list.
    """
    words = text.split()
    total = len(words)
    if total == 0:
        return []

    stride = max(1, chunk_tokens - overlap_tokens)
    windows = []
    start = 0
    while start < total:
        current = words[start : start + chunk_tokens]
        if not current:
            break
        windows.append(" ".join(current))
        if start + chunk_tokens >= total:
            break
        start += stride
    return windows
def _sentence_split(text: str) -> list[str]:
    """Split text into sentences at whitespace following '.', '!' or '?'.

    Fragments are stripped and empty ones are dropped.
    """
    sentences = []
    for fragment in re.split(r"(?<=[.!?])\s+", text.strip()):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
def _rebalance_to_token_bounds(
    pieces: list[str],
    min_tokens: int = 512,
    max_tokens: int = 1024,
    target_tokens: int = 768,
) -> list[str]:
    """Merge/split text pieces into chunks bounded by approximate token counts.

    Tokens are approximated by whitespace-separated words. Consecutive pieces
    are accumulated in a buffer that is emitted once it reaches
    ``target_tokens``. Text that cannot fit in ``max_tokens`` is cut into
    windows of ``max_tokens`` words that start every ``target_tokens`` words,
    so neighbouring windows overlap by ``max_tokens - target_tokens`` words.
    Output follows input order.

    Args:
        pieces: Text fragments in document order; empty ones are ignored.
        min_tokens: A pending buffer smaller than this is merged with the next
            piece instead of being emitted alone when that piece does not fit.
        max_tokens: Upper bound on words per emitted chunk.
        target_tokens: Buffer size that triggers emission, and the window
            stride when splitting.

    Returns:
        Non-empty chunks. ``min_tokens`` is best-effort: the final chunk may be
        shorter when it cannot be merged into its predecessor.
    """
    out: list[str] = []
    buffer: list[str] = []
    buffer_tokens = 0
    # A non-positive stride would never advance the window loop.
    stride = max(1, target_tokens)

    def flush_buffer() -> None:
        nonlocal buffer, buffer_tokens
        if buffer:
            out.append(" ".join(buffer).strip())
        buffer = []
        buffer_tokens = 0

    def emit_windows(words: list[str]) -> None:
        start = 0
        while start < len(words):
            out.append(" ".join(words[start : start + max_tokens]))
            # Stop once a window reaches the end; a further window would only
            # repeat words already emitted.
            if start + max_tokens >= len(words):
                break
            start += stride

    for piece in pieces:
        if not piece:
            continue
        words = piece.split()
        if not words:
            continue

        if buffer_tokens + len(words) <= max_tokens:
            buffer.append(piece)
            buffer_tokens += len(words)
            if buffer_tokens >= target_tokens:
                flush_buffer()
            continue

        # The piece does not fit alongside the buffer.
        if buffer and buffer_tokens < min_tokens:
            # Buffer is too small to stand alone: merge it with the piece and
            # window the result. This also covers oversized pieces, so earlier
            # buffered text is never emitted after later text.
            merged = " ".join(buffer + [piece]).split()
            buffer = []
            buffer_tokens = 0
            emit_windows(merged)
            continue

        flush_buffer()
        if len(words) > max_tokens:
            emit_windows(words)
        else:
            buffer.append(piece)
            buffer_tokens = len(words)

    flush_buffer()

    # Fold a short trailing chunk into its predecessor when the result fits.
    if len(out) >= 2:
        last_tokens = len(out[-1].split())
        prev_tokens = len(out[-2].split())
        if last_tokens < min_tokens and (last_tokens + prev_tokens) <= max_tokens:
            out[-2] = f"{out[-2]} {out[-1]}".strip()
            out.pop()
    return [c for c in out if c]
def _baseline_report_meta(md_file_path: str, content: str) -> dict:
    """Build the metadata shared by every chunk of the baseline strategies.

    ``report_id`` is the file name with ".md" removed. The other fields come
    from ``_extract_md_report_metadata``: ``ntsb_no`` falls back to the report
    id, while ``event_date``, ``make`` and ``model`` fall back to "unknown".
    """
    report_id = os.path.basename(md_file_path).replace(".md", "")
    extracted = _extract_md_report_metadata(content, report_id)

    summary = {
        "report_id": report_id,
        "ntsb_no": extracted.get("ntsb_no", report_id),
    }
    for field in ("event_date", "make", "model"):
        summary[field] = extracted.get(field, "unknown")
    return summary
def chunk_markdown_baseline_fixed(
    md_file_path: str,
    chunk_tokens: int = 768,
    overlap_tokens: int = 128,
) -> list[dict]:
    """Baseline fixed-window chunking over a markdown file.

    The whole file is cut into overlapping word windows by
    ``_token_window_chunks``; any window longer than 1024 words is truncated
    to its first 1024. Each chunk carries the report metadata and the
    section title "Document".
    """
    with open(md_file_path, "r", encoding="utf-8") as handle:
        markdown_text = handle.read()

    report_meta = _baseline_report_meta(md_file_path, markdown_text)
    report_id = report_meta["report_id"]

    result = []
    word_windows = _token_window_chunks(
        markdown_text, chunk_tokens=chunk_tokens, overlap_tokens=overlap_tokens
    )
    for position, window_text in enumerate(word_windows):
        tokens = window_text.split()
        if len(tokens) > 1024:
            window_text = " ".join(tokens[:1024])
        entry = {
            "chunk_id": f"{report_id}_base_fixed_{position:03d}",
            "section_title": "Document",
            "text": window_text,
        }
        entry.update(report_meta)
        result.append(entry)
    return result
def chunk_markdown_baseline_recursive(md_file_path: str) -> list[dict]:
    """Baseline recursive chunking that ignores markdown structure.

    The file is split with a recursive character splitter (chunk_size=5200,
    chunk_overlap=700) and the pieces are then rebalanced by
    ``_rebalance_to_token_bounds`` with min 512, max 1024 and target 768
    words. Each chunk carries the report metadata and the section title
    "Document".
    """
    with open(md_file_path, "r", encoding="utf-8") as handle:
        markdown_text = handle.read()

    report_meta = _baseline_report_meta(md_file_path, markdown_text)
    report_id = report_meta["report_id"]

    char_splitter = RecursiveCharacterTextSplitter(
        chunk_size=5200,
        chunk_overlap=700,
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    rebalanced = _rebalance_to_token_bounds(
        char_splitter.split_text(markdown_text),
        min_tokens=512,
        max_tokens=1024,
        target_tokens=768,
    )
    return [
        {
            "chunk_id": f"{report_id}_base_rec_{position:03d}",
            "section_title": "Document",
            "text": piece,
            **report_meta,
        }
        for position, piece in enumerate(rebalanced)
    ]
# Module-level slot for the embedding model; filled on first use.
_cached_hf_embeddings = None


def _get_hf_embeddings():
    """Return the shared HuggingFaceEmbeddings instance, creating it on first call."""
    global _cached_hf_embeddings
    if _cached_hf_embeddings is not None:
        return _cached_hf_embeddings
    _cached_hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return _cached_hf_embeddings
def chunk_markdown_baseline_semantic(md_file_path: str) -> list[dict]:
    """Baseline semantic chunking, rebalanced to approximate token bounds.

    The file is split with ``SemanticChunker`` (percentile breakpoints) and the
    pieces are rebalanced by ``_rebalance_to_token_bounds`` with min 512,
    max 1024 and target 768 words. If the semantic splitter raises or returns
    nothing, the result of ``chunk_markdown_baseline_recursive`` is returned
    instead; those chunks carry ``_base_rec_`` ids rather than ``_base_sem_``.

    Args:
        md_file_path: Path to a UTF-8 markdown report.

    Returns:
        A list of chunk dicts with ``chunk_id``, ``section_title``, ``text``
        and the report metadata.
    """
    with open(md_file_path, "r", encoding="utf-8") as f:
        content = f.read()
    report_meta = _baseline_report_meta(md_file_path, content)
    report_id = report_meta["report_id"]

    embeddings = _get_hf_embeddings()
    semantic_chunker = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
    try:
        raw_chunks = semantic_chunker.split_text(content)
    except Exception as exc:
        # Keep the baseline stable when the semantic splitter fails, but report
        # the failure instead of hiding it.
        print(
            f"Warning: Semantic split failed for {report_id}, "
            f"falling back to recursive. Error: {exc}"
        )
        return chunk_markdown_baseline_recursive(md_file_path)

    if not raw_chunks:
        return chunk_markdown_baseline_recursive(md_file_path)

    bounded = _rebalance_to_token_bounds(raw_chunks, min_tokens=512, max_tokens=1024, target_tokens=768)
    chunks = []
    for i, text in enumerate(bounded):
        chunks.append(
            {
                "chunk_id": f"{report_id}_base_sem_{i:03d}",
                "section_title": "Document",
                "text": text,
                **report_meta,
            }
        )
    return chunks
def chunk_markdown_md_recursive(md_file_path: str):
    """md_recursive strategy: split on markdown headers, then recursively.

    The file is divided at ``#``, ``##`` and ``###`` headers. Each section is
    split with a recursive character splitter (chunk_size=2048,
    chunk_overlap=200) and the pieces are passed through
    ``_rebalance_to_token_bounds`` with min 64, max 512 and target 384 words.
    A section's title is its deepest available header (h3, then h2, then h1),
    or "Unknown Section" when it has none.
    """
    with open(md_file_path, "r", encoding="utf-8") as handle:
        markdown_text = handle.read()

    report_id = os.path.basename(md_file_path).replace(".md", "")
    report_meta = _extract_md_report_metadata(markdown_text, report_id)

    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "h1"), ("##", "h2"), ("###", "h3")]
    )
    char_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2048,
        chunk_overlap=200,
        separators=["\n\n", "\n", ". ", " "],
    )

    result = []
    for sec_idx, doc in enumerate(header_splitter.split_text(markdown_text)):
        body = (doc.page_content or "").strip()
        if not body:
            continue

        # Deepest non-empty header level wins.
        section_title = next(
            (doc.metadata.get(level) for level in ("h3", "h2", "h1") if doc.metadata.get(level)),
            "Unknown Section",
        )
        bounded = _rebalance_to_token_bounds(
            char_splitter.split_text(body),
            min_tokens=64,
            max_tokens=512,
            target_tokens=384,
        )
        for chunk_idx, chunk_text in enumerate(bounded):
            result.append(
                {
                    "chunk_id": f"{report_id}_mdrec_{sec_idx:03d}_{chunk_idx:03d}",
                    "report_id": report_id,
                    "section_title": section_title,
                    "text": chunk_text,
                    **report_meta,
                }
            )
    return result
def chunk_markdown_parent_child(md_file_path: str):
    """parent_child strategy: header sections as parents, word windows as children.

    The file is divided at ``#``, ``##`` and ``###`` headers. Each non-empty
    section is a parent; its children are the windows produced by
    ``_token_window_chunks`` (192 words, 32-word overlap). Every child dict
    stores its own text together with the full parent text and parent id.
    """
    with open(md_file_path, "r", encoding="utf-8") as handle:
        markdown_text = handle.read()

    report_id = os.path.basename(md_file_path).replace(".md", "")
    report_meta = _extract_md_report_metadata(markdown_text, report_id)

    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "h1"), ("##", "h2"), ("###", "h3")]
    )

    result = []
    for sec_idx, doc in enumerate(header_splitter.split_text(markdown_text)):
        parent_text = (doc.page_content or "").strip()
        if not parent_text:
            continue

        # Deepest non-empty header level wins.
        section_title = next(
            (doc.metadata.get(level) for level in ("h3", "h2", "h1") if doc.metadata.get(level)),
            "Unknown Section",
        )
        parent_id = f"{report_id}_parent_{sec_idx:03d}"
        children = _token_window_chunks(parent_text, chunk_tokens=192, overlap_tokens=32)
        for child_idx, child_text in enumerate(children):
            result.append(
                {
                    "chunk_id": f"{report_id}_pchild_{sec_idx:03d}_{child_idx:03d}",
                    "parent_id": parent_id,
                    "report_id": report_id,
                    "section_title": section_title,
                    "text": child_text,
                    "parent_text": parent_text,
                    **report_meta,
                }
            )
    return result
| # Backward-compatible wrappers used by existing ingestion code. | |
def chunk_markdown_section_aware(md_file_path: str):
    """Backward-compatible alias: delegate to ``chunk_markdown_md_recursive``.

    This definition rebinds the ``chunk_markdown_section_aware`` name defined
    earlier in this module.
    """
    chunks = chunk_markdown_md_recursive(md_file_path)
    return chunks
def chunk_markdown_recursive(md_file_path: str):
    """Backward-compatible alias: delegate to ``chunk_markdown_md_recursive``.

    This definition rebinds the ``chunk_markdown_recursive`` name defined
    earlier in this module.
    """
    chunks = chunk_markdown_md_recursive(md_file_path)
    return chunks
def chunk_parent(df):
    """Strategy D: parent-child chunking for richer generation context.

    Each cleaned report is split into parent chunks (chunk_size=1800,
    chunk_overlap=250), and each parent is split again into child chunks
    (chunk_size=500, chunk_overlap=80). One dict is emitted per child; it holds
    the child text, the full parent text and the parent id, all prefixed with a
    short accident header.

    Args:
        df: Object exposing ``iterrows()`` whose rows provide ``rep_text``,
            ``NtsbNo`` and ``EventDate``; ``Make`` and ``Model`` are optional.

    Returns:
        A list of child chunk dicts (metadata fields plus ``text``,
        ``parent_id`` and ``parent_text``).
    """
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1800,
        chunk_overlap=250,
        separators=["\n\n", "\n", ". ", " "],
    )
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=80,
        separators=["\n\n", "\n", ". ", " "],
    )
    chunks = []
    for _, row in df.iterrows():
        text = clean_report(row["rep_text"])
        # Stringify before slicing: a non-string EventDate (e.g. NaN or a
        # timestamp object) is not subscriptable and would raise TypeError.
        event_day = str(row.get("EventDate", ""))[:10]
        header = (
            f"Accident {row['NtsbNo']} ({row.get('Make', '')} {row.get('Model', '')}, "
            f"{event_day}): "
        )
        for p_idx, parent_text in enumerate(parent_splitter.split_text(text)):
            parent_id = f"{row['NtsbNo']}_parent_{p_idx:03d}"
            for c_idx, child_text in enumerate(child_splitter.split_text(parent_text)):
                # The parent index is folded into the strategy tag so child ids
                # stay unique across parents of the same report.
                chunk_data = build_metadata(row, c_idx, f"parent{p_idx:03d}")
                chunk_data["text"] = header + child_text
                chunk_data["parent_id"] = parent_id
                chunk_data["parent_text"] = header + parent_text
                chunks.append(chunk_data)
    return chunks
def main():
    """Run the four CSV chunking strategies and write each result as JSON.

    Reads the semicolon-separated sample CSV at ``SAMPLE_PATH``, then runs
    strategies A-D in order, dumping each chunk list to its output path and
    printing progress along the way.
    """
    print(f"Loading data from {SAMPLE_PATH}")
    df = pd.read_csv(SAMPLE_PATH, sep=";", encoding="utf-8")
    print(f"Loaded {len(df)} reports.")

    # (banner title, chunking function, output path, label used in the summary)
    runs = (
        ("A: Fixed-Size Chunking", chunk_fixed, OUT_FIXED_PATH, "fixed"),
        ("B: Recursive Character Chunking", chunk_recursive, OUT_REC_PATH, "recursive"),
        ("C: Semantic Chunking", chunk_semantic, OUT_SEM_PATH, "semantic"),
        ("D: Parent-Child Chunking", chunk_parent, OUT_PARENT_PATH, "parent-child"),
    )
    for title, chunker, out_path, label in runs:
        print(f"\nRunning Strategy {title}...")
        produced = chunker(df)
        with open(out_path, "w", encoding="utf-8") as out_file:
            json.dump(produced, out_file, indent=2)
        print(f" -> Generated {len(produced)} {label} chunks")

    print("\nDone! Ready for embeddings.")


if __name__ == "__main__":
    main()