Spaces:
Sleeping
Sleeping
| """ | |
| Local Codebase Pipeline Runner - Processes local codebases for dataset creation. | |
| This is the main entry point for processing LOCAL CODEBASES (not Git repos). | |
| It orchestrates the entire chunking pipeline for local files, handling both | |
| code files and documentation with intelligent fallback strategies. | |
| ARCHITECTURE POSITION: | |
| - Local Pipeline Orchestrator: Coordinates local file processing | |
| - Fallback Handler: Intelligent fallback from code to documentation | |
| - Dataset Exporter: Creates final JSONL datasets with statistics | |
| KEY FEATURES: | |
| 1. Unified processing of Python files and documentation | |
| 2. Intelligent fallback (failed code chunking → documentation chunking) | |
| 3. Hierarchical chunking for Python files | |
| 4. Documentation-aware chunking for Markdown, plain-text, and reStructuredText files | |
| 5. Dataset statistics and metadata generation | |
| DATA FLOW: | |
| Local files → Type detection → Python chunking (or fallback) → | |
| Documentation chunking → JSONL export → Statistics | |
| USE CASES: | |
| - Processing locally saved code examples | |
| - Creating datasets from example repositories | |
| - Testing chunking strategies on local files | |
| USAGE: | |
| python run_python_pipeline.py --name crewai_examples --include crewai | |
| python run_python_pipeline.py --name test_dataset --exclude large_repos | |
| """ | |
| from pathlib import Path | |
| import json | |
| import argparse | |
| from src.task_3_data_engineering.chunking.hierarchical_chunker import HierarchicalChunker | |
| from src.task_3_data_engineering.export.jsonl_exporter import export_chunks_jsonl | |
| from src.task_3_data_engineering.analysis.dataset_stats import compute_dataset_stats | |
| from src.task_3_data_engineering.export.dataset_metadata import write_dataset_metadata | |
| from src.task_3_data_engineering.chunking.doc_chunker import chunk_document , wrap_doc_chunks | |
# Root directory that is scanned recursively for files to chunk.
INPUT_DIR: Path = Path("data/raw/codebases")

# Parent directory for outputs; each dataset is written to its own sub-directory.
BASE_OUTPUT_DIR: Path = Path("data/processed/chunks")

# File suffixes (lower-case, with leading dot) that are chunked as documentation.
DOC_EXTS: set[str] = {
    ".md",
    ".rst",
    ".txt",
}
def _is_selected(
    file_path: Path,
    include: list[str] | None,
    exclude: list[str] | None,
) -> bool:
    """Return True when the file's top-level entry under INPUT_DIR passes both filters."""
    # Filtering is done on the first path component below INPUT_DIR only.
    top_level = file_path.relative_to(INPUT_DIR).parts[0]
    if include and top_level not in include:
        return False
    if exclude and top_level in exclude:
        return False
    return True


def _chunk_python_file(file_path: Path, chunker) -> list:
    """Chunk a Python file with *chunker*; return [] if chunking fails or yields nothing."""
    try:
        return list(chunker.chunk_file(file_path) or [])
    except Exception as exc:
        # Deliberately broad: the chunker is project code and any failure should
        # trigger the documentation fallback rather than abort the whole run.
        print(f"  Code chunking failed ({exc!r}); falling back to documentation chunking")
        return []


def _chunk_as_document(file_path: Path) -> list:
    """Chunk a file as documentation text; return [] if it is unreadable or blank."""
    try:
        raw_text = file_path.read_text(encoding="utf-8", errors="ignore")
    except OSError as exc:
        print(f"  Skipping unreadable file ({exc})")
        return []

    if not raw_text.strip():
        return []

    doc_chunks = chunk_document(
        raw_text=raw_text,
        source_name=str(file_path),
        source_url=None,
    )
    return list(wrap_doc_chunks(doc_chunks))


def run(dataset_name: str, include: list[str] | None, exclude: list[str] | None):
    """Chunk every selected file under INPUT_DIR and export the resulting dataset.

    Python files are chunked with ``HierarchicalChunker``. When that fails or
    produces no chunks, the file is chunked as documentation instead. Files
    whose suffix is in ``DOC_EXTS`` are always chunked as documentation. All
    other files are ignored.

    Writes ``chunks.jsonl``, ``dataset_stats.json`` and ``dataset_metadata.json``
    into ``BASE_OUTPUT_DIR / dataset_name`` and prints a short summary.

    Args:
        dataset_name: Name of the output sub-directory and of the dataset.
        include: If given, only files whose first path component below
            INPUT_DIR is in this list are processed.
        exclude: If given, files whose first path component below INPUT_DIR
            is in this list are skipped.
    """
    output_dir = BASE_OUTPUT_DIR / dataset_name
    output_dir.mkdir(parents=True, exist_ok=True)

    chunker = HierarchicalChunker()
    all_chunks = []

    # Directory traversal order is filesystem-dependent; sorting keeps the
    # chunk order in the exported dataset reproducible between runs.
    files = sorted(p for p in INPUT_DIR.rglob("*") if p.is_file())

    for file_path in files:
        if not _is_selected(file_path, include, exclude):
            continue

        print(f"Processing: {file_path}")

        # Normalise once so ".PY" is treated like ".py", matching how the
        # documentation suffixes are compared.
        suffix = file_path.suffix.lower()
        is_python = suffix == ".py"

        # ---- Python files ----
        if is_python:
            code_chunks = _chunk_python_file(file_path, chunker)
            if code_chunks:
                all_chunks.extend(code_chunks)
                continue

        # ---- Documentation / text (also the fallback for Python files) ----
        if is_python or suffix in DOC_EXTS:
            all_chunks.extend(_chunk_as_document(file_path))

    # ---- Export ----
    export_chunks_jsonl(all_chunks, output_dir / "chunks.jsonl", print_stats=True)

    stats = compute_dataset_stats(all_chunks)

    # Every chunk is expected to expose ``hierarchy.is_primary``.
    primary_count = sum(1 for c in all_chunks if c.hierarchy.is_primary)
    stats["hierarchy"] = {
        "primary_chunks": primary_count,
        "secondary_chunks": len(all_chunks) - primary_count,
    }

    with (output_dir / "dataset_stats.json").open("w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2)

    write_dataset_metadata(
        chunks=all_chunks,
        output_path=output_dir / "dataset_metadata.json",
        dataset_name=dataset_name,
        dataset_version="v1",
    )

    print("\n✅ Dataset built successfully")
    print(f" - Files: {len({c.file_path for c in all_chunks})}")
    print(f" - Chunks: {len(all_chunks)}")
    print(f" - Output: {output_dir}")
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("--name", required=True) | |
| parser.add_argument("--include", nargs="+") | |
| parser.add_argument("--exclude", nargs="+") | |
| args = parser.parse_args() | |
| run(args.name, args.include, args.exclude) |