| """ | |
| Day-3 → Day-4 bridge ingestion script. | |
| Purpose: | |
| Runs the full document ingestion pipeline including loading documents, chunking them, | |
| generating embeddings, and saving the results to a file. Used for processing documents | |
| that will later be uploaded to a vector database. | |
| Pipeline: | |
| 1. Load markdown docs | |
| 2. Chunk them | |
| 3. Generate embeddings (local stub for now) | |
| 4. Save to chunks.jsonl file | |
| Inputs: | |
| docs_dir (str): Path to directory containing markdown documents | |
| provider (str, optional): Embedding provider (default: "local") | |
| dim (int, optional): Embedding dimension (default: 128) | |
| save_to (str, optional): Path to save chunks.jsonl file | |
| Outputs: | |
| Saves embedded chunks to specified file | |
| Returns list of embedded chunks with metadata | |
| Usage: | |
| python scripts/ingest_documents.py /path/to/docs [provider] [dim] | |
| Example: | |
| python scripts/ingest_documents.py ./sample_docs sentence-transformers 384 | |
| """ | |
import json
import os
import sys
from pathlib import Path
from typing import Optional
# Make the project root importable when this file is executed as a script,
# so the `src.*` package imports below resolve without installation.
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent
_root = str(PROJECT_ROOT)
if _root not in sys.path:
    sys.path.insert(0, _root)

from src.ingestion.load_docs import load_markdown_docs
from src.ingestion.chunker import chunk_documents
from src.ingestion.embeddings import batch_embed_chunks
def run_ingestion(docs_dir: str, provider: str = "local", dim: int = 128,
                  save_to: Optional[str] = None):
    """
    Run the full ingestion pipeline: load docs -> chunk -> embed -> optionally save.

    Args:
        docs_dir: Path to a directory containing markdown documents.
        provider: Embedding provider name (default: "local").
        dim: Embedding dimension (default: 128).
        save_to: Optional path of a JSONL file to write the embedded chunks to.

    Returns:
        List of embedded chunk dicts, each carrying its source text and metadata.
    """
    docs = load_markdown_docs(docs_dir)
    chunks = chunk_documents(docs, max_tokens=300, overlap=50)
    embedded = batch_embed_chunks(chunks, provider=provider, dim=dim)

    # Merge the source text back into the embedded records
    # (batch_embed_chunks strips it, per the original comment — confirm in embeddings.py).
    text_by_key = {(c["filename"], c["chunk_id"]): c["text"] for c in chunks}
    for record in embedded:
        key = (record["filename"], record["chunk_id"])
        if key in text_by_key:
            record["text"] = text_by_key[key]

    # Persist one JSON object per line (JSONL) if a target path was given.
    if save_to:
        save_path = Path(save_to)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with save_path.open("w", encoding="utf-8") as fh:
            for record in embedded:
                obj = {
                    "id": f"{record['filename']}::{record['chunk_id']}",
                    "filename": record["filename"],
                    "chunk_id": record["chunk_id"],
                    "text": record.get("text", ""),
                    "chars": record.get("chars", 0),
                    "embedding": record["embedding"],
                }
                fh.write(json.dumps(obj, ensure_ascii=False) + "\n")
        print(f"Saved {len(embedded)} chunks to: {save_to}")

    return embedded
if __name__ == "__main__":
    # CLI entry point: python scripts/ingest_documents.py /path/to/docs [provider] [dim]
    # (sys is already imported at file top; the original re-imported it redundantly here.)
    if len(sys.argv) < 2:
        print("Usage: python3 scripts/ingest_documents.py /path/to/docs [provider] [dim]")
        raise SystemExit(1)
    docs_dir = sys.argv[1]
    provider = sys.argv[2] if len(sys.argv) > 2 else "local"
    dim = int(sys.argv[3]) if len(sys.argv) > 3 else 128
    # Save to data/chunks.jsonl under the project root by default.
    save_path = str(PROJECT_ROOT / "data" / "chunks.jsonl")
    out = run_ingestion(docs_dir, provider=provider, dim=dim, save_to=save_path)
    print(f"Total embedded chunks: {len(out)}")