| |
|
|
| import os |
| import json |
| import faiss |
| from sentence_transformers import SentenceTransformer |
|
|
# Directory whose files main() reads and splits into chunks.
RAW_DOCS_DIR = "data/docs/raw"
# Destination of the FAISS index written by main().
INDEX_PATH = "data/docs/docs.index"
# Destination of the JSON list holding one metadata record per chunk.
META_PATH = "data/docs/docs_meta.json"


# Maximum number of characters per chunk; main() passes this to chunk_text().
CHUNK_SIZE = 400
|
|
def chunk_text(text, size):
    """Split *text* into consecutive, non-overlapping pieces of at most *size* characters.

    Each piece is stripped of surrounding whitespace, and pieces that are
    empty after stripping are dropped.

    Args:
        text: The string to split.
        size: Maximum number of characters per piece; must be positive.

    Returns:
        A list of the non-empty, stripped pieces in their original order.

    Raises:
        ValueError: If *size* is zero or negative.
    """
    if size <= 0:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step silently produces no chunks at all.
        raise ValueError(f"size must be a positive integer, got {size!r}")

    pieces = (text[start:start + size].strip() for start in range(0, len(text), size))
    return [piece for piece in pieces if piece]
|
|
def main():
    """Build the FAISS index and metadata file from the raw documents.

    Reads every regular file in ``RAW_DOCS_DIR`` as UTF-8 text, splits it with
    ``chunk_text`` into pieces of at most ``CHUNK_SIZE`` characters, embeds
    the pieces with the ``paraphrase-MiniLM-L3-v2`` sentence-transformer
    model, and writes an ``IndexFlatL2`` index to ``INDEX_PATH`` plus one
    metadata record per chunk to ``META_PATH``.

    Nothing is written when no chunks are found.
    """
    documents = []
    metadata = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the chunk
    # order (and therefore the index row ids) reproducible across runs.
    for fname in sorted(os.listdir(RAW_DOCS_DIR)):
        path = os.path.join(RAW_DOCS_DIR, fname)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as text files; skip them.
            continue

        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        # documents[i] and metadata[i] are appended together so that row i of
        # the index lines up with entry i of the metadata list.
        for chunk in chunk_text(text, CHUNK_SIZE):
            documents.append(chunk)
            metadata.append({
                "source_file": fname,
                "source": "https://www.jenkins.io/doc/",
            })

    if not documents:
        # Without any chunk there is no embedding dimension to size the index
        # with, so stop before loading the model or touching the output files.
        print(f"No document chunks found in {RAW_DOCS_DIR}; nothing to index.")
        return

    model = SentenceTransformer("paraphrase-MiniLM-L3-v2", cache_folder="./model_cache")
    embeddings = model.encode(documents)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Create the parent directory of each output file rather than a
    # hard-coded path, so the constants can be changed independently.
    for out_path in (INDEX_PATH, META_PATH):
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    faiss.write_index(index, INDEX_PATH)

    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print(f"Ingested {len(documents)} document chunks.")
|
|
# Run the ingestion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|