rdisipio
wip
e86db37
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This script loads ESCO skills from a CSV file and adds them to a ChromaDB vector
store using HuggingFace's sentence-transformers for embedding.
"""
import pandas as pd
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
# Input file and embedding model identifier, named once so they are easy to
# locate and change.
ESCO_SKILLS_CSV = "data/esco/skills_en.csv"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Read the ESCO skills table into a DataFrame.
skills_df = pd.read_csv(ESCO_SKILLS_CSV)

# Build the HuggingFace embedding wrapper used to vectorise the skill texts;
# show_progress=True is forwarded to HuggingFaceEmbeddings unchanged.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    show_progress=True,
)
# Convert skills to LangChain documents.
#
# Only the three columns that are actually used are selected. Missing labels
# and descriptions are replaced with an empty string: pandas represents empty
# CSV cells as NaN, which would otherwise be formatted into the embedded text
# as the literal word "nan" and stored in the metadata as a float.
_skill_fields = skills_df[["conceptUri", "preferredLabel", "description"]].fillna(
    {"preferredLabel": "", "description": ""}
)

# NOTE(review): the embedded text is prefixed with the concept URI rather than
# the human-readable preferredLabel. Kept as-is because downstream consumers
# may rely on this format -- confirm whether the label was intended.
documents = [
    Document(
        page_content=f"{concept_uri}: {description}",
        metadata={
            "conceptUri": concept_uri,
            "preferredLabel": preferred_label,
        },
    )
    # itertuples yields plain tuples in column order, avoiding the per-row
    # Series construction that iterrows() performs.
    for concept_uri, preferred_label, description in _skill_fields.itertuples(
        index=False, name=None
    )
]
# Create and persist vector store
persist_dir = "data/esco_chroma"

# Key each vector by its ESCO concept URI instead of letting the store
# generate a fresh id per document. With generated ids, every re-run of this
# script against the same persist_dir adds a second copy of every skill; with
# stable ids a re-run is expected to overwrite the existing entries
# (NOTE(review): relies on the LangChain Chroma wrapper upserting by id --
# confirm against the installed version).
# Collecting into a dict also collapses any repeated URI (the last occurrence
# wins), since duplicate ids within a single insert are presumably rejected
# by Chroma.
_docs_by_uri = {str(doc.metadata["conceptUri"]): doc for doc in documents}

vectorstore = Chroma.from_documents(
    documents=list(_docs_by_uri.values()),
    embedding=embedding_model,
    ids=list(_docs_by_uri),
    persist_directory=persist_dir,
)
vectorstore.persist()
print(f"ESCO skill vectors saved to {persist_dir}")