# cv_parser / src / scripts / embed_metiers.py
# quentinL52
# update
# 1556508
import json
import os
import sys
# Add src's parent directory to path so we can run this directly if needed
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()
def get_job_text(job):
    """
    Format a job entry as text using the "Miroir" strategy.

    Reads four list-valued fields from ``job`` (``missions_principales``,
    ``competences_techniques``, ``outils_technologies`` and
    ``competences_soft``), joins the items of each with ``", "`` and lays
    them out on three labelled lines: ``MISSIONS``, ``TECH_ET_OUTILS``
    (technical skills followed by tools) and ``SOFT_SKILLS``.

    Args:
        job: Mapping describing one job. A field that is missing or set to
            ``None`` (JSON ``null``) is treated as an empty list.

    Returns:
        The three-line text, without a trailing newline.
    """
    def joined(key):
        # ``or ()`` covers both a missing key and an explicit null value;
        # the latter would otherwise make ``str.join`` raise TypeError.
        return ", ".join(job.get(key) or ())

    missions = joined("missions_principales")
    tech = joined("competences_techniques")
    outils = joined("outils_technologies")
    soft = joined("competences_soft")

    # NOTE(review): the layout is kept exactly as it was, including the ", "
    # between tech and outils even when one of them is empty. This text is
    # what gets embedded, and presumably has to match the format produced on
    # the other side of the "Miroir" comparison — confirm before changing it.
    return (
        f"MISSIONS: {missions}\n"
        f"TECH_ET_OUTILS: {tech}, {outils}\n"
        f"SOFT_SKILLS: {soft}"
    )
def _embed_jobs(job_list, embeddings_model):
    """Attach an ``embedding`` to each job with an ``id``; return how many were embedded."""
    embedded = 0
    for job in job_list:
        if "metiers" in job:
            # Grouping node: descend into its nested job list.
            embedded += _embed_jobs(job["metiers"], embeddings_model)
        elif "id" in job:
            print(f"Embedding {job.get('id')}...")
            job["embedding"] = embeddings_model.embed_query(get_job_text(job))
            embedded += 1
        # Entries with neither key are left untouched.
    return embedded


def embed_metiers_file():
    """
    Compute an embedding for every job in ``data/metiers.json`` and save it.

    The JSON file is located two directory levels above this script, under
    ``data/metiers.json``. Its top-level ``metiers`` list is walked
    recursively: an entry containing a ``metiers`` key is treated as a group
    and descended into, and an entry containing an ``id`` key gets an
    ``embedding`` field holding the result of ``embed_query`` on the text
    built by ``get_job_text``. The embeddings come from ``OpenAIEmbeddings``
    with the ``text-embedding-3-small`` model, one call per job.

    The updated document is written back to the same path. It is serialised
    to a sibling ``.tmp`` file first and then swapped in with ``os.replace``,
    so a failure while writing does not leave ``metiers.json`` truncated.

    Raises:
        OSError: If the file cannot be read or written.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # abspath: ``__file__`` can be a bare relative name when the script is
    # launched from its own directory, which would collapse dirname() to "".
    base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    metiers_path = os.path.join(base_path, "data", "metiers.json")

    print(f"Loading {metiers_path}...")
    with open(metiers_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
    count = _embed_jobs(data.get("metiers", []), embeddings_model)

    print(f"Writing {count} embeddings to {metiers_path}...")
    tmp_path = f"{metiers_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Swap only after the dump fully succeeded.
        os.replace(tmp_path, metiers_path)
    finally:
        # After a successful replace the temp file no longer exists; this
        # only removes a partial file left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print("Done!")
# Run the embedding pass only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    embed_metiers_file()