File size: 1,846 Bytes
1556508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json
import os
import sys

# Make the directory three levels above this file importable (per the original
# note: the parent of `src`), so the script can also be run directly.
# NOTE(review): __file__ is used as-is; if it is a bare relative name the
# computed entry is "" (the current working directory) — confirm intended.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

# Load variables from a .env file into the process environment at import time.
# Presumably this supplies the OpenAI API key used by OpenAIEmbeddings — verify.
load_dotenv()

def get_job_text(job):
    """
    Format a job entry as the text block to embed (the "Miroir" strategy).

    Four optional fields are read from ``job``: ``missions_principales``,
    ``competences_techniques``, ``outils_technologies`` and
    ``competences_soft``. Each is rendered as a comma-separated list.

    Args:
        job: Mapping describing one job. Any of the four fields may be
            missing, ``None``, a single string, or an iterable of items.

    Returns:
        A three-line string of the form::

            MISSIONS: <missions>
            TECH_ET_OUTILS: <tech>, <outils>
            SOFT_SKILLS: <soft>

        The layout is unchanged for well-formed input, so previously
        generated texts stay comparable.
    """
    def joined(key):
        # `or []` also covers a key that is present but null in the JSON,
        # which would otherwise make ", ".join() raise TypeError.
        values = job.get(key) or []
        # A bare string would be joined character by character; treat it as
        # a single item instead.
        if isinstance(values, str):
            values = [values]
        # str() keeps join from failing on non-string items (e.g. numbers).
        return ", ".join(str(value) for value in values)

    missions = joined("missions_principales")
    tech = joined("competences_techniques")
    outils = joined("outils_technologies")
    soft = joined("competences_soft")

    return (
        f"MISSIONS: {missions}\n"
        f"TECH_ET_OUTILS: {tech}, {outils}\n"
        f"SOFT_SKILLS: {soft}"
    )

def embed_metiers_file():
    """
    Compute an embedding for every job in ``data/metiers.json`` and save it.

    The JSON file is located in the ``data`` directory next to this file's
    parent directory. Its top-level ``"metiers"`` list is walked recursively:
    an entry that itself has a ``"metiers"`` key is treated as a group and
    descended into; an entry with an ``"id"`` key is treated as a job, its
    text is built with ``get_job_text`` and embedded with the
    ``text-embedding-3-small`` model, and the vector is stored under the
    entry's ``"embedding"`` key. The updated document is then written back
    to the same path.

    Side effects:
        Prints progress to stdout, performs one embedding request per job,
        and rewrites ``metiers.json``.

    Raises:
        OSError: If the file cannot be read or written.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # abspath: when the script is launched with a relative path, __file__ may
    # be relative and the dirname chain would collapse to "" (i.e. the cwd).
    base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    metiers_path = os.path.join(base_path, "data", "metiers.json")

    print(f"Loading {metiers_path}...")
    with open(metiers_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

    def process_jobs(job_list):
        """Embed every job under ``job_list`` in place; return how many."""
        embedded = 0
        for job in job_list:
            if "metiers" in job:
                # Group node: recurse into its nested list.
                embedded += process_jobs(job["metiers"])
            elif "id" in job:
                print(f"Embedding {job.get('id')}...")
                job["embedding"] = embeddings_model.embed_query(get_job_text(job))
                embedded += 1
        return embedded

    count = process_jobs(data.get("metiers", []))

    print(f"Writing {count} embeddings to {metiers_path}...")
    # Write to a sibling temp file and swap it in, so an error or interrupt
    # during serialization cannot leave the source data file truncated.
    tmp_path = f"{metiers_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, metiers_path)
    finally:
        # After a successful replace the temp file no longer exists; this
        # only cleans up after a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print("Done!")

# Run the embedding job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    embed_metiers_file()