Spaces:
Running
Running
| import json | |
| import os | |
| import sys | |
| # Add src's parent directory to path so we can run this directly if needed | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) | |
| from langchain_openai import OpenAIEmbeddings | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
def get_job_text(job):
    """
    Format a job record as embedding text using the "Miroir" strategy.

    The result is three labelled lines built from the job's list fields:

        MISSIONS: <missions_principales>
        TECH_ET_OUTILS: <competences_techniques + outils_technologies>
        SOFT_SKILLS: <competences_soft>

    Items inside each section are separated by ", ".

    Args:
        job: Mapping describing one job. Each of the four keys above is
            optional; a missing key or a null value is treated as an
            empty list.

    Returns:
        The formatted multi-line string (no trailing newline).
    """
    def _items(key):
        # `or []` also covers keys that are present but null in the JSON,
        # which would otherwise make str.join() raise TypeError.
        return list(job.get(key) or [])

    missions = ", ".join(_items("missions_principales"))
    # Merge both lists before joining so an empty technical-skills or tools
    # list does not leave a dangling ", " in the embedded text.
    tech_and_tools = ", ".join(
        _items("competences_techniques") + _items("outils_technologies")
    )
    soft = ", ".join(_items("competences_soft"))

    return (
        f"MISSIONS: {missions}\n"
        f"TECH_ET_OUTILS: {tech_and_tools}\n"
        f"SOFT_SKILLS: {soft}"
    )
def embed_metiers_file():
    """
    Compute an embedding for every job in data/metiers.json and save it back.

    The JSON file is located at ``<parent of this file's directory>/data/
    metiers.json``. Its top-level "metiers" list is walked recursively: an
    entry that itself has a "metiers" key is treated as a container and
    descended into; otherwise an entry with an "id" key is treated as a job,
    its text is built with ``get_job_text`` and embedded with the OpenAI
    "text-embedding-3-small" model, and the result is stored under the
    entry's "embedding" key. Entries with neither key are left untouched.

    The updated document is written back to the same path.

    Raises:
        OSError: if the file cannot be read or written.
        json.JSONDecodeError: if the file is not valid JSON.
        Any exception raised by the embedding client propagates; in that
        case the original file is left unmodified.
    """
    # Resolve to an absolute path first: on interpreters where a script's
    # __file__ can be relative, dirname() of a bare file name is "" and the
    # data directory would be looked up relative to the current directory.
    script_path = os.path.abspath(__file__)
    base_path = os.path.dirname(os.path.dirname(script_path))
    metiers_path = os.path.join(base_path, "data", "metiers.json")

    print(f"Loading {metiers_path}...")
    with open(metiers_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
    count = 0

    def process_jobs(job_list):
        # Depth-first walk; mutates the job dicts in place.
        nonlocal count
        for job in job_list:
            if "metiers" in job:
                process_jobs(job["metiers"])
            elif "id" in job:
                print(f"Embedding {job.get('id')}...")
                job["embedding"] = embeddings_model.embed_query(get_job_text(job))
                count += 1

    process_jobs(data.get("metiers", []))

    print(f"Writing {count} embeddings to {metiers_path}...")
    # Write to a sibling temp file and swap it in, so a failure while
    # serialising cannot leave the source file truncated.
    tmp_path = metiers_path + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, metiers_path)
    finally:
        # After a successful replace the temp file no longer exists; this
        # only cleans up a partial file left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print("Done!")
# Run the embedding pass only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    embed_metiers_file()