import os
import random
import time

import faker
import pinecone
from datasets import Dataset
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

fake = faker.Faker()

index_name = "coherererank"
embed_model = "text-embedding-3-small"
dimension = 1536  # Both ada-002 and text-embedding-3-small produce 1536-dimensional vectors


def initialize_pinecone(api_key, env, index_name, dimension):
    """Connect to Pinecone, create the index if it does not already exist, and return a handle to it."""
    print("Initializing Pinecone...")
    pinecone.init(api_key=api_key, environment=env)
    if index_name not in pinecone.list_indexes():
        print(f"Creating Pinecone index: {index_name}")
        pinecone.create_index(index_name, dimension=dimension, metric="dotproduct")
        while not pinecone.describe_index(index_name).status["ready"]:
            print("Waiting for index to be ready...")
            time.sleep(1)
    index = pinecone.Index(index_name)
    print("Pinecone initialized successfully!")
    return index


def generate_resume():
    """Generate one synthetic resume with a unique ID, free-form text, and simple metadata."""
    print("Generating a synthetic resume...")
    resume = {
        "id": fake.uuid4(),
        "text": (
            f"{fake.name()}\n{fake.job()}\n{fake.company()}\n{fake.catch_phrase()}\n"
            f"Skills: {', '.join(fake.words(ext_word_list=None, unique=True))}\n"
            f"Experience: {fake.bs()} at {fake.company()} for {random.randint(1, 10)} years."
        ),
        "metadata": {
            "experience": f"{random.randint(1, 10)} years",
            "education": random.choice(["Bachelor's", "Master's", "PhD"]),
        },
    }
    print("Synthetic resume generated successfully!")
    return resume


def create_dataset(num_resumes=1000, chunk_size=800):
    """Generate synthetic resumes, split them into chunks, and return them as a Hugging Face Dataset."""
    print("Creating dataset...")
    synthetic_resumes = [generate_resume() for _ in range(num_resumes)]

    data = []
    for resume in synthetic_resumes:
        resume_text = resume["text"]
        text_chunks = [
            resume_text[i : i + chunk_size]
            for i in range(0, len(resume_text), chunk_size)
        ]
        for idx, chunk in enumerate(text_chunks):
            chunk_id = f'{resume["id"]}-{idx}'
            data_entry = {
                "id": chunk_id,
                "text": chunk,
                "metadata": {
                    "title": "Resume Chunk",
                    "url": f"https://example.com/resume/{chunk_id}",
                    "primary_category": "Resume",
                    "published": "20231028",
                    "updated": "20231028",
                    "text": chunk,
                },
            }
            data.append(data_entry)

    dataset_dict = {
        "id": [item["id"] for item in data],
        "text": [item["text"] for item in data],
        "metadata": [item["metadata"] for item in data],
    }
    formatted_dataset = Dataset.from_dict(dataset_dict)
    print("Dataset created successfully!")
    return formatted_dataset


def embed(docs: list[str]) -> list[list[float]]:
    """Embed a list of texts with the OpenAI embeddings API."""
    print("Embedding documents...")
    res = client.embeddings.create(input=docs, model=embed_model)
    print("Documents embedded successfully!")
    # The openai>=1.0 response exposes each embedding directly on the data items.
    return [x.embedding for x in res.data]


def insert_to_pinecone(index, dataset, batch_size=100):
    """Embed the dataset and upsert it into Pinecone in batches, skipping anything already indexed."""
    print("Inserting data to Pinecone...")

    # Skip entirely if the Pinecone index already contains vectors.
    index_stats = index.describe_index_stats()
    if index_stats.total_vector_count > 0:
        print("Pinecone index is not empty. No new data will be inserted.")
        return

    # Fetch any vector IDs that already exist in the index and filter them out.
    response = index.fetch(ids=dataset["id"])
    existing_ids = set(response.vectors.keys())
    new_data = dataset.filter(lambda example: example["id"] not in existing_ids)
    if len(new_data) == 0:
        print("All data is already present in the Pinecone index.")
        return

    # Insert the new data in batches.
    for i in range(0, len(new_data), batch_size):
        batch = new_data[i : i + batch_size]
        embeds = embed(batch["text"])
        to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
        index.upsert(vectors=to_upsert)
        print(
            f"Batch {i // batch_size + 1}/{(len(new_data) - 1) // batch_size + 1} inserted."
        )
    print("New data inserted to Pinecone successfully!")


def get_docs(index, query: str, top_k: int):
    """Embed the query and return a mapping of matched document text -> original retrieval rank."""
    print("Fetching documents from Pinecone...")
    xq = embed([query])[0]
    res = index.query(vector=xq, top_k=top_k, include_metadata=True)
    docs = {x["metadata"]["text"]: i for i, x in enumerate(res.matches)}
    print("Documents fetched successfully!")
    return docs


def compare(index, co, query, top_k=25, top_n=3):
    """Run a vector search, rerank the results with Cohere, and report how the ordering changed."""
    # Get vector search results (text -> original rank) and an inverse lookup.
    docs = get_docs(index, query, top_k=top_k)
    i2doc = {rank: doc for doc, rank in docs.items()}

    # Re-rank the retrieved texts with Cohere.
    rerank_docs = co.rerank(
        query=query,
        documents=list(docs.keys()),
        top_n=top_n,
        model="rerank-english-v2.0",
    )

    # Row i pairs the document originally at rank i with the document the reranker
    # placed at rank i; rerank_i is that reranked document's original rank.
    comparison_data = []
    for i, doc in enumerate(rerank_docs.results):
        rerank_i = docs[doc.document["text"]]
        comparison_data.append(
            {
                "Original Rank": i,
                "Original Text": i2doc[i],
                "Reranked Rank": rerank_i,
                "Reranked Text": doc.document["text"],
            }
        )
    return comparison_data


def evaluate_resumes(index, co, query, top_k=10, rerank_top_n=5):
    """Retrieve and rerank resumes for a query, then ask Cohere to evaluate the top candidates."""
    print("Evaluating resumes...")
    docs = get_docs(index, query, top_k=top_k)
    if not docs:
        print("No documents found.")
        return None, "No documents found."

    doc_texts = list(docs.keys())
    rerank_response = co.rerank(
        query=query,
        documents=doc_texts,
        top_n=rerank_top_n,
        model="rerank-english-v2.0",
    )
    rerank_docs = [result.document for result in rerank_response.results]
    combined_resumes = "\n\n".join([doc["text"] for doc in rerank_docs])

    prompt = f"""
You are an HR professional with extensive experience in evaluating resumes for various job roles. This is the task you have been assigned.

Task: {query}

Based on the resumes provided below, your task is to select the top candidates and provide a detailed justification for each selection, highlighting their skills, experience, and overall fit for a general job role. Focus solely on the evaluation and selection process, and ensure your response is clear, concise, and directly related to the task at hand.

---
Resumes:
{combined_resumes}
---

Please provide your selections and detailed justifications below:
"""

    response = co.generate(prompt=prompt)
    if response.generations:
        print("Resumes evaluated successfully!")
        return response.generations[0].text, None
    else:
        print("Failed to generate a response.")
        return None, "Failed to generate a response."
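

# A minimal end-to-end usage sketch. It assumes PINECONE_API_KEY, PINECONE_ENVIRONMENT,
# and COHERE_API_KEY are set in the environment alongside OPENAI_API_KEY; those variable
# names and the sample query below are illustrative, not required by the functions above.
if __name__ == "__main__":
    import cohere

    pinecone_index = initialize_pinecone(
        api_key=os.getenv("PINECONE_API_KEY"),
        env=os.getenv("PINECONE_ENVIRONMENT"),
        index_name=index_name,
        dimension=dimension,
    )

    # Build a small synthetic dataset and load it into the index.
    dataset = create_dataset(num_resumes=100)
    insert_to_pinecone(pinecone_index, dataset)

    co = cohere.Client(os.getenv("COHERE_API_KEY"))
    query = "Find candidates with several years of software engineering experience."

    # Show how Cohere's reranker reorders the raw vector-search results.
    for row in compare(pinecone_index, co, query):
        print(row)

    # Rerank and then ask Cohere to evaluate the top resumes for the query.
    evaluation, error = evaluate_resumes(pinecone_index, co, query)
    print(evaluation if error is None else error)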