Cohere-Rerank / helpers.py
Sanchayt's picture
Upload 3 files
396e3c2 verified
import random
import time
import os
import faker
from openai import OpenAI
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process
# environment before any of them are read below.
load_dotenv()
# Module-wide OpenAI client used by embed(). The key is read from the
# OPENAI_API_KEY environment variable; os.getenv yields None when it is unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
import pinecone
import tqdm
from datasets import Dataset
# Shared Faker instance; generate_resume() draws every synthetic field from it.
fake = faker.Faker()
# NOTE(review): presumably the default Pinecone index name for callers. None of
# the visible functions read it; initialize_pinecone() takes index_name as a
# parameter instead.
index_name = "coherererank"
dimension = 1536 # Dimensionality of the ada-002 model
# NOTE(review): embed() does not read this constant (it names
# "text-embedding-3-small" itself) -- confirm which model is intended.
embed_model = "text-embedding-ada-002"
def initialize_pinecone(api_key, env, index_name, dimension):
    """Connect to Pinecone and return a handle to ``index_name``.

    The index is created with the given ``dimension`` and the "dotproduct"
    metric when it is not already listed, and the call then blocks until
    Pinecone reports the new index as ready.

    Args:
        api_key: Pinecone API key.
        env: Pinecone environment name.
        index_name: Name of the index to open (and create if missing).
        dimension: Vector dimensionality used when creating the index.

    Returns:
        The ``pinecone.Index`` handle for ``index_name``.
    """
    print("Initializing Pinecone...")
    pinecone.init(api_key=api_key, environment=env)

    already_exists = index_name in pinecone.list_indexes()
    if not already_exists:
        print(f"Creating Pinecone index: {index_name}")
        pinecone.create_index(index_name, dimension=dimension, metric="dotproduct")
        # Index creation is asynchronous: poll once per second until ready.
        while True:
            if pinecone.describe_index(index_name).status["ready"]:
                break
            print("Waiting for index to be ready...")
            time.sleep(1)

    handle = pinecone.Index(index_name)
    print("Pinecone initialized successfully!")
    return handle
def generate_resume():
    """Fabricate one synthetic resume with Faker.

    Returns:
        A dict with keys "id" (a UUID string), "text" (the multi-line resume
        body) and "metadata" (with "experience" and "education" entries).
    """
    print("Generating a synthetic resume...")

    # The generator calls are made in the same order as the fields appear in
    # the resulting record, so seeded runs stay reproducible.
    resume_id = fake.uuid4()
    name = fake.name()
    job = fake.job()
    company = fake.company()
    tagline = fake.catch_phrase()
    skills = ", ".join(fake.words(ext_word_list=None, unique=True))
    activity = fake.bs()
    employer = fake.company()
    tenure = random.randint(1, 10)
    text = f"{name}\n{job}\n{company}\n{tagline}\nSkills: {skills}\nExperience: {activity} at {employer} for {tenure} years."

    # The metadata "experience" value is drawn independently of the tenure
    # mentioned in the text.
    experience = f"{random.randint(1, 10)} years"
    education = random.choice(["Bachelor's", "Master's", "PhD"])

    record = {
        "id": resume_id,
        "text": text,
        "metadata": {"experience": experience, "education": education},
    }
    print("Synthetic resume generated successfully!")
    return record
def create_dataset(num_resumes=1000, chunk_size=800):
    """Generate synthetic resumes and pack their text chunks into a Dataset.

    Each resume text is cut into consecutive slices of at most ``chunk_size``
    characters; every slice becomes one row whose id is
    ``"<resume id>-<chunk number>"``.

    Args:
        num_resumes: How many synthetic resumes to generate.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        A ``datasets.Dataset`` with the columns "id", "text" and "metadata".
    """
    print("Creating dataset...")

    # Generate every resume up front, before any chunking happens.
    resumes = []
    for _ in range(num_resumes):
        resumes.append(generate_resume())

    ids, texts, metadatas = [], [], []
    for resume in resumes:
        body = resume["text"]
        for number, start in enumerate(range(0, len(body), chunk_size)):
            piece = body[start : start + chunk_size]
            chunk_id = f'{resume["id"]}-{number}'
            ids.append(chunk_id)
            texts.append(piece)
            # The chunk text is duplicated into the metadata so that it can be
            # read back from a vector-store match.
            metadatas.append(
                {
                    "title": "Resume Chunk",
                    "url": f"https://example.com/resume/{chunk_id}",
                    "primary_category": "Resume",
                    "published": "20231028",
                    "updated": "20231028",
                    "text": piece,
                }
            )

    formatted_dataset = Dataset.from_dict(
        {"id": ids, "text": texts, "metadata": metadatas}
    )
    print("Dataset created successfully!")
    return formatted_dataset
def embed(docs: list[str], model: str = "text-embedding-3-small") -> list[list[float]]:
    """Embed a batch of texts with the OpenAI embeddings endpoint.

    Args:
        docs: Texts to embed.
        model: Name of the OpenAI embedding model to request. The default is
            the model this function has always used.

    Returns:
        One embedding vector per input text, in the order of ``res.data``.
        An empty ``docs`` yields an empty list without calling the API.
    """
    # The embeddings endpoint rejects an empty input list, so short-circuit
    # instead of spending a request on a guaranteed error.
    if not docs:
        return []

    print("Embedding documents...")
    res = client.embeddings.create(input=docs, model=model)
    print("Documents embedded successfully!")
    return [item.embedding for item in res.data]
def insert_to_pinecone(index, dataset, batch_size=100):
    """Embed ``dataset`` rows and upsert them into ``index`` in batches.

    Nothing is inserted when the index already reports a non-zero vector
    count. Otherwise rows whose ids are already stored are skipped and the
    remaining rows are embedded and upserted ``batch_size`` at a time.

    Args:
        index: Pinecone index handle.
        dataset: Dataset exposing "id", "text" and "metadata" columns.
        batch_size: Number of rows per fetch / embed / upsert request.

    Returns:
        None.
    """
    print("Inserting data to Pinecone...")

    # Only populate an index that reports itself as empty.
    index_stats = index.describe_index_stats()
    if index_stats.total_vector_count > 0:
        print("Pinecone index is not empty. No new data will be inserted.")
        return

    # Look up which ids are already stored. The fetch response keys the found
    # records by id under "vectors"; the ids are requested in slices so that a
    # large dataset does not produce one oversized request.
    all_ids = list(dataset["id"])
    existing_ids = set()
    for start in range(0, len(all_ids), batch_size):
        response = index.fetch(ids=all_ids[start : start + batch_size])
        existing_ids.update(response.get("vectors", {}))

    # Filter out the data that is already in the index.
    new_data = dataset.filter(lambda example: example["id"] not in existing_ids)
    total = len(new_data)
    if total == 0:
        print("All data is already present in the Pinecone index.")
        return

    # Insert the new data in batches.
    batch_count = (total - 1) // batch_size + 1
    for start in range(0, total, batch_size):
        batch = new_data[start : start + batch_size]
        embeds = embed(batch["text"])
        # Each upserted record is an (id, values, metadata) tuple.
        to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
        index.upsert(vectors=to_upsert)
        print(f"Batch {start // batch_size + 1}/{batch_count} inserted.")
    print("New data inserted to Pinecone successfully!")
def get_docs(index, query: str, top_k: int):
    """Run a vector search for ``query`` and map each hit's text to its rank.

    Args:
        index: Pinecone index handle.
        query: Free-text query to embed and search with.
        top_k: Number of matches to request.

    Returns:
        A dict of ``{chunk text: position in the match list}``. If the same
        text occurs more than once it appears once, carrying the position of
        its last occurrence.
    """
    print("Fetching documents from Pinecone...")
    query_vector = embed([query])[0]
    response = index.query(query_vector, top_k=top_k, include_metadata=True)

    ranked = {}
    for position, match in enumerate(response.matches):
        ranked[match["metadata"]["text"]] = position

    print("Documents fetched successfully!")
    return ranked
def compare(index, co, query, top_k=25, top_n=3):
    """Contrast the vector-search order with the Cohere rerank order.

    Args:
        index: Pinecone index handle.
        co: Cohere client exposing ``rerank``.
        query: Free-text query.
        top_k: Number of vector-search matches to retrieve.
        top_n: Number of reranked results to request.

    Returns:
        A list with one dict per reranked result at position ``i``:
        "Original Rank" is ``i``, "Original Text" is the ``i``-th distinct text
        returned by vector search, "Reranked Text" is the text the reranker
        placed at ``i`` and "Reranked Rank" is the vector-search rank recorded
        for that text. The list is empty when vector search finds nothing.
    """
    # Get vec search results
    docs = get_docs(index, query, top_k=top_k)
    if not docs:
        # Same guard as evaluate_resumes(): nothing to rerank.
        return []

    # Distinct texts in vector-search order. Indexing this list (rather than
    # an inverted rank -> text dict) cannot fail when duplicate texts leave
    # gaps in the recorded ranks.
    ordered_texts = list(docs)

    # Re-rank
    rerank_response = co.rerank(
        query=query,
        documents=ordered_texts,
        top_n=top_n,
        model="rerank-english-v2.0",
    )

    # Compare order change
    comparison_data = []
    for i, result in enumerate(rerank_response.results):
        reranked_text = result.document["text"]
        comparison_data.append(
            {
                "Original Rank": i,
                "Original Text": ordered_texts[i],
                "Reranked Rank": docs[reranked_text],
                "Reranked Text": reranked_text,
            }
        )
    return comparison_data
def evaluate_resumes(index, co, query, top_k=10, rerank_top_n=5):
    """Retrieve, rerank and LLM-evaluate resume chunks for ``query``.

    Args:
        index: Pinecone index handle.
        co: Cohere client exposing ``rerank`` and ``generate``.
        query: Task description used for retrieval, reranking and the prompt.
        top_k: Number of vector-search matches to retrieve.
        rerank_top_n: Number of reranked chunks to put into the prompt.

    Returns:
        A ``(text, error)`` pair: ``(generated text, None)`` on success,
        otherwise ``(None, message)`` when no documents were found or the
        model returned no generations.
    """
    print("Evaluating resumes...")
    docs = get_docs(index, query, top_k=top_k)
    if not docs:
        print("No documents found.")
        return None, "No documents found."

    rerank_response = co.rerank(
        query=query,
        documents=list(docs),
        top_n=rerank_top_n,
        model="rerank-english-v2.0",
    )
    # Concatenate the reranked chunks, best first, separated by blank lines.
    combined_resumes = "\n\n".join(
        result.document["text"] for result in rerank_response.results
    )
    prompt = f"""
You are an HR professional with extensive experience in evaluating resumes for various job roles.This is the task you have been assigned.
Task:
{query}
Based on the resumes provided below, your task is to select the top candidates and provide a detailed justification for each selection, highlighting their skills, experience, and overall fit for a general job role. Focus solely on the evaluation and selection process, and ensure your response is clear, concise, and directly related to the task at hand.
---
Resumes:
{combined_resumes}
---
Please provide your selections and detailed justifications below:
"""
    response = co.generate(prompt=prompt)
    if not response.generations:
        print("Failed to generate a response.")
        return None, "Failed to generate a response."

    print("Resumes evaluated successfully!")
    return response.generations[0].text, None