# NOTE: lines above the imports were paste artifacts ("Spaces:", "Runtime error")
# from the tool this file was exported through — they are not part of the program.
# Standard library
import os
import random
import time

# Third-party
import faker
import pinecone
import tqdm
from datasets import Dataset
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from .env (expects OPENAI_API_KEY) before
# constructing the OpenAI client below.
load_dotenv()

# OpenAI client used by embed() for embedding calls.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Faker instance used to generate synthetic resume data.
fake = faker.Faker()

# Pinecone index configuration.
index_name = "coherererank"
dimension = 1536  # Dimensionality of the ada-002 model
embed_model = "text-embedding-ada-002"
def initialize_pinecone(api_key, env, index_name, dimension):
    """Connect to Pinecone and return a handle to ``index_name``.

    Creates the index (dot-product metric) when it does not already exist,
    blocking until Pinecone reports it ready.

    Args:
        api_key: Pinecone API key.
        env: Pinecone environment name.
        index_name: Name of the index to open or create.
        dimension: Vector dimensionality for a newly created index.

    Returns:
        A ``pinecone.Index`` handle for the ready index.
    """
    print("Initializing Pinecone...")
    pinecone.init(api_key=api_key, environment=env)

    # Only create (and wait for) the index when it is missing.
    if index_name not in pinecone.list_indexes():
        print(f"Creating Pinecone index: {index_name}")
        pinecone.create_index(index_name, dimension=dimension, metric="dotproduct")
        while not pinecone.describe_index(index_name).status["ready"]:
            print("Waiting for index to be ready...")
            time.sleep(1)

    handle = pinecone.Index(index_name)
    print("Pinecone initialized successfully!")
    return handle
def generate_resume():
    """Build a single synthetic resume record.

    Returns:
        A dict with a unique ``id``, a multi-line ``text`` body (name, job,
        company, skills, experience), and a ``metadata`` dict holding an
        experience string and an education level.
    """
    print("Generating a synthetic resume...")
    # NOTE(review): the years of experience inside the text and the value in
    # metadata are drawn independently, so the two can disagree — presumably
    # acceptable for synthetic data; confirm if exact agreement matters.
    resume = {
        "id": fake.uuid4(),
        "text": f"{fake.name()}\n{fake.job()}\n{fake.company()}\n{fake.catch_phrase()}\nSkills: {', '.join(fake.words(ext_word_list=None, unique=True))}\nExperience: {fake.bs()} at {fake.company()} for {random.randint(1, 10)} years.",
        "metadata": {
            "experience": f"{random.randint(1, 10)} years",
            "education": random.choice(["Bachelor's", "Master's", "PhD"]),
        },
    }
    print("Synthetic resume generated successfully!")
    return resume
def create_dataset(num_resumes=1000, chunk_size=800):
    """Generate synthetic resumes and package their text chunks as a Dataset.

    Each resume's text is split into fixed-width character chunks (no
    overlap); every chunk becomes one row with an id of the form
    ``<resume-id>-<chunk-index>``.

    Args:
        num_resumes: Number of synthetic resumes to generate.
        chunk_size: Maximum characters per text chunk.

    Returns:
        A ``datasets.Dataset`` with ``id``, ``text`` and ``metadata`` columns.
    """
    print("Creating dataset...")
    synthetic_resumes = [generate_resume() for _ in range(num_resumes)]

    records = []
    for resume in synthetic_resumes:
        full_text = resume["text"]
        # Walk the text in chunk_size strides; chunk_no is the running
        # chunk index used in the row id.
        for chunk_no, start in enumerate(range(0, len(full_text), chunk_size)):
            chunk = full_text[start : start + chunk_size]
            chunk_id = f'{resume["id"]}-{chunk_no}'
            records.append(
                {
                    "id": chunk_id,
                    "text": chunk,
                    "metadata": {
                        "title": "Resume Chunk",
                        "url": f"https://example.com/resume/{chunk_id}",
                        "primary_category": "Resume",
                        "published": "20231028",
                        "updated": "20231028",
                        # The chunk text is duplicated into metadata so it is
                        # retrievable from Pinecone query matches.
                        "text": chunk,
                    },
                }
            )

    # Column-orient the rows for Dataset.from_dict.
    columns = {
        "id": [row["id"] for row in records],
        "text": [row["text"] for row in records],
        "metadata": [row["metadata"] for row in records],
    }
    formatted_dataset = Dataset.from_dict(columns)
    print("Dataset created successfully!")
    return formatted_dataset
def embed(docs: list[str]) -> list[list[float]]:
    """Embed a batch of documents with the configured OpenAI embedding model.

    Args:
        docs: Plain-text documents to embed.

    Returns:
        One embedding vector (list of floats) per input document, in order.
    """
    print("Embedding documents...")
    # CONSISTENCY FIX: use the module-level ``embed_model`` (ada-002), which is
    # what the index ``dimension`` comment and constant declare. The previous
    # code hard-coded "text-embedding-3-small" here, silently diverging from
    # the model the rest of the file says it uses.
    res = client.embeddings.create(input=docs, model=embed_model)
    print("Documents embedded successfully!")
    return [x.embedding for x in res.data]
def insert_to_pinecone(index, dataset, batch_size=100):
    """Embed and upsert ``dataset`` rows into the Pinecone index in batches.

    Skips the whole insert when the index already contains any vectors, and
    additionally filters out rows whose ids already exist in the index.

    Args:
        index: Pinecone index handle.
        dataset: ``datasets.Dataset`` with ``id``, ``text``, ``metadata`` columns.
        batch_size: Rows embedded and upserted per request.
    """
    print("Inserting data to Pinecone...")
    # Nothing to do when the index already holds vectors.
    index_stats = index.describe_index_stats()
    if index_stats.total_vector_count > 0:
        print("Pinecone index is not empty. No new data will be inserted.")
        return

    # Fetch existing vector ids in the index.
    # BUG FIX: a Pinecone fetch response keys its hits under "vectors"
    # (id -> vector record); the old code read a nonexistent "id" key, so
    # existing_ids was always empty and the dedup filter below never worked.
    response = index.fetch(ids=dataset["id"])
    existing_ids = set(response.get("vectors", {}).keys())

    # Filter out the data that is already in the index.
    new_data = dataset.filter(lambda example: example["id"] not in existing_ids)
    if len(new_data) == 0:
        print("All data is already present in the Pinecone index.")
        return

    # Insert the new data in batches of ``batch_size``.
    total_batches = (len(new_data) - 1) // batch_size + 1
    for i in range(0, len(new_data), batch_size):
        batch = new_data[i : i + batch_size]
        embeds = embed(batch["text"])
        to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
        index.upsert(vectors=to_upsert)
        print(f"Batch {i // batch_size + 1}/{total_batches} inserted.")
    print("New data inserted to Pinecone successfully!")
def get_docs(index, query: str, top_k: int):
    """Vector-search the index for ``query`` and return ``{text: rank}``.

    Maps each match's stored metadata text to its zero-based position in the
    similarity ordering. Duplicate texts collapse to the last rank seen.

    Args:
        index: Pinecone index handle.
        query: Query string to embed and search with.
        top_k: Number of matches to request.

    Returns:
        Dict mapping matched document text to its rank.
    """
    print("Fetching documents from Pinecone...")
    query_vec = embed([query])[0]
    res = index.query(query_vec, top_k=top_k, include_metadata=True)
    docs = {}
    for rank, match in enumerate(res.matches):
        docs[match["metadata"]["text"]] = rank
    print("Documents fetched successfully!")
    return docs
def compare(index, co, query, top_k=25, top_n=3):
    """Compare the plain vector-search ranking against Cohere's reranking.

    Args:
        index: Pinecone index handle (queried via ``get_docs``).
        co: Cohere client exposing ``rerank``.
        query: Search query string.
        top_k: Number of vector-search hits to retrieve.
        top_n: Number of reranked results to keep.

    Returns:
        A list of dicts, one per reranked result, pairing original and
        reranked positions/texts.
    """
    # Get vec search results: {text: original rank}
    docs = get_docs(index, query, top_k=top_k)
    # Invert to {original rank: text} for positional lookup below.
    i2doc = {docs[doc]: doc for doc in docs.keys()}
    # Re-rank
    # NOTE(review): iterating the rerank response directly relies on older
    # list-like Cohere client behavior; evaluate_resumes in this file uses
    # ``.results`` instead — confirm which client version is installed.
    rerank_docs = co.rerank(
        query=query,
        documents=list(docs.keys()),
        top_n=top_n,
        model="rerank-english-v2.0",
    )
    comparison_data = []
    # Compare order change
    for i, doc in enumerate(rerank_docs):
        # ``rerank_i`` is the ORIGINAL rank of the text the reranker placed at
        # position ``i``. NOTE(review): it is emitted below under the key
        # "Reranked Rank", which appears mislabeled (the reranked rank of that
        # text is ``i``) — confirm the intended table semantics before relying
        # on these column names.
        rerank_i = docs[doc.document["text"]]
        comparison_data.append(
            {
                "Original Rank": i,
                "Original Text": i2doc[i],
                "Reranked Rank": rerank_i,
                "Reranked Text": doc.document["text"],
            }
        )
    return comparison_data
def evaluate_resumes(index, co, query, top_k=10, rerank_top_n=5):
    """Retrieve, rerank, and LLM-evaluate resumes matching ``query``.

    Pipeline: vector search for ``top_k`` hits -> Cohere rerank keeping
    ``rerank_top_n`` -> Cohere generate for a written HR-style evaluation.

    Args:
        index: Pinecone index handle.
        co: Cohere client exposing ``rerank`` and ``generate``.
        query: The evaluation task / search query.
        top_k: Number of vector-search hits to retrieve.
        rerank_top_n: Number of reranked resumes fed to the LLM.

    Returns:
        ``(evaluation_text, None)`` on success, or ``(None, error_message)``
        when no documents are found or generation fails.
    """
    print("Evaluating resumes...")
    docs = get_docs(index, query, top_k=top_k)
    if not docs:
        print("No documents found.")
        return None, "No documents found."

    doc_texts = list(docs.keys())
    rerank_response = co.rerank(
        query=query,
        documents=doc_texts,
        top_n=rerank_top_n,
        model="rerank-english-v2.0",
    )
    rerank_docs = [result.document for result in rerank_response.results]
    combined_resumes = "\n\n".join([doc["text"] for doc in rerank_docs])

    prompt = f"""
You are an HR professional with extensive experience in evaluating resumes for various job roles.This is the task you have been assigned.
Task:
{query}
Based on the resumes provided below, your task is to select the top candidates and provide a detailed justification for each selection, highlighting their skills, experience, and overall fit for a general job role. Focus solely on the evaluation and selection process, and ensure your response is clear, concise, and directly related to the task at hand.
---
Resumes:
{combined_resumes}
---
Please provide your selections and detailed justifications below:
"""
    response = co.generate(prompt=prompt)
    if response.generations:
        print("Resumes evaluated successfully!")
        return response.generations[0].text, None
    # BUG FIX: the original repeated this failure return three times; the two
    # extra copies after the if/else were unreachable dead code and are removed.
    print("Failed to generate a response.")
    return None, "Failed to generate a response."