Sanchayt committed on
Commit
ae266f3
·
1 Parent(s): 3c6f8cf
Files changed (3) hide show
  1. app.py +90 -0
  2. helpers.py +196 -0
  3. requirements.txt +143 -0
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import cohere
4
+ import openai
5
+ import pandas as pd
6
+ import streamlit as st
7
+ from dotenv import load_dotenv
8
+
9
+ import helpers
10
+
11
+ load_dotenv()
12
+
13
+
14
def initialize_apis():
    """Build the Cohere client and Pinecone index from session-state keys.

    Reads the API keys previously stored in ``st.session_state`` by the
    sidebar form. Returns a ``(cohere_client, pinecone_index)`` pair, or
    ``(None, None)`` when the keys have not been submitted yet.
    """
    # Guard clause: bail out until both model-provider keys are present.
    if not all(
        k in st.session_state for k in ("openai_api_key", "cohere_api_key")
    ):
        return None, None
    openai.api_key = st.session_state["openai_api_key"]
    cohere_client = cohere.Client(st.session_state["cohere_api_key"])
    pinecone_index = helpers.initialize_pinecone(
        st.session_state["api_key"], st.session_state["env"], "coherererank", 1536
    )
    return cohere_client, pinecone_index
24
+
25
+
26
# --- Sidebar: collect API credentials, prefilled from the environment. ---
with st.sidebar:
    api_key = st.text_input(
        "Enter Pinecone API key:", value=os.getenv("PINECONE_API_KEY", "")
    )
    env = st.text_input(
        "Enter Pinecone environment:", value=os.getenv("PINECONE_ENVIRONMENT", "")
    )
    openai_api_key = st.text_input(
        "Enter OpenAI API key:", value=os.getenv("OPENAI_API_KEY", "")
    )
    cohere_api_key = st.text_input(
        "Enter Cohere API key:", value=os.getenv("COHERE_API_KEY", "")
    )

    # Persist the keys in session_state so they survive Streamlit reruns.
    if st.button("Submit API Keys"):
        st.session_state["api_key"] = api_key
        st.session_state["env"] = env
        st.session_state["openai_api_key"] = openai_api_key
        st.session_state["cohere_api_key"] = cohere_api_key

# Check if API keys are set; the main panel only renders after all four
# keys have been submitted via the sidebar form.
if all(
    key in st.session_state
    for key in ["api_key", "env", "openai_api_key", "cohere_api_key"]
):
    # NOTE(review): this re-creates the clients and re-initializes Pinecone
    # on every rerun — consider caching (e.g. st.cache_resource). Confirm
    # before changing; initialization also waits for index readiness.
    co, index = initialize_apis()
    if co and index:
        query = st.text_input("Enter search query:")
        top_k = st.number_input(
            "Top K resumes to fetch:", min_value=1, max_value=50, value=10
        )
        # rerank_top_n is capped at top_k so reranking never asks for more
        # documents than were retrieved.
        rerank_top_n = st.number_input(
            "Top N resumes to rerank:", min_value=1, max_value=top_k, value=5
        )

        if st.button("Search"):
            if query:
                with st.spinner("Fetching and evaluating resumes..."):
                    # Build/refresh the synthetic corpus, then run the
                    # retrieve -> rerank -> LLM-evaluate pipeline.
                    dataset = helpers.create_dataset()
                    helpers.insert_to_pinecone(index, dataset)
                    evaluation, error = helpers.evaluate_resumes(
                        index, co, query, top_k=top_k, rerank_top_n=rerank_top_n
                    )

                    # Side-by-side original vs reranked ordering.
                    comparison_data = helpers.compare(
                        index, co, query, top_k=top_k, top_n=rerank_top_n
                    )

                    if evaluation:
                        st.markdown("### Evaluation:")
                        st.markdown(evaluation)

                        # Display the comparison results
                        st.markdown("### Original vs Reranked Docs Comparison:")
                        st.write("---")

                        df_comparison = pd.DataFrame(comparison_data)
                        st.table(df_comparison)

                    elif error:
                        st.warning(error)
            else:
                st.warning("Please enter a query.")
89
+
90
+
helpers.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import time
3
+
4
+ import faker
5
+ import openai
6
+ import pinecone
7
+ import tqdm
8
+ from datasets import Dataset
9
+
10
# Shared Faker instance used to fabricate resume text.
fake = faker.Faker()

# Default Pinecone index name (app.py hard-codes the same value).
index_name = "coherererank"
dimension = 1536  # Dimensionality of the ada-002 model
embed_model = "text-embedding-ada-002"
15
+
16
+
17
def initialize_pinecone(api_key, env, index_name, dimension):
    """Connect to Pinecone and return a handle to ``index_name``.

    Creates the index (dotproduct metric) when it does not exist yet,
    polling once per second until Pinecone reports it ready.
    """
    print("Initializing Pinecone...")
    pinecone.init(api_key=api_key, environment=env)
    missing = index_name not in pinecone.list_indexes()
    if missing:
        print(f"Creating Pinecone index: {index_name}")
        pinecone.create_index(index_name, dimension=dimension, metric="dotproduct")
        # A freshly created index is not immediately queryable; poll status.
        while not pinecone.describe_index(index_name).status["ready"]:
            print("Waiting for index to be ready...")
            time.sleep(1)
    handle = pinecone.Index(index_name)
    print("Pinecone initialized successfully!")
    return handle
29
+
30
+
31
def generate_resume():
    """Fabricate one synthetic resume record.

    Returns a dict with a uuid ``id``, a multi-line ``text`` blob
    (name, job, company, skills, experience) and a ``metadata`` dict
    carrying experience and education fields.
    """
    print("Generating a synthetic resume...")
    resume_id = fake.uuid4()
    resume_text = (
        f"{fake.name()}\n{fake.job()}\n{fake.company()}\n{fake.catch_phrase()}\n"
        f"Skills: {', '.join(fake.words(ext_word_list=None, unique=True))}\n"
        f"Experience: {fake.bs()} at {fake.company()} for {random.randint(1, 10)} years."
    )
    result = {
        "id": resume_id,
        "text": resume_text,
        "metadata": {
            "experience": f"{random.randint(1, 10)} years",
            "education": random.choice(["Bachelor's", "Master's", "PhD"]),
        },
    }
    print("Synthetic resume generated successfully!")
    return result
43
+
44
+
45
def create_dataset(num_resumes=1000, chunk_size=800):
    """Build a HuggingFace ``Dataset`` of chunked synthetic resumes.

    Every resume's text is sliced into ``chunk_size``-character pieces;
    each piece becomes one row with id ``"<resume-id>-<chunk-index>"``,
    the chunk text, and fixed demo metadata. The metadata repeats the
    chunk text so vector-search results can return it directly.
    """
    print("Creating dataset...")
    resumes = [generate_resume() for _ in range(num_resumes)]

    ids, texts, metadatas = [], [], []
    for resume in resumes:
        full_text = resume["text"]
        for chunk_no, start in enumerate(range(0, len(full_text), chunk_size)):
            piece = full_text[start : start + chunk_size]
            piece_id = f'{resume["id"]}-{chunk_no}'
            ids.append(piece_id)
            texts.append(piece)
            metadatas.append(
                {
                    "title": "Resume Chunk",
                    "url": f"https://example.com/resume/{piece_id}",
                    "primary_category": "Resume",
                    "published": "20231028",
                    "updated": "20231028",
                    "text": piece,
                }
            )

    formatted_dataset = Dataset.from_dict(
        {"id": ids, "text": texts, "metadata": metadatas}
    )
    print("Dataset created successfully!")
    return formatted_dataset
78
+
79
+
80
def embed(docs: list[str]) -> list[list[float]]:
    """Embed ``docs`` with the module-level OpenAI model (ada-002).

    Returns one embedding vector per input document, in input order.
    """
    print("Embedding documents...")
    response = openai.Embedding.create(input=docs, engine=embed_model)
    print("Documents embedded successfully!")
    return [record["embedding"] for record in response["data"]]
85
+
86
+
87
+
88
def insert_to_pinecone(index, dataset, batch_size=100):
    """Embed and upsert ``dataset`` rows into ``index``, ``batch_size`` at a time.

    The synthetic corpus only needs loading once, so this is a no-op when
    the index already contains any vectors. Each upserted vector is the
    triple ``(id, embedding, metadata)``.

    Args:
        index: Pinecone index handle.
        dataset: HuggingFace Dataset with "id", "text", "metadata" columns.
        batch_size: number of rows embedded and upserted per request.
    """
    print("Inserting data to Pinecone...")

    # Skip entirely when the index is already populated.
    index_stats = index.describe_index_stats()
    if index_stats.total_vector_count > 0:
        print("Pinecone index is not empty. No new data will be inserted.")
        return

    # BUG FIX: the previous version tried to dedupe via
    # index.fetch(...).get("id", []), but Pinecone fetch responses key the
    # found vectors under "vectors" (never "id"), so the filter removed
    # nothing — and the early return above already guarantees the index is
    # empty. The dead dedupe pass has been removed.
    if len(dataset) == 0:
        print("All data is already present in the Pinecone index.")
        return

    total_batches = (len(dataset) - 1) // batch_size + 1
    for start in range(0, len(dataset), batch_size):
        batch = dataset[start : start + batch_size]  # dict of column slices
        embeds = embed(batch["text"])
        to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
        index.upsert(vectors=to_upsert)
        print(f"Batch {start // batch_size + 1}/{total_batches} inserted.")

    print("New data inserted to Pinecone successfully!")
119
+
120
+
121
def get_docs(index, query: str, top_k: int):
    """Vector-search ``index`` for ``query``.

    Returns ``{doc_text: original_rank}`` for the top ``top_k`` matches.
    NOTE(review): identical chunk texts collapse into a single dict key —
    confirm duplicates are acceptable for the callers.
    """
    print("Fetching documents from Pinecone...")
    query_vec = embed([query])[0]
    matches = index.query(query_vec, top_k=top_k, include_metadata=True)["matches"]
    ranked = {m["metadata"]["text"]: rank for rank, m in enumerate(matches)}
    print("Documents fetched successfully!")
    return ranked
128
+
129
+
130
def compare(index, co, query, top_k=25, top_n=3):
    """Return rows pairing the original vector-search order with Cohere's rerank.

    Row ``i`` holds the text originally ranked ``i`` alongside the text that
    the reranker placed at position ``i`` (plus that doc's original rank).
    """
    # Get vec search results: {text: original_rank}
    docs = get_docs(index, query, top_k=top_k)
    # Inverse mapping: original_rank -> text
    i2doc = {docs[doc]: doc for doc in docs.keys()}

    # Re-rank the retrieved texts with Cohere.
    rerank_docs = co.rerank(
        query=query,
        documents=list(docs.keys()),
        top_n=top_n,
        model="rerank-english-v2.0",
    )

    comparison_data = []
    # Compare order change: i is the RERANKED position; rerank_i is the
    # ORIGINAL rank of the doc now sitting at reranked position i.
    for i, doc in enumerate(rerank_docs):
        rerank_i = docs[doc.document["text"]]

        # NOTE(review): the column labels look swapped — 'Original Rank'
        # carries the reranked position i, and 'Reranked Rank' carries the
        # doc's original rank. Verify against how the table is read in
        # app.py before renaming anything.
        comparison_data.append({
            'Original Rank': i,
            'Original Text': i2doc[i],
            'Reranked Rank': rerank_i,
            'Reranked Text': doc.document['text']
        })
    return comparison_data
155
+
156
+
157
def evaluate_resumes(index, co, query, top_k=10, rerank_top_n=5):
    """Retrieve, rerank, and LLM-evaluate resumes against ``query``.

    Fetches ``top_k`` resume chunks from Pinecone, keeps the
    ``rerank_top_n`` best per Cohere's reranker, and asks Cohere's
    generate endpoint to pick and justify top candidates.

    Returns:
        ``(evaluation_text, None)`` on success, or ``(None, error_message)``
        when no documents were found or generation produced nothing.
    """
    print("Evaluating resumes...")
    docs = get_docs(index, query, top_k=top_k)
    if not docs:
        print("No documents found.")
        return None, "No documents found."
    doc_texts = list(docs.keys())
    rerank_response = co.rerank(
        query=query,
        documents=doc_texts,
        top_n=rerank_top_n,
        model="rerank-english-v2.0",
    )
    rerank_docs = [result.document for result in rerank_response.results]
    combined_resumes = "\n\n".join([doc["text"] for doc in rerank_docs])

    prompt = f"""
    You are an HR professional with extensive experience in evaluating resumes for various job roles.This is the task you have been assigned.
    Task:
    {query}
    Based on the resumes provided below, your task is to select the top candidates and provide a detailed justification for each selection, highlighting their skills, experience, and overall fit for a general job role. Focus solely on the evaluation and selection process, and ensure your response is clear, concise, and directly related to the task at hand.

    ---

    Resumes:
    {combined_resumes}

    ---

    Please provide your selections and detailed justifications below:
    """
    response = co.generate(prompt=prompt)
    if response.generations:
        print("Resumes evaluated successfully!")
        return response.generations[0].text, None
    # BUG FIX: removed two duplicated, unreachable `return None, ...`
    # statements that followed the original if/else (both branches
    # already returned).
    print("Failed to generate a response.")
    return None, "Failed to generate a response."
requirements.txt ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.6
2
+ aiosignal==1.3.1
3
+ aiostream==0.5.2
4
+ altair==5.1.2
5
+ annotated-types==0.6.0
6
+ anyio==3.7.1
7
+ appnope==0.1.3
8
+ asttokens==2.4.1
9
+ async-timeout==4.0.3
10
+ attrs==23.1.0
11
+ backcall==0.2.0
12
+ backoff==2.2.1
13
+ blinker==1.6.3
14
+ cachetools==5.3.2
15
+ certifi==2023.7.22
16
+ cffi==1.16.0
17
+ charset-normalizer==3.3.1
18
+ click==8.1.7
19
+ cohere==4.32
20
+ comm==0.1.4
21
+ cryptography==41.0.5
22
+ dataclasses-json==0.5.14
23
+ datasets==2.14.6
24
+ debugpy==1.8.0
25
+ decorator==5.1.1
26
+ Deprecated==1.2.14
27
+ dill==0.3.7
28
+ dnspython==2.4.2
29
+ et-xmlfile==1.1.0
30
+ exceptiongroup==1.1.3
31
+ executing==2.0.0
32
+ Faker==19.12.0
33
+ fastavro==1.8.2
34
+ filelock==3.13.0
35
+ frozenlist==1.4.0
36
+ fsspec==2023.10.0
37
+ fuzzywuzzy==0.18.0
38
+ gitdb==4.0.11
39
+ GitPython==3.1.40
40
+ greenlet==3.0.1
41
+ grpcio==1.59.0
42
+ grpcio-tools==1.59.0
43
+ h11==0.14.0
44
+ h2==4.1.0
45
+ hpack==4.0.0
46
+ httpcore==0.18.0
47
+ httpx==0.25.0
48
+ huggingface-hub==0.18.0
49
+ hyperframe==6.0.1
50
+ idna==3.4
51
+ importlib-metadata==6.8.0
52
+ ipykernel==6.26.0
53
+ ipython==8.16.1
54
+ jedi==0.19.1
55
+ Jinja2==3.1.2
56
+ joblib==1.3.2
57
+ jsonpatch==1.33
58
+ jsonpointer==2.4
59
+ jsonschema==4.19.1
60
+ jsonschema-specifications==2023.7.1
61
+ jupyter_client==8.5.0
62
+ jupyter_core==5.4.0
63
+ langchain==0.0.325
64
+ langsmith==0.0.53
65
+ Levenshtein==0.23.0
66
+ llama-index==0.8.53.post3
67
+ loguru==0.7.2
68
+ markdown-it-py==3.0.0
69
+ MarkupSafe==2.1.3
70
+ marshmallow==3.20.1
71
+ matplotlib-inline==0.1.6
72
+ mdurl==0.1.2
73
+ multidict==6.0.4
74
+ multiprocess==0.70.15
75
+ mypy-extensions==1.0.0
76
+ nest-asyncio==1.5.8
77
+ nltk==3.8.1
78
+ numpy==1.26.1
79
+ openai==0.28.1
80
+ openpyxl==3.1.2
81
+ packaging==23.2
82
+ pandas==2.1.2
83
+ parso==0.8.3
84
+ pdfminer.six==20221105
85
+ pdfplumber==0.10.3
86
+ pexpect==4.8.0
87
+ pickleshare==0.7.5
88
+ Pillow==10.1.0
89
+ pinecone-client==2.2.4
90
+ platformdirs==3.11.0
91
+ portalocker==2.8.2
92
+ prompt-toolkit==3.0.39
93
+ protobuf==4.24.4
94
+ psutil==5.9.6
95
+ ptyprocess==0.7.0
96
+ pure-eval==0.2.2
97
+ pyarrow==13.0.0
98
+ pycparser==2.21
99
+ pydantic==2.4.2
100
+ pydantic_core==2.10.1
101
+ pydeck==0.8.1b0
102
+ Pygments==2.16.1
103
+ pypdf==3.16.4
104
+ PyPDF2==3.0.1
105
+ pypdfium2==4.22.0
106
+ python-dateutil==2.8.2
107
+ python-dotenv==1.0.0
108
+ python-Levenshtein==0.23.0
109
+ pytz==2023.3.post1
110
+ PyYAML==6.0.1
111
+ pyzmq==25.1.1
112
+ qdrant-client==1.6.4
113
+ rapidfuzz==3.4.0
114
+ referencing==0.30.2
115
+ regex==2023.10.3
116
+ requests==2.31.0
117
+ rich==13.6.0
118
+ rpds-py==0.10.6
119
+ six==1.16.0
120
+ smmap==5.0.1
121
+ sniffio==1.3.0
122
+ SQLAlchemy==2.0.22
123
+ stack-data==0.6.3
124
+ streamlit==1.28.0
125
+ tenacity==8.2.3
126
+ tiktoken==0.5.1
127
+ toml==0.10.2
128
+ toolz==0.12.0
129
+ tornado==6.3.3
130
+ tqdm==4.66.1
131
+ traitlets==5.12.0
132
+ typing-inspect==0.9.0
133
+ typing_extensions==4.8.0
134
+ tzdata==2023.3
135
+ tzlocal==5.2
136
+ urllib3==1.26.18
137
+ validators==0.22.0
138
+ watchdog==3.0.0
139
+ wcwidth==0.2.8
140
+ wrapt==1.15.0
141
+ xxhash==3.4.1
142
+ yarl==1.9.2
143
+ zipp==3.17.0