Sanchayt committed on
Commit
ae266f3
·
1 Parent(s): 3c6f8cf
Files changed (3) hide show
  1. app.py +90 -0
  2. helpers.py +196 -0
  3. requirements.txt +143 -0
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import cohere
4
+ import openai
5
+ import pandas as pd
6
+ import streamlit as st
7
+ from dotenv import load_dotenv
8
+
9
+ import helpers
10
+
11
+ load_dotenv()
12
+
13
+
14
def initialize_apis():
    """Build the Cohere client and Pinecone index from session-state keys.

    Reads the API keys previously stored in ``st.session_state`` by the
    sidebar form. Returns a ``(cohere_client, pinecone_index)`` pair, or
    ``(None, None)`` when the keys have not been submitted yet.
    """
    # Guard clause: bail out until both model-provider keys are present.
    if not all(
        k in st.session_state for k in ("openai_api_key", "cohere_api_key")
    ):
        return None, None
    openai.api_key = st.session_state["openai_api_key"]
    cohere_client = cohere.Client(st.session_state["cohere_api_key"])
    pinecone_index = helpers.initialize_pinecone(
        st.session_state["api_key"], st.session_state["env"], "coherererank", 1536
    )
    return cohere_client, pinecone_index
24
+
25
+
26
# --- Sidebar: collect API credentials, prefilled from the environment. ---
with st.sidebar:
    api_key = st.text_input(
        "Enter Pinecone API key:", value=os.getenv("PINECONE_API_KEY", "")
    )
    env = st.text_input(
        "Enter Pinecone environment:", value=os.getenv("PINECONE_ENVIRONMENT", "")
    )
    openai_api_key = st.text_input(
        "Enter OpenAI API key:", value=os.getenv("OPENAI_API_KEY", "")
    )
    cohere_api_key = st.text_input(
        "Enter Cohere API key:", value=os.getenv("COHERE_API_KEY", "")
    )

    # Persist the keys in session_state so they survive Streamlit reruns.
    if st.button("Submit API Keys"):
        st.session_state["api_key"] = api_key
        st.session_state["env"] = env
        st.session_state["openai_api_key"] = openai_api_key
        st.session_state["cohere_api_key"] = cohere_api_key

# Check if API keys are set; the main panel only renders after all four
# keys have been submitted via the sidebar form.
if all(
    key in st.session_state
    for key in ["api_key", "env", "openai_api_key", "cohere_api_key"]
):
    # NOTE(review): this re-creates the clients and re-initializes Pinecone
    # on every rerun — consider caching (e.g. st.cache_resource). Confirm
    # before changing; initialization also waits for index readiness.
    co, index = initialize_apis()
    if co and index:
        query = st.text_input("Enter search query:")
        top_k = st.number_input(
            "Top K resumes to fetch:", min_value=1, max_value=50, value=10
        )
        # rerank_top_n is capped at top_k so reranking never asks for more
        # documents than were retrieved.
        rerank_top_n = st.number_input(
            "Top N resumes to rerank:", min_value=1, max_value=top_k, value=5
        )

        if st.button("Search"):
            if query:
                with st.spinner("Fetching and evaluating resumes..."):
                    # Build/refresh the synthetic corpus, then run the
                    # retrieve -> rerank -> LLM-evaluate pipeline.
                    dataset = helpers.create_dataset()
                    helpers.insert_to_pinecone(index, dataset)
                    evaluation, error = helpers.evaluate_resumes(
                        index, co, query, top_k=top_k, rerank_top_n=rerank_top_n
                    )

                    # Side-by-side original vs reranked ordering.
                    comparison_data = helpers.compare(
                        index, co, query, top_k=top_k, top_n=rerank_top_n
                    )

                    if evaluation:
                        st.markdown("### Evaluation:")
                        st.markdown(evaluation)

                        # Display the comparison results
                        st.markdown("### Original vs Reranked Docs Comparison:")
                        st.write("---")

                        df_comparison = pd.DataFrame(comparison_data)
                        st.table(df_comparison)

                    elif error:
                        st.warning(error)
            else:
                st.warning("Please enter a query.")
89
+
90
+
helpers.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import time
3
+
4
+ import faker
5
+ import openai
6
+ import pinecone
7
+ import tqdm
8
+ from datasets import Dataset
9
+
10
# Shared Faker instance used to fabricate resume text.
fake = faker.Faker()

# Default Pinecone index name (app.py hard-codes the same value).
index_name = "coherererank"
dimension = 1536  # Dimensionality of the ada-002 model
embed_model = "text-embedding-ada-002"
15
+
16
+
17
def initialize_pinecone(api_key, env, index_name, dimension):
    """Connect to Pinecone and return a handle to ``index_name``.

    Creates the index (dotproduct metric) when it does not exist yet,
    polling once per second until Pinecone reports it ready.
    """
    print("Initializing Pinecone...")
    pinecone.init(api_key=api_key, environment=env)
    missing = index_name not in pinecone.list_indexes()
    if missing:
        print(f"Creating Pinecone index: {index_name}")
        pinecone.create_index(index_name, dimension=dimension, metric="dotproduct")
        # A freshly created index is not immediately queryable; poll status.
        while not pinecone.describe_index(index_name).status["ready"]:
            print("Waiting for index to be ready...")
            time.sleep(1)
    handle = pinecone.Index(index_name)
    print("Pinecone initialized successfully!")
    return handle
29
+
30
+
31
def generate_resume():
    """Fabricate one synthetic resume record.

    Returns a dict with a uuid ``id``, a multi-line ``text`` blob
    (name, job, company, skills, experience) and a ``metadata`` dict
    carrying experience and education fields.
    """
    print("Generating a synthetic resume...")
    resume_id = fake.uuid4()
    resume_text = (
        f"{fake.name()}\n{fake.job()}\n{fake.company()}\n{fake.catch_phrase()}\n"
        f"Skills: {', '.join(fake.words(ext_word_list=None, unique=True))}\n"
        f"Experience: {fake.bs()} at {fake.company()} for {random.randint(1, 10)} years."
    )
    result = {
        "id": resume_id,
        "text": resume_text,
        "metadata": {
            "experience": f"{random.randint(1, 10)} years",
            "education": random.choice(["Bachelor's", "Master's", "PhD"]),
        },
    }
    print("Synthetic resume generated successfully!")
    return result
43
+
44
+
45
def create_dataset(num_resumes=1000, chunk_size=800):
    """Build a HuggingFace ``Dataset`` of chunked synthetic resumes.

    Every resume's text is sliced into ``chunk_size``-character pieces;
    each piece becomes one row with id ``"<resume-id>-<chunk-index>"``,
    the chunk text, and fixed demo metadata. The metadata repeats the
    chunk text so vector-search results can return it directly.
    """
    print("Creating dataset...")
    resumes = [generate_resume() for _ in range(num_resumes)]

    ids, texts, metadatas = [], [], []
    for resume in resumes:
        full_text = resume["text"]
        for chunk_no, start in enumerate(range(0, len(full_text), chunk_size)):
            piece = full_text[start : start + chunk_size]
            piece_id = f'{resume["id"]}-{chunk_no}'
            ids.append(piece_id)
            texts.append(piece)
            metadatas.append(
                {
                    "title": "Resume Chunk",
                    "url": f"https://example.com/resume/{piece_id}",
                    "primary_category": "Resume",
                    "published": "20231028",
                    "updated": "20231028",
                    "text": piece,
                }
            )

    formatted_dataset = Dataset.from_dict(
        {"id": ids, "text": texts, "metadata": metadatas}
    )
    print("Dataset created successfully!")
    return formatted_dataset
78
+
79
+
80
def embed(docs: list[str]) -> list[list[float]]:
    """Embed ``docs`` with the module-level OpenAI model (ada-002).

    Returns one embedding vector per input document, in input order.
    """
    print("Embedding documents...")
    response = openai.Embedding.create(input=docs, engine=embed_model)
    print("Documents embedded successfully!")
    return [record["embedding"] for record in response["data"]]
85
+
86
+
87
+
88
def insert_to_pinecone(index, dataset, batch_size=100):
    """Embed and upsert ``dataset`` rows into ``index``, ``batch_size`` at a time.

    The synthetic corpus only needs loading once, so this is a no-op when
    the index already contains any vectors. Each upserted vector is the
    triple ``(id, embedding, metadata)``.

    Args:
        index: Pinecone index handle.
        dataset: HuggingFace Dataset with "id", "text", "metadata" columns.
        batch_size: number of rows embedded and upserted per request.
    """
    print("Inserting data to Pinecone...")

    # Skip entirely when the index is already populated.
    index_stats = index.describe_index_stats()
    if index_stats.total_vector_count > 0:
        print("Pinecone index is not empty. No new data will be inserted.")
        return

    # BUG FIX: the previous version tried to dedupe via
    # index.fetch(...).get("id", []), but Pinecone fetch responses key the
    # found vectors under "vectors" (never "id"), so the filter removed
    # nothing — and the early return above already guarantees the index is
    # empty. The dead dedupe pass has been removed.
    if len(dataset) == 0:
        print("All data is already present in the Pinecone index.")
        return

    total_batches = (len(dataset) - 1) // batch_size + 1
    for start in range(0, len(dataset), batch_size):
        batch = dataset[start : start + batch_size]  # dict of column slices
        embeds = embed(batch["text"])
        to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
        index.upsert(vectors=to_upsert)
        print(f"Batch {start // batch_size + 1}/{total_batches} inserted.")

    print("New data inserted to Pinecone successfully!")
119
+
120
+
121
def get_docs(index, query: str, top_k: int):
    """Vector-search ``index`` for ``query``.

    Returns ``{doc_text: original_rank}`` for the top ``top_k`` matches.
    NOTE(review): identical chunk texts collapse into a single dict key —
    confirm duplicates are acceptable for the callers.
    """
    print("Fetching documents from Pinecone...")
    query_vec = embed([query])[0]
    matches = index.query(query_vec, top_k=top_k, include_metadata=True)["matches"]
    ranked = {m["metadata"]["text"]: rank for rank, m in enumerate(matches)}
    print("Documents fetched successfully!")
    return ranked
128
+
129
+
130
def compare(index, co, query, top_k=25, top_n=3):
    """Return rows pairing the original vector-search order with Cohere's rerank.

    Row ``i`` holds the text originally ranked ``i`` alongside the text that
    the reranker placed at position ``i`` (plus that doc's original rank).
    """
    # Get vec search results: {text: original_rank}
    docs = get_docs(index, query, top_k=top_k)
    # Inverse mapping: original_rank -> text
    i2doc = {docs[doc]: doc for doc in docs.keys()}

    # Re-rank the retrieved texts with Cohere.
    rerank_docs = co.rerank(
        query=query,
        documents=list(docs.keys()),
        top_n=top_n,
        model="rerank-english-v2.0",
    )

    comparison_data = []
    # Compare order change: i is the RERANKED position; rerank_i is the
    # ORIGINAL rank of the doc now sitting at reranked position i.
    for i, doc in enumerate(rerank_docs):
        rerank_i = docs[doc.document["text"]]

        # NOTE(review): the column labels look swapped — 'Original Rank'
        # carries the reranked position i, and 'Reranked Rank' carries the
        # doc's original rank. Verify against how the table is read in
        # app.py before renaming anything.
        comparison_data.append({
            'Original Rank': i,
            'Original Text': i2doc[i],
            'Reranked Rank': rerank_i,
            'Reranked Text': doc.document['text']
        })
    return comparison_data
155
+
156
+
157
def evaluate_resumes(index, co, query, top_k=10, rerank_top_n=5):
    """Retrieve, rerank, and LLM-evaluate resumes against ``query``.

    Fetches ``top_k`` resume chunks from Pinecone, keeps the
    ``rerank_top_n`` best per Cohere's reranker, and asks Cohere's
    generate endpoint to pick and justify top candidates.

    Returns:
        ``(evaluation_text, None)`` on success, or ``(None, error_message)``
        when no documents were found or generation produced nothing.
    """
    print("Evaluating resumes...")
    docs = get_docs(index, query, top_k=top_k)
    if not docs:
        print("No documents found.")
        return None, "No documents found."
    doc_texts = list(docs.keys())
    rerank_response = co.rerank(
        query=query,
        documents=doc_texts,
        top_n=rerank_top_n,
        model="rerank-english-v2.0",
    )
    rerank_docs = [result.document for result in rerank_response.results]
    combined_resumes = "\n\n".join([doc["text"] for doc in rerank_docs])

    prompt = f"""
    You are an HR professional with extensive experience in evaluating resumes for various job roles.This is the task you have been assigned.
    Task:
    {query}
    Based on the resumes provided below, your task is to select the top candidates and provide a detailed justification for each selection, highlighting their skills, experience, and overall fit for a general job role. Focus solely on the evaluation and selection process, and ensure your response is clear, concise, and directly related to the task at hand.

    ---

    Resumes:
    {combined_resumes}

    ---

    Please provide your selections and detailed justifications below:
    """
    response = co.generate(prompt=prompt)
    if response.generations:
        print("Resumes evaluated successfully!")
        return response.generations[0].text, None
    # BUG FIX: removed two duplicated, unreachable `return None, ...`
    # statements that followed the original if/else (both branches
    # already returned).
    print("Failed to generate a response.")
    return None, "Failed to generate a response."
requirements.txt ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.6
2
+ aiosignal==1.3.1
3
+ aiostream==0.5.2
4
+ altair==5.1.2
5
+ annotated-types==0.6.0
6
+ anyio==3.7.1
7
+ appnope==0.1.3
8
+ asttokens==2.4.1
9
+ async-timeout==4.0.3
10
+ attrs==23.1.0
11
+ backcall==0.2.0
12
+ backoff==2.2.1
13
+ blinker==1.6.3
14
+ cachetools==5.3.2
15
+ certifi==2023.7.22
16
+ cffi==1.16.0
17
+ charset-normalizer==3.3.1
18
+ click==8.1.7
19
+ cohere==4.32
20
+ comm==0.1.4
21
+ cryptography==41.0.5
22
+ dataclasses-json==0.5.14
23
+ datasets==2.14.6
24
+ debugpy==1.8.0
25
+ decorator==5.1.1
26
+ Deprecated==1.2.14
27
+ dill==0.3.7
28
+ dnspython==2.4.2
29
+ et-xmlfile==1.1.0
30
+ exceptiongroup==1.1.3
31
+ executing==2.0.0
32
+ Faker==19.12.0
33
+ fastavro==1.8.2
34
+ filelock==3.13.0
35
+ frozenlist==1.4.0
36
+ fsspec==2023.10.0
37
+ fuzzywuzzy==0.18.0
38
+ gitdb==4.0.11
39
+ GitPython==3.1.40
40
+ greenlet==3.0.1
41
+ grpcio==1.59.0
42
+ grpcio-tools==1.59.0
43
+ h11==0.14.0
44
+ h2==4.1.0
45
+ hpack==4.0.0
46
+ httpcore==0.18.0
47
+ httpx==0.25.0
48
+ huggingface-hub==0.18.0
49
+ hyperframe==6.0.1
50
+ idna==3.4
51
+ importlib-metadata==6.8.0
52
+ ipykernel==6.26.0
53
+ ipython==8.16.1
54
+ jedi==0.19.1
55
+ Jinja2==3.1.2
56
+ joblib==1.3.2
57
+ jsonpatch==1.33
58
+ jsonpointer==2.4
59
+ jsonschema==4.19.1
60
+ jsonschema-specifications==2023.7.1
61
+ jupyter_client==8.5.0
62
+ jupyter_core==5.4.0
63
+ langchain==0.0.325
64
+ langsmith==0.0.53
65
+ Levenshtein==0.23.0
66
+ llama-index==0.8.53.post3
67
+ loguru==0.7.2
68
+ markdown-it-py==3.0.0
69
+ MarkupSafe==2.1.3
70
+ marshmallow==3.20.1
71
+ matplotlib-inline==0.1.6
72
+ mdurl==0.1.2
73
+ multidict==6.0.4
74
+ multiprocess==0.70.15
75
+ mypy-extensions==1.0.0
76
+ nest-asyncio==1.5.8
77
+ nltk==3.8.1
78
+ numpy==1.26.1
79
+ openai==0.28.1
80
+ openpyxl==3.1.2
81
+ packaging==23.2
82
+ pandas==2.1.2
83
+ parso==0.8.3
84
+ pdfminer.six==20221105
85
+ pdfplumber==0.10.3
86
+ pexpect==4.8.0
87
+ pickleshare==0.7.5
88
+ Pillow==10.1.0
89
+ pinecone-client==2.2.4
90
+ platformdirs==3.11.0
91
+ portalocker==2.8.2
92
+ prompt-toolkit==3.0.39
93
+ protobuf==4.24.4
94
+ psutil==5.9.6
95
+ ptyprocess==0.7.0
96
+ pure-eval==0.2.2
97
+ pyarrow==13.0.0
98
+ pycparser==2.21
99
+ pydantic==2.4.2
100
+ pydantic_core==2.10.1
101
+ pydeck==0.8.1b0
102
+ Pygments==2.16.1
103
+ pypdf==3.16.4
104
+ PyPDF2==3.0.1
105
+ pypdfium2==4.22.0
106
+ python-dateutil==2.8.2
107
+ python-dotenv==1.0.0
108
+ python-Levenshtein==0.23.0
109
+ pytz==2023.3.post1
110
+ PyYAML==6.0.1
111
+ pyzmq==25.1.1
112
+ qdrant-client==1.6.4
113
+ rapidfuzz==3.4.0
114
+ referencing==0.30.2
115
+ regex==2023.10.3
116
+ requests==2.31.0
117
+ rich==13.6.0
118
+ rpds-py==0.10.6
119
+ six==1.16.0
120
+ smmap==5.0.1
121
+ sniffio==1.3.0
122
+ SQLAlchemy==2.0.22
123
+ stack-data==0.6.3
124
+ streamlit==1.28.0
125
+ tenacity==8.2.3
126
+ tiktoken==0.5.1
127
+ toml==0.10.2
128
+ toolz==0.12.0
129
+ tornado==6.3.3
130
+ tqdm==4.66.1
131
+ traitlets==5.12.0
132
+ typing-inspect==0.9.0
133
+ typing_extensions==4.8.0
134
+ tzdata==2023.3
135
+ tzlocal==5.2
136
+ urllib3==1.26.18
137
+ validators==0.22.0
138
+ watchdog==3.0.0
139
+ wcwidth==0.2.8
140
+ wrapt==1.15.0
141
+ xxhash==3.4.1
142
+ yarl==1.9.2
143
+ zipp==3.17.0