Spaces:

jonah-ramponi
/

CV_Reviewer

Build error

App Files Files Community

Jonah Ramponi committed on Aug 27, 2024

Commit

22be37d

1 Parent(s): 1c878ea

commiting day1

Browse files

Files changed (8) hide show

.gitignore +128 -0
app.py +126 -0
backend.py +41 -0
requirements.txt +4 -0
utils/__init__.py +0 -0
utils/gpt.py +40 -0
utils/process_doc.py +40 -0
utils/prompts.py +124 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,128 @@

+# Project Specific
+sample_data/
+# Editors
+.vscode/
+.idea/
+# Vagrant
+.vagrant/
+# Mac/OSX
+.DS_Store
+# Windows
+Thumbs.db
+# Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# celery beat schedule file
+celerybeat-schedule
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json

app.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""
+    For HF, the interface should be called app.py
+"""
+import json
+import concurrent.futures
+import streamlit as st
+from utils.process_doc import parse_docx, parse_pdf
+from backend import process_cv, process_job_posting
+from utils.gpt import test_api_key
+def _extract_json_object(raw_text: str) -> str:
+    """Return the outermost ``{...}`` span found in a model reply.
+    The reply may be wrapped in prose or a Markdown code fence, so everything
+    before the first ``{`` and after the last ``}`` is discarded. Slicing (rather
+    than stripping brace characters) keeps nested closing braces intact. When no
+    brace pair is present the text is wrapped in braces as a best-effort fallback.
+    """
+    start = raw_text.find("{")
+    end = raw_text.rfind("}")
+    if start != -1 and end > start:
+        return raw_text[start : end + 1]
+    return "{" + raw_text.lstrip().lstrip("{").rstrip().rstrip("}") + "}"
+st.set_page_config(layout="wide")
+# The Cohere key is collected per session in the sidebar
+with st.sidebar:
+    COHERE_API_KEY = st.text_input(
+        "Cohere API Key Entry",
+        value="",
+        placeholder="Enter your Free Tier Cohere API Key",
+    )
+# Parsed documents and their structured JSON live in session state so they
+# survive Streamlit's script reruns
+if "state" not in st.session_state:
+    st.session_state.state = {"formatted": False}
+STATE = st.session_state.state
+cv_upload_box = st.file_uploader(
+    "CV Upload Box",
+    help="Upload your CV in .docx or .pdf form. This CV will be parsed, and used to analyse against the given job post.",
+    type=["docx", "pdf"],
+    accept_multiple_files=False,
+)
+job_posting_upload_box = st.text_area(
+    "Job Description Upload Box",
+    placeholder="Copy and Paste a job post you are interested in. Make sure to include the full post! More information is better.",
+    help="In this box, please dump text content for a job description you are interested in. This could easily be setup to work directly with a webpage (we'd simply need to scrape said page) however I do not want to do that on HF spaces.",
+)
+if cv_upload_box and job_posting_upload_box != "":
+    process_files = st.button("Process Files", type="primary")
+    if process_files:
+        if test_api_key(COHERE_API_KEY):
+            # Process our two uploaded files into state variables
+            STATE["job_posting"] = job_posting_upload_box
+            # Lower-cased so an upload named e.g. "CV.DOCX" is not sent to the PDF parser
+            cv_filetype = cv_upload_box.name.split(".")[-1].lower()
+            cv_file_contents = cv_upload_box.getvalue()
+            STATE["cv"] = (
+                parse_docx(cv_file_contents)
+                if cv_filetype == "docx"
+                else parse_pdf(cv_file_contents)
+            )
+            # Now, use Cohere to get structured output for both cv and job_posting
+            # Making these calls in parallel
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future1 = executor.submit(process_cv, STATE["cv"], COHERE_API_KEY)
+                future2 = executor.submit(
+                    process_job_posting, STATE["job_posting"], COHERE_API_KEY
+                )
+                cv_json_text = future1.result()
+                job_posting_json_text = future2.result()
+            cv_json_text = _extract_json_object(cv_json_text)
+            job_posting_json_text = _extract_json_object(job_posting_json_text)
+            # A reply that still fails to parse is replaced by a "Failed" stub so
+            # the download buttons below keep working
+            try:
+                STATE["cv_json"] = json.loads(cv_json_text)
+            except json.JSONDecodeError as e:
+                print(
+                    f"Error parsing JSON Output for CV: {e}. Response content: {cv_json_text}"
+                )
+                STATE["cv_json"] = {"name": "Failed"}
+            try:
+                STATE["job_posting_json"] = json.loads(job_posting_json_text)
+            except json.JSONDecodeError as e:
+                print(
+                    f"Error parsing JSON Output for Job Posting: {e}. Response content: {job_posting_json_text}"
+                )
+                STATE["job_posting_json"] = {"companyName": "Failed"}
+            STATE["formatted"] = True
+        else:
+            st.error(
+                "You entered an invalid Cohere API Key. Please enter a valid API key in the sidebar."
+            )
+    # Now, we can work with this !
+    if STATE["formatted"]:
+        # The model may omit these keys; fall back instead of raising KeyError
+        company_name = STATE["job_posting_json"].get("companyName", "unknown")
+        candidate_name = STATE["cv_json"].get("name", "unknown")
+        lcol, rcol = st.columns((0.5, 0.5))
+        with lcol:
+            st.download_button(
+                label="Download Job Posting JSON",
+                data=json.dumps(STATE["job_posting_json"], indent=4),
+                file_name=f"job_posting_formatted_{company_name}.json",
+                mime="application/json",
+                use_container_width=True,
+            )
+        with rcol:
+            st.download_button(
+                label="Download CV JSON",
+                data=json.dumps(STATE["cv_json"], indent=4),
+                file_name=f"cv_formatted_{candidate_name}.json",
+                mime="application/json",
+                use_container_width=True,
+            )
+        cv_critique, practice_interview, general_cv_critique = st.tabs(
+            ["Role Specific CV Critique", "Practice Interview", "General CV Critique"]
+        )

backend.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from utils.prompts import (
+    cv_extract_prompt,
+    cv_format,
+    job_posting_extract_prompt,
+    job_posting_format,
+)
+from utils.gpt import gpt_response
+def process_cv(cv_contents: str, API_KEY: str) -> str:
+    """Insert the CV text into the extraction prompt and return the model's reply text."""
+    # "<input-cv>" is the placeholder inside cv_extract_prompt
+    filled_prompt = cv_extract_prompt.replace("<input-cv>", cv_contents)
+    return gpt_response(prompt=filled_prompt, api_key=API_KEY)
+def process_job_posting(job_post_contents: str, API_KEY: str) -> str:
+    """Insert the job posting text into the extraction prompt and return the model's reply text."""
+    # "<job-posting>" is the placeholder inside job_posting_extract_prompt
+    filled_prompt = job_posting_extract_prompt.replace(
+        "<job-posting>", job_post_contents
+    )
+    return gpt_response(prompt=filled_prompt, api_key=API_KEY)
+if __name__ == "__main__":
+    # Manual smoke test: structure the sample job posting and print the reply.
+    import os
+    with open("sample_data/meta_job.txt", "r", encoding="utf-8") as file:
+        post_contents = file.read()
+    # process_job_posting needs the API key as its second argument; calling it
+    # with the posting alone raises TypeError. CO_API_KEY is the environment
+    # variable the Cohere SDK itself falls back to.
+    output = process_job_posting(post_contents, os.environ["CO_API_KEY"])
+    print(output)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+streamlit==1.37.0
+cohere==5.5.7
+pymupdf==1.24.9
+python-docx==1.1.2

utils/__init__.py ADDED Viewed

File without changes

utils/gpt.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import cohere
+def test_api_key(api_key: str) -> bool:
+    """Report whether a tiny Cohere generation request succeeds with ``api_key``.
+    Returns False for a blank key (without contacting Cohere) and whenever the
+    request raises, so a network failure is indistinguishable from a bad key.
+    """
+    if isinstance(api_key, str) and not api_key.strip():
+        # A blank key cannot authenticate; skip the network round trip
+        return False
+    try:
+        # try to just generate 3 tokens
+        co = cohere.Client(
+            api_key=api_key,
+        )
+        co.generate(prompt="sample prompt", max_tokens=3)
+    except Exception:
+        # Exception rather than a bare except, so Ctrl-C / SystemExit propagate
+        return False
+    return True
+def gpt_stream_response(prompt: str, api_key: str):
+    """Yield the text of each "text-generation" event from a streamed Cohere chat reply."""
+    client = cohere.Client(api_key=api_key)
+    # Other event types in the stream are skipped
+    yield from (
+        chunk.text
+        for chunk in client.chat_stream(message=prompt)
+        if chunk.event_type == "text-generation"
+    )
+def gpt_response(prompt: str, api_key: str) -> str:
+    """Send ``prompt`` as a single Cohere chat message and return the reply text."""
+    client = cohere.Client(api_key=api_key)
+    return client.chat(message=prompt).text

utils/process_doc.py ADDED Viewed

	@@ -0,0 +1,40 @@

+"""
+Script for processing an input CV document
+"""
+import io
+import fitz
+from docx import Document
+def parse_pdf(pdf_file) -> str:
+    """Extract the plain text of a PDF supplied as raw bytes.
+    ``pdf_file`` is the in-memory file content (the caller passes the uploader's
+    ``getvalue()`` result). Page texts are joined with a blank line between them.
+    """
+    # The context manager closes the document even if a page fails to extract
+    with fitz.open(stream=pdf_file, filetype="pdf") as pdf_document:
+        return "\n\n".join(page.get_text("text") for page in pdf_document)
+def parse_docx(docx_file):
+    """Return the text of every paragraph in a .docx file (given as bytes), one per line."""
+    # python-docx wants a file-like object, so wrap the raw bytes
+    buffer = io.BytesIO(docx_file)
+    return "\n".join(para.text for para in Document(buffer).paragraphs)

utils/prompts.py ADDED Viewed

	@@ -0,0 +1,124 @@

+# NOTE: The _format variables may be useful for testing the beta of Cohere's structured output endpoints.
+# JSON Schema describing the structured CV object requested in cv_extract_prompt.
+# List-valued fields use "array": JSON Schema has no "list" type.
+cv_format = {
+    "type": "object",
+    "required": [
+        "name",
+        "roles",
+        "projects",
+        "languages",
+        "packages",
+        "tools",
+        "qualifications",
+        "responsibilities",
+    ],
+    "properties": {
+        "name": {"type": "string"},
+        "roles": {"type": "array"},
+        "projects": {"type": "array"},
+        "languages": {"type": "array"},
+        "packages": {"type": "array"},
+        "tools": {"type": "array"},
+        "qualifications": {"type": "array"},
+        "responsibilities": {"type": "array"},
+    },
+}
+# Prompt template for CV extraction; "<input-cv>" is replaced with the CV text.
+# The example object has no trailing comma before "}": a model that mirrors the
+# template would otherwise produce output that json.loads rejects.
+cv_extract_prompt = """
+You are an expert at extracting information from CVs
+**Goal**
+For a given CV, you must extract structured information and present it to the user in json form
+**Input**
+<input-cv>
+**Output Format**
+You will respond with a json object, in the form given.
+You will ensure that you are concise.
+{
+    "name": ,
+    "roles": [],
+    "projects": [],
+    "languages": [],
+    "packages": [],
+    "tools": [],
+    "qualifications": [] ,
+    "responsibilities": []
+}
+**Guidance**
+- languages: programming languages mentioned in the CV. Each should be tagged with a number between 1 and 5. 5 suggests they must be fully fluent, 1 suggests they may have some experience. If the CV does not indicate the level of required experience, approximate it, and if no information at all is given put 3
+- packages: specific packages mentioned in the CV. Each package should be tagged with a number between 1 and 5, 5 suggesting fully fluent and 1 suggesting just tried it once. Use your best judgement to gauge the individual's level of understanding.
+- tools: a list of other tools the individual has experience with
+- qualifications: of form [{"type": , "grade": ,"location": }] where type is the qualification type identified. Available Education Levels are: bsc,msc,phd. grade should be the grade achieved (number between 0 and 100. Make relevant conversions, if no number is given, assume 60). location is the location of where the education was taken.
+- responsibilities: an extensive list of the responsibilities demonstrated in the CV.
+You will now respond clearly, only responding with the desired json output.
+"""
+# JSON Schema describing the structured job-posting object requested in
+# job_posting_extract_prompt. The company field is "companyName" in both
+# "required" and "properties"; list-valued fields use the JSON Schema type "array".
+job_posting_format = {
+    "type": "object",
+    "required": [
+        "companyName",
+        "roleShortDesc",
+        "roleLongDesc",
+        "requiredExperience",
+        "languages",
+        "packages",
+        "tools",
+        "qualifications",
+        "responsibilities",
+    ],
+    "properties": {
+        "companyName": {"type": "string"},
+        "roleShortDesc": {"type": "string"},
+        "roleLongDesc": {"type": "string"},
+        "requiredExperience": {"type": "array"},
+        "languages": {"type": "array"},
+        "packages": {"type": "array"},
+        "tools": {"type": "array"},
+        "qualifications": {"type": "array"},
+        "responsibilities": {"type": "array"},
+    },
+}
+# Prompt template for job-posting extraction; "<job-posting>" is replaced with
+# the posting text. The example object has no trailing comma before "}": a model
+# that mirrors the template would otherwise produce output that json.loads rejects.
+job_posting_extract_prompt = """
+You are an expert at extracting information from job postings
+**Goal**
+For a given job posting, you must extract structured information and present it to the user
+**Input**
+<job-posting>
+**Output Format**
+You will respond with a json object, in the form given.
+You will ensure that you are concise.
+{
+    "companyName": ,
+    "roleShortDesc": ,
+    "roleLongDesc": ,
+    "requiredExperience": [],
+    "languages": [],
+    "packages": [],
+    "tools": [],
+    "qualifications": [] ,
+    "responsibilities": []
+}
+**Guidance**
+- roleShortDesc should be one sentence only
+- requiredExperience: a standaredized list of items, each item should be at most one short sentence
+- languages: programming languages mentioned in the post. Each should be tagged with a number between 1 and 5. 5 suggests they must be fully fluent, 1 suggests they may have some experience. If the job does not indicate the level of required experience, put 3
+- packages: specific packages mentioned in the post. Each package should be tagged with a number between 1 and 5, 5 suggesting fully fluent and 1 suggesting just tried it once. If no level is given, assume level 3
+- tools: a list of other tools that would be useful to know
+- qualifications: of form [{"type": , "strictness": }] where type is the qualification type identified, and strictness is your approximation of how strict the job posting suggests the requirement is. 1 being not strict at all, 5 being absolute necessity. Available Education Levels are: bsc,msc,phd.
+- responsibilities: an extensive list of the responsibilities given in the advert.
+You will now respond clearly, only responding with the desired json output.
+"""