Upload 9 files
Browse files- src/FitCV.code-workspace +8 -0
- src/app.py +73 -0
- src/assets/logo.png +0 -0
- src/environment.yml +12 -0
- src/github/RepoParse.py +50 -0
- src/github/languages.json +0 -0
- src/github/relevance_filter.py +68 -0
- src/resume/CVParse.py +24 -0
- src/score.py +14 -0
src/FitCV.code-workspace
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"folders": [
|
| 3 |
+
{
|
| 4 |
+
"path": ".."
|
| 5 |
+
}
|
| 6 |
+
],
|
| 7 |
+
"settings": {}
|
| 8 |
+
}
|
src/app.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit front end for FitCV.

Collects a CV (PDF), a GitHub username, and either a job title or a full job
posting, then displays resume-optimization suggestions produced by the
pipeline's ``refine_resume``. Streamlit re-runs this script top-to-bottom on
every interaction, so statement order defines the UI layout.
"""
import streamlit as st
from io import BytesIO  # NOTE(review): imported but unused in this file — presumably intended for the resume bytes; confirm or remove.

# === App Configuration ===
# Must be the first Streamlit call in the script.
st.set_page_config(
    page_title="FitCV - AI Resume Optimizer",
    page_icon="📄",
    layout="wide"
)

# === Header Section ===
# Narrow column for the logo, wide column for the title text.
col1, col2 = st.columns([1, 6])

with col1:
    st.image("assets/logo.png", width=40)  # Replace with your own logo

with col2:
    st.markdown(
        "<h1 style='font-size: 32px; display: inline;'>FitCV</h1><br>"
        "<span style='font-size:16px; color: gray;'>Your AI-Powered Resume Optimizer</span>",
        unsafe_allow_html=True
    )

st.markdown("---")

# === Input Section ===
st.header("🔍 Optimize Your Resume for the Job You Want")

uploaded_cv = st.file_uploader("📄 Upload Your CV (PDF only)", type=["pdf"])

github_username = st.text_input("🐙 Enter Your GitHub Username")

# The radio selection decides which input widget below feeds job_input.
job_mode = st.radio("What job info do you have?", ["Job Title", "Full Job Posting"])

if job_mode == "Job Title":
    job_input = st.text_input("💼 Enter the Job Title (e.g., Data Scientist at Google)")
else:
    job_input = st.text_area("💬 Paste the Full Job Posting")

# === Button ===
if st.button("✨ Optimize Resume"):
    # Validate all three inputs before running the (slow) pipeline.
    if not uploaded_cv or not github_username or not job_input.strip():
        st.warning("Please fill in all fields and upload a valid resume.")
    else:
        with st.spinner("Analyzing your GitHub, resume, and job requirements..."):

            # === Placeholder: Call to your main pipeline logic ===
            # Imported lazily so the app still loads while the pipeline module
            # is under construction.
            from src.pipeline.recommend import refine_resume  # Adjust path accordingly

            # Read the uploaded PDF into memory for the pipeline.
            resume_bytes = uploaded_cv.read()

            # NOTE(review): result is assumed to be a dict with keys
            # "summary" (str), "skills" (list[str]) and "projects"
            # (list of dicts with "name"/"url"/"description") — confirm
            # against refine_resume's actual contract.
            result = refine_resume(
                resume_pdf_bytes=resume_bytes,
                github_username=github_username,
                job_input=job_input
            )

            # === Output Section ===
            st.success("✅ Resume optimization complete!")

            st.subheader("📝 Refined Professional Summary")
            st.write(result["summary"])

            st.subheader("🧠 Recommended Skills for This Role")
            st.write(", ".join(result["skills"]))

            st.subheader("🚀 GitHub Projects to Highlight")
            for proj in result["projects"]:
                st.markdown(f"- **[{proj['name']}]({proj['url']})**: {proj['description']}")

            st.markdown("---")
            st.info("These suggestions are tailored to the job you provided. You can now update your CV accordingly!")
|
src/assets/logo.png
ADDED
|
src/environment.yml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: huggingface-env
|
| 2 |
+
dependencies:
|
| 3 |
+
- python=3.11.2
|
| 4 |
+
- pip
|
| 5 |
+
- pip:
|
| 6 |
+
- spacy
|
| 7 |
+
- spacy-layout
|
| 8 |
+
- transformers
|
| 9 |
+
- streamlit
|
| 10 |
+
- torch
|
| 11 |
+
- datasets
|
| 12 |
+
- azureml-defaults
|
src/github/RepoParse.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
import json
import re
import pandas as pd
import os

# Map of language name -> {"extensions": [...]}, used by scanJson to decide
# which repo files are source code worth collecting.
# Load it relative to this module (the original hard-coded the Colab path
# "/content/languages.json", which only exists inside a Colab session).
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "languages.json")) as f:
    languages = json.load(f)

# SECURITY FIX: the original committed a real GitHub personal access token
# ("ghp_...") here. That token is compromised and must be revoked; credentials
# are now read from the environment instead.
token = os.environ.get("GITHUB_TOKEN", "")
# BUG FIX: the token already carries its "ghp_" prefix, so the original header
# rendered "token ghp_ghp_...", which GitHub rejects. Fall back to
# unauthenticated (rate-limited) requests when no token is configured.
headers = {"Authorization": f"token {token}"} if token else {}

# Module-level fetch of the user's repositories; buildLis() consumes `res`.
res = requests.get("https://api.github.com/users/Eben113/repos", headers=headers)

# First repo's contents endpoint — kept for exploratory use.
js1 = res.json()[0]
url = js1["url"] + "/contents"
res1 = requests.get(url)
|
| 17 |
+
def buildLis():
    """Summarize the module-level repo listing (`res`) as a list of dicts.

    Each entry carries the repo's name, its API URL, and the URL that lists
    the languages used in the repo.
    """
    summaries = []
    for repo in res.json():
        summaries.append({
            "name": repo["name"],
            "url": repo["url"],
            "langURL": repo["languages_url"],
        })
    return summaries
|
| 20 |
+
|
| 21 |
+
def scanJson(name, url, langURL):
    """Walk one repository tree and map source-file paths to their API URLs.

    name: repo name, used as the path prefix of every returned key.
    url: the repo's API URL (contents are fetched from url + "/contents").
    langURL: the repo's languages endpoint, used to decide which file
        extensions count as source code.
    Returns a dict of "<repo>/<path>" -> GitHub contents-API URL, covering
    README.md (keyed as ".../readme") and files whose extension matches one
    of the repo's detected languages.
    """
    # Build the set of file extensions to keep, from the repo's language list.
    extensions = []
    # NOTE(review): this request is unauthenticated (no headers) unlike the
    # module-level calls — it counts against the stricter anonymous rate limit.
    for language in requests.get(langURL).json():
        exts = languages.get(language.title(), None)
        if exts:
            extensions.extend(exts["extensions"])
        else:
            # Unknown language: guess that the extension equals its name.
            extensions.append("."+language)
    files = requests.get(url + "/contents").json()
    def walk(js, prefix):
        # Recursively flatten the contents listing into {path: url}.
        # (local `res` shadows the module-level response object on purpose)
        res = {}
        for branch in js:
            if branch["name"] == "README.md":
                res[prefix + "/" + "readme"] = branch["url"]
            elif branch["type"] == "file" and (("."+branch["name"].split(".")[-1]) in extensions):
                res[prefix + "/" + branch["name"]] = branch["url"]
            elif branch["type"] == "dir":
                # One extra API request per sub-directory.
                res.update(walk(requests.get(branch["url"]).json(), prefix + "/" + branch["name"]))
        return res
    info = walk(files, name)
    return info
|
| 42 |
+
|
| 43 |
+
def buildDataset(repo_list):
    """Flatten the scanned files of every repo into one DataFrame.

    repo_list: output of buildLis() — dicts with "name", "url", "langURL".
    Returns a DataFrame with columns "repo", "directory", "url", one row per
    collected file.
    """
    rows = [
        {"repo": repo["name"], "directory": path, "url": file_url}
        for repo in repo_list
        for path, file_url in scanJson(repo["name"], repo["url"], repo["langURL"]).items()
    ]
    return pd.DataFrame(rows)
|
src/github/languages.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/github/relevance_filter.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
from datasets import load_dataset, Dataset
import pandas as pd
import base64
import os

# SECURITY FIX: the original committed a real GitHub personal access token
# here; it is compromised and must be revoked. Read it from the environment.
token = os.environ.get("GITHUB_TOKEN", "")
# BUG FIX: the token already starts with "ghp_"; the original produced
# "token ghp_ghp_...", an invalid Authorization header. Unauthenticated
# (rate-limited) access is used when no token is configured.
headers = {"Authorization": f"token {token}"} if token else {}
|
| 7 |
+
|
| 8 |
+
def parse_repo(df, tokenizer, gen_pipeline, role):
    """Build per-repo chat prompts asking a model for CV descriptions/skills.

    Parameters:
        df: DataFrame with columns "repo", "directory", "url" (one row per
            source file, as produced by RepoParse.buildDataset).
        tokenizer: chat-template tokenizer; used both to render prompts and to
            measure their token length.
        gen_pipeline: text-generation pipeline. Currently unused — the
            generation stage was commented out in the original and has been
            removed as dead code.
        role: job title or full job posting the prompts are tailored to.

    Returns a DataFrame with columns "repo", "description", "skills"; the
    latter two hold lists of ready-to-send prompt strings.
    """

    def return_role_prompts(role, task, files, tokenize=False):
        # Render a 3-message chat (system / user / assistant prefix); the
        # assistant prefix plus continue_final_message steers the model into
        # the expected output format. With tokenize=True the token list is
        # returned instead, which callers use to measure prompt length.
        task_prompt = {"describe" : "return a one paragraph project description that shows how this project displays skills that make me good for this job role or posting:", "skill" : "return a list of skills that were showcased in this project and that recruiters look for when looking people for this job role or posting in this format 1. :"}
        assistant_prompt = {"describe":"Suggested project title: ", "skill":"1. "}
        prompts = tokenizer.apply_chat_template([{"role":"system", "content":"You are a CV coach with expertise in editing CVs for people who work in tech"},
                                                 {"role":"user", "content":"These are some (or all) of the files for a particular project I worked on (written in the format Directory:File_1_directory||Content:File_1_content, Directory:File_2_directory||Content:File_2_content,...):"+files+ task_prompt[task] +role+" and would be a good thing to put on my CV"},
                                                 {"role":"assistant", "content":assistant_prompt[task]}], tokenize = tokenize, continue_final_message=True)
        return prompts

    def chunk(text, max_length):
        # Split text into pieces of at most max_length tokens.
        tokens = tokenizer.encode(text)
        chunks = [
            tokenizer.decode(tokens[start : start + max_length])
            for start in range(0, len(tokens), max_length)
        ]
        # BUG FIX: the original unconditionally appended the tail slice, which
        # produced an empty trailing chunk whenever len(tokens) was an exact
        # multiple of max_length. Preserve the original's behavior of
        # returning one (empty) chunk for empty input.
        return chunks or [tokenizer.decode(tokens)]

    def collate(df, role):
        # Accumulate each repo's file contents into prompt-sized batches.
        projects = set(df.repo)
        res = []
        for project in projects:
            current_files = ""
            skills, describe = [], []
            for i, row in df[df["repo"] == project].iterrows():
                # If the buffer has grown too large for one prompt, split it.
                if len(return_role_prompts(role, "skill", current_files, True)) >= 7000:
                    skills.extend([return_role_prompts(role, "skill", string, False) for string in chunk(current_files, 2500)])
                    describe.extend([return_role_prompts(role, "describe", string, False) for string in chunk(current_files, 2500)])
                    current_files = ""
                # Fetch and decode this file's content from the contents API.
                doc = requests.get(row["url"], headers=headers).json()
                doc = base64.b64decode(doc["content"])
                new = "Directory:{}||Content:{}".format(row["directory"], doc.decode("utf-8"))
                if len(return_role_prompts(role, "skill", current_files+new, True)) <= 2500:
                    current_files += new
                else:
                    skills.append(return_role_prompts(role, "skill", current_files, False))
                    describe.append(return_role_prompts(role, "describe", current_files, False))
                    current_files = new
            # BUG FIX: flush whatever remains in the buffer — the original
            # silently dropped the final batch of files for every repo.
            if current_files:
                skills.append(return_role_prompts(role, "skill", current_files, False))
                describe.append(return_role_prompts(role, "describe", current_files, False))
            res.append({"repo":project, "description":describe, "skills":skills})
        return pd.DataFrame(res)

    # BUG FIX: the original called collate(df, role) and discarded the result,
    # so parse_repo always returned None.
    return collate(df, role)
|
src/resume/CVParse.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import spacy
from spacy_layout import spaCyLayout

# Medium English model: extract() relies on Span.similarity, which needs the
# word vectors this model ships with.
nlp = spacy.load("en_core_web_md")
layout = spaCyLayout(nlp)

# NOTE(review): hard-coded Colab path to one specific resume — parameterize
# this before using the module outside that notebook session.
doc = layout("/content/EbenezerMLEResume (1).pdf")

# BUG FIX: the original evaluated
#   [i for i in doc.spans["layout"] if i.label_ == "section_header"]
# at module level and discarded the result (leftover notebook cell).
# extract() builds the same list itself, so the dead statement is removed.
|
| 10 |
+
|
| 11 |
+
def extract(string, doc):
    """Return the layout spans belonging to the section most similar to *string*.

    string: free-text section query (e.g. "work experience").
    doc: a spaCy Doc produced by spaCyLayout, with section headers labeled
        "section_header" in doc.spans["layout"].
    Returns the list of layout spans between the best-matching header and the
    next header (exclusive of both headers).
    """
    headings = [i for i in doc.spans["layout"] if i.label_ == "section_header"]
    query = nlp(string)
    # Hoist the similarity computation: the original recomputed
    # match_.similarity(token) on every loop iteration and printed debug
    # output; max() with a key picks the same first-best heading.
    match_ = max(headings, key=lambda heading: heading.similarity(query))
    ind = headings.index(match_)
    start = match_.id
    # BUG FIX: the original indexed headings[ind + 1] unconditionally, which
    # raised IndexError whenever the best match was the LAST section header.
    # Fall back to the end of the layout spans in that case.
    # (assumes span.id indexes into doc.spans["layout"], matching the
    # original's usage — TODO confirm against spacy-layout docs)
    if ind + 1 < len(headings):
        stop = headings[ind + 1].id
    else:
        stop = len(doc.spans["layout"])
    return [doc.spans["layout"][j] for j in range(start + 1, stop)]
|
src/score.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from resume import CVParse
|
| 2 |
+
from github import relevance_filter
|
| 3 |
+
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
def init():
    """Load the generation model once at service startup (init/run scoring pattern).

    BUG FIX: the original bound tokenizer, model, device and gen_pipeline to
    locals, all discarded the moment init() returned — run() could never use
    them. They are now published as module globals. The computed device is
    also actually passed to the pipeline (the original computed it and then
    ignored it, always running on CPU).
    """
    global tokenizer, gen_pipeline

    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-instruct")
    model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-1.3b-instruct")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    gen_pipeline = pipeline(task="text-generation", model=model, tokenizer=tokenizer, device=device)
|
| 12 |
+
|
| 13 |
+
def run(CV, Repo_link):
    """Scoring entry point (stub).

    Will eventually score *CV* against the repositories at *Repo_link*;
    for now it only returns a fixed placeholder string.
    """
    placeholder = "hello world"
    return placeholder
|