Ayo42 committed on
Commit
ec97e44
·
verified ·
1 Parent(s): bc7b5ad

Upload 9 files

Browse files
src/FitCV.code-workspace ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "folders": [
3
+ {
4
+ "path": ".."
5
+ }
6
+ ],
7
+ "settings": {}
8
+ }
src/app.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit front-end for FitCV.

Collects a resume PDF, a GitHub username and a target job (title or full
posting), then renders AI-generated resume-optimization suggestions.
"""

import streamlit as st
from io import BytesIO

# --- Page setup -------------------------------------------------------------
st.set_page_config(
    page_title="FitCV - AI Resume Optimizer",
    page_icon="📄",
    layout="wide"
)

# --- Branding header --------------------------------------------------------
logo_col, title_col = st.columns([1, 6])

with logo_col:
    st.image("assets/logo.png", width=40)  # Replace with your own logo

with title_col:
    st.markdown(
        "<h1 style='font-size: 32px; display: inline;'>FitCV</h1><br>"
        "<span style='font-size:16px; color: gray;'>Your AI-Powered Resume Optimizer</span>",
        unsafe_allow_html=True
    )

st.markdown("---")

# --- User inputs ------------------------------------------------------------
st.header("🔍 Optimize Your Resume for the Job You Want")

uploaded_cv = st.file_uploader("📄 Upload Your CV (PDF only)", type=["pdf"])

github_username = st.text_input("🐙 Enter Your GitHub Username")

job_mode = st.radio("What job info do you have?", ["Job Title", "Full Job Posting"])

# Exactly one of the two job widgets is rendered, depending on the mode.
job_input = (
    st.text_input("💼 Enter the Job Title (e.g., Data Scientist at Google)")
    if job_mode == "Job Title"
    else st.text_area("💬 Paste the Full Job Posting")
)

# --- Action button ----------------------------------------------------------
if st.button("✨ Optimize Resume"):
    if not uploaded_cv or not github_username or not job_input.strip():
        st.warning("Please fill in all fields and upload a valid resume.")
    else:
        with st.spinner("Analyzing your GitHub, resume, and job requirements..."):

            # Deferred import of the main pipeline logic.
            from src.pipeline.recommend import refine_resume  # Adjust path accordingly

            # Read the uploaded PDF into raw bytes for the pipeline.
            pdf_bytes = uploaded_cv.read()

            result = refine_resume(
                resume_pdf_bytes=pdf_bytes,
                github_username=github_username,
                job_input=job_input
            )

            # --- Results ----------------------------------------------------
            st.success("✅ Resume optimization complete!")

            st.subheader("📝 Refined Professional Summary")
            st.write(result["summary"])

            st.subheader("🧠 Recommended Skills for This Role")
            st.write(", ".join(result["skills"]))

            st.subheader("🚀 GitHub Projects to Highlight")
            for proj in result["projects"]:
                st.markdown(f"- **[{proj['name']}]({proj['url']})**: {proj['description']}")

            st.markdown("---")
            st.info("These suggestions are tailored to the job you provided. You can now update your CV accordingly!")
src/assets/logo.png ADDED
src/environment.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: huggingface-env
2
+ dependencies:
3
+ - python=3.11.2
4
+ - pip
5
+ - pip:
6
+ - spacy
7
+ - spacy-layout
8
+ - transformers
9
+ - streamlit
10
+ - torch
11
+ - datasets
12
+ - azureml-defaults
src/github/RepoParse.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import requests
import json
import re
import os
import pandas as pd

# Language-name -> known file extensions lookup (Linguist-style JSON).
# NOTE(review): hardcoded Colab path — should come from configuration.
with open("/content/languages.json") as f:
    languages = json.load(f)

# SECURITY FIX: a real personal-access token was previously hardcoded here
# and committed to source control — it must be revoked on GitHub. Read the
# token from the environment instead.
token = os.environ.get("GITHUB_TOKEN", "")
# BUG FIX: the token already carries its "ghp_" prefix; the old code built
# f"token ghp_{token}", duplicating the prefix and producing an invalid
# Authorization header. Send unauthenticated requests when no token is set.
headers = {"Authorization": f"token {token}"} if token else {}

# Module-level bootstrap: fetch the user's repositories at import time.
# NOTE(review): hardcoded username and import-time network I/O — consider
# moving this behind a function.
res = requests.get("https://api.github.com/users/Eben113/repos", headers=headers)

js1 = res.json()[0]
url = js1["url"] + "/contents"
res1 = requests.get(url)
def buildLis():
    """Summarize every fetched repository as {name, url, langURL}."""
    summaries = []
    for repo in res.json():
        summaries.append({
            "name": repo["name"],
            "url": repo["url"],
            "langURL": repo["languages_url"],
        })
    return summaries

def scanJson(name, url, langURL):
    """Walk one repository's contents tree and map each source file (plus
    README.md) to its API URL, keyed by "<repo-name>/<path>"."""
    # Determine which file extensions count as "source" for this repo,
    # based on the languages GitHub reports for it.
    extensions = []
    for lang in requests.get(langURL).json():
        known = languages.get(lang.title(), None)
        if known:
            extensions.extend(known["extensions"])
        else:
            # Unknown language: fall back to treating its name as an extension.
            extensions.append("." + lang)

    def walk(entries, prefix):
        # Recursively collect {display-path: api-url} for matching entries.
        found = {}
        for entry in entries:
            child_prefix = prefix + "/" + entry["name"]
            if entry["name"] == "README.md":
                found[prefix + "/" + "readme"] = entry["url"]
            elif entry["type"] == "file":
                suffix = "." + entry["name"].split(".")[-1]
                if suffix in extensions:
                    found[child_prefix] = entry["url"]
            elif entry["type"] == "dir":
                found.update(walk(requests.get(entry["url"]).json(), child_prefix))
        return found

    files = requests.get(url + "/contents").json()
    return walk(files, name)

def buildDataset(repo_list):
    """Flatten the scanned repositories into a tabular dataset.

    Args:
        repo_list: sequence of {"name", "url", "langURL"} dicts, as produced
            by buildLis().

    Returns:
        pandas.DataFrame with one row per matching file and columns
        repo / directory / url (empty frame for an empty repo_list).
    """
    rows = []
    for repo in repo_list:
        records = scanJson(repo["name"], repo["url"], repo["langURL"])
        # Named `path`/`file_url` — the original shadowed the builtin dir()
        # and rebound `data` from list to DataFrame.
        for path, file_url in records.items():
            rows.append({"repo": repo["name"], "directory": path, "url": file_url})
    return pd.DataFrame(rows)
src/github/languages.json ADDED
The diff for this file is too large to render. See raw diff
 
src/github/relevance_filter.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

import requests
from datasets import load_dataset, Dataset
import pandas as pd
import base64

# SECURITY FIX: a real personal-access token was previously hardcoded here
# and committed to source control — revoke it on GitHub and supply the token
# via the environment instead.
token = os.environ.get("GITHUB_TOKEN", "")
# BUG FIX: the token already includes its "ghp_" prefix; the old code built
# f"token ghp_{token}", duplicating the prefix and yielding an invalid
# Authorization header. Fall back to unauthenticated requests without a token.
headers = {"Authorization": f"token {token}"} if token else {}
def parse_repo(df, tokenizer, gen_pipeline, role):
    """Build LLM chat prompts describing each repository and the skills it
    showcases, tailored to a target job role or posting.

    Args:
        df: DataFrame with one row per repo file; columns repo / directory / url
            (as produced by RepoParse.buildDataset).
        tokenizer: HF tokenizer exposing apply_chat_template / encode / decode.
        gen_pipeline: text-generation pipeline — currently unused; kept in the
            signature for the planned generation step.
        role: job title or full job-posting text.

    Returns:
        DataFrame with columns repo / description / skills, where the last two
        hold lists of prompts.
    """

    def return_role_prompts(role, task, files, tokenize=False):
        # Build one chat prompt for `task` ("describe" -> project blurb,
        # "skill" -> numbered skills list) over the accumulated file text.
        task_prompt = {"describe" : "return a one paragraph project description that shows how this project displays skills that make me good for this job role or posting:", "skill" : "return a list of skills that were showcased in this project and that recruiters look for when looking people for this job role or posting in this format 1. :"}
        assistant_prompt = {"describe":"Suggested project title: ", "skill":"1. "}
        prompts = tokenizer.apply_chat_template([{"role":"system", "content":"You are a CV coach with expertise in editing CVs for people who work in tech"},
                  {"role":"user", "content":"These are some (or all) of the files for a particular project I worked on (written in the format Directory:File_1_directory||Content:File_1_content, Directory:File_2_directory||Content:File_2_content,...):"+files+ task_prompt[task] +role+" and would be a good thing to put on my CV"},
                  {"role":"assistant", "content":assistant_prompt[task]}], tokenize = tokenize, continue_final_message=True)
        return prompts

    def chunk(text, max_length):
        # Split `text` into decoded pieces of at most max_length tokens.
        chunks = []
        tokens = tokenizer.encode(text)
        current = 0
        for _ in range(len(tokens) // max_length):
            chunks.append(tokenizer.decode(tokens[current : current + max_length]))
            current += max_length
        chunks.append(tokenizer.decode(tokens[current:]))  # remainder (may be empty)
        return chunks

    def collate(df, role):
        # Greedily pack each repo's file contents into prompts that fit the
        # token budget.
        projects = set(df.repo)
        res = []
        for project in projects:
            current_files = ""
            skills, describe = [], []
            for _, row in df[df["repo"] == project].iterrows():
                # If the accumulated prompt is already very large, split it
                # into 2500-token chunks and restart the accumulator.
                if len(return_role_prompts(role, "skill", current_files, True)) >= 7000:
                    skills.extend([return_role_prompts(role, "skill", string, False) for string in chunk(current_files, 2500)])
                    describe.extend([return_role_prompts(role, "describe", string, False) for string in chunk(current_files, 2500)])
                    current_files = ""
                doc = requests.get(row["url"], headers=headers).json()
                doc = base64.b64decode(doc["content"])  # contents API returns base64
                new = "Directory:{}||Content:{}".format(row["directory"], doc.decode("utf-8"))
                if len(return_role_prompts(role, "skill", current_files + new, True)) <= 2500:
                    current_files += new
                else:
                    skills.append(return_role_prompts(role, "skill", current_files, False))
                    describe.append(return_role_prompts(role, "describe", current_files, False))
                    current_files = new
            # BUG FIX: flush the final accumulated batch — previously a small
            # repo that never crossed a threshold yielded no prompts at all.
            if current_files:
                skills.append(return_role_prompts(role, "skill", current_files, False))
                describe.append(return_role_prompts(role, "describe", current_files, False))
            res.append({"repo": project, "description": describe, "skills": skills})
        return pd.DataFrame(res)

    # BUG FIX: the original evaluated collate(df, role) and discarded the
    # result, so the function always returned None. (A large block of dead,
    # commented-out generation code was also removed here.)
    return collate(df, role)
src/resume/CVParse.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import spacy
from spacy_layout import spaCyLayout

# Shared NLP pipeline (the medium English model ships word vectors, which
# the similarity-based section matching below relies on) and the PDF layout
# parser built on top of it.
nlp = spacy.load("en_core_web_md")
layout = spaCyLayout(nlp)

# Parse the resume into a spaCy Doc with layout spans.
# NOTE(review): hardcoded Colab sample path — the PDF should be supplied by
# the caller instead of baked in at import time.
doc = layout("/content/EbenezerMLEResume (1).pdf")

# (Removed a stray no-op list-comprehension expression left over from a
# notebook cell — it computed the section headers and discarded them.)
def extract(string, doc):
    """Return the layout spans of the resume section whose header best
    matches `string` by word-vector similarity.

    Args:
        string: query text, e.g. "skills" or "work experience".
        doc: spaCy Doc produced by spaCyLayout, carrying doc.spans["layout"].

    Returns:
        List of layout spans strictly between the matched section header and
        the next header — or through the end of the document when the match
        is the final header.

    Raises:
        ValueError: if the document contains no section headers.
    """
    headings = [s for s in doc.spans["layout"] if s.label_ == "section_header"]
    query = nlp(string)
    # Most similar heading wins; max() keeps the first on ties, matching the
    # original strict-greater comparison. Each similarity is computed once
    # (the original recomputed the current best's score every iteration and
    # printed debug output).
    match_ = max(headings, key=lambda h: h.similarity(query))
    ind = headings.index(match_)
    start = match_.id  # span.id indexes into doc.spans["layout"] here
    # BUG FIX: the original indexed headings[ind + 1] unconditionally and
    # raised IndexError whenever the best match was the LAST section header.
    if ind + 1 < len(headings):
        stop = headings[ind + 1].id
    else:
        stop = len(doc.spans["layout"])
    return [doc.spans["layout"][j] for j in range(start + 1, stop)]
src/score.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from resume import CVParse
2
+ from github import relevance_filter
3
+ from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
4
+ import torch
5
+
def init():
    """Load the code-generation model once and build a text-generation pipeline.

    Returns:
        (tokenizer, gen_pipeline) tuple. BUG FIX: the original bound
        everything to locals and returned nothing, so all the loading work
        was discarded and callers could never reach the model.
    """
    model_id = "deepseek-ai/deepseek-coder-1.3b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)

    # Run on GPU when available — the original computed `device` but never
    # passed it to the pipeline.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    gen_pipeline = pipeline(task="text-generation", model=model,
                            tokenizer=tokenizer, device=device)
    return tokenizer, gen_pipeline

def run(CV, Repo_link):
    """Placeholder entry point for the scoring pipeline.

    Args:
        CV: resume input (ignored — stub).
        Repo_link: GitHub repository link (ignored — stub).

    Returns:
        A fixed greeting string until the real pipeline is wired in.
    """
    greeting = "hello world"
    return greeting