Ayo42 committed on
Commit
ec97e44
·
verified ·
1 Parent(s): bc7b5ad

Upload 9 files

Browse files
src/FitCV.code-workspace ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "folders": [
3
+ {
4
+ "path": ".."
5
+ }
6
+ ],
7
+ "settings": {}
8
+ }
src/app.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit front-end for FitCV.

Collects a resume PDF, a GitHub username and a target job (title or full
posting), then renders AI-generated resume-optimization suggestions.
"""

import streamlit as st
from io import BytesIO

# --- Page setup -------------------------------------------------------------
st.set_page_config(
    page_title="FitCV - AI Resume Optimizer",
    page_icon="📄",
    layout="wide"
)

# --- Branding header --------------------------------------------------------
logo_col, title_col = st.columns([1, 6])

with logo_col:
    st.image("assets/logo.png", width=40)  # Replace with your own logo

with title_col:
    st.markdown(
        "<h1 style='font-size: 32px; display: inline;'>FitCV</h1><br>"
        "<span style='font-size:16px; color: gray;'>Your AI-Powered Resume Optimizer</span>",
        unsafe_allow_html=True
    )

st.markdown("---")

# --- User inputs ------------------------------------------------------------
st.header("🔍 Optimize Your Resume for the Job You Want")

uploaded_cv = st.file_uploader("📄 Upload Your CV (PDF only)", type=["pdf"])

github_username = st.text_input("🐙 Enter Your GitHub Username")

job_mode = st.radio("What job info do you have?", ["Job Title", "Full Job Posting"])

# Exactly one of the two job widgets is rendered, depending on the mode.
job_input = (
    st.text_input("💼 Enter the Job Title (e.g., Data Scientist at Google)")
    if job_mode == "Job Title"
    else st.text_area("💬 Paste the Full Job Posting")
)

# --- Action button ----------------------------------------------------------
if st.button("✨ Optimize Resume"):
    if not uploaded_cv or not github_username or not job_input.strip():
        st.warning("Please fill in all fields and upload a valid resume.")
    else:
        with st.spinner("Analyzing your GitHub, resume, and job requirements..."):

            # Deferred import of the main pipeline logic.
            from src.pipeline.recommend import refine_resume  # Adjust path accordingly

            # Read the uploaded PDF into raw bytes for the pipeline.
            pdf_bytes = uploaded_cv.read()

            result = refine_resume(
                resume_pdf_bytes=pdf_bytes,
                github_username=github_username,
                job_input=job_input
            )

            # --- Results ----------------------------------------------------
            st.success("✅ Resume optimization complete!")

            st.subheader("📝 Refined Professional Summary")
            st.write(result["summary"])

            st.subheader("🧠 Recommended Skills for This Role")
            st.write(", ".join(result["skills"]))

            st.subheader("🚀 GitHub Projects to Highlight")
            for proj in result["projects"]:
                st.markdown(f"- **[{proj['name']}]({proj['url']})**: {proj['description']}")

            st.markdown("---")
            st.info("These suggestions are tailored to the job you provided. You can now update your CV accordingly!")
src/assets/logo.png ADDED
src/environment.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: huggingface-env
2
+ dependencies:
3
+ - python=3.11.2
4
+ - pip
5
+ - pip:
6
+ - spacy
7
+ - spacy-layout
8
+ - transformers
9
+ - streamlit
10
+ - torch
11
+ - datasets
12
+ - azureml-defaults
src/github/RepoParse.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import requests
import json
import re
import os
import pandas as pd

# Language-name -> known file extensions lookup (Linguist-style JSON).
# NOTE(review): hardcoded Colab path — should come from configuration.
with open("/content/languages.json") as f:
    languages = json.load(f)

# SECURITY FIX: a real personal-access token was previously hardcoded here
# and committed to source control — it must be revoked on GitHub. Read the
# token from the environment instead.
token = os.environ.get("GITHUB_TOKEN", "")
# BUG FIX: the token already carries its "ghp_" prefix; the old code built
# f"token ghp_{token}", duplicating the prefix and producing an invalid
# Authorization header. Send unauthenticated requests when no token is set.
headers = {"Authorization": f"token {token}"} if token else {}

# Module-level bootstrap: fetch the user's repositories at import time.
# NOTE(review): hardcoded username and import-time network I/O — consider
# moving this behind a function.
res = requests.get("https://api.github.com/users/Eben113/repos", headers=headers)

js1 = res.json()[0]
url = js1["url"] + "/contents"
res1 = requests.get(url)
def buildLis():
    """Summarize every fetched repository as {name, url, langURL}."""
    summaries = []
    for repo in res.json():
        summaries.append({
            "name": repo["name"],
            "url": repo["url"],
            "langURL": repo["languages_url"],
        })
    return summaries

def scanJson(name, url, langURL):
    """Walk one repository's contents tree and map each source file (plus
    README.md) to its API URL, keyed by "<repo-name>/<path>"."""
    # Determine which file extensions count as "source" for this repo,
    # based on the languages GitHub reports for it.
    extensions = []
    for lang in requests.get(langURL).json():
        known = languages.get(lang.title(), None)
        if known:
            extensions.extend(known["extensions"])
        else:
            # Unknown language: fall back to treating its name as an extension.
            extensions.append("." + lang)

    def walk(entries, prefix):
        # Recursively collect {display-path: api-url} for matching entries.
        found = {}
        for entry in entries:
            child_prefix = prefix + "/" + entry["name"]
            if entry["name"] == "README.md":
                found[prefix + "/" + "readme"] = entry["url"]
            elif entry["type"] == "file":
                suffix = "." + entry["name"].split(".")[-1]
                if suffix in extensions:
                    found[child_prefix] = entry["url"]
            elif entry["type"] == "dir":
                found.update(walk(requests.get(entry["url"]).json(), child_prefix))
        return found

    files = requests.get(url + "/contents").json()
    return walk(files, name)

def buildDataset(repo_list):
    """Flatten the scanned repositories into a tabular dataset.

    Args:
        repo_list: sequence of {"name", "url", "langURL"} dicts, as produced
            by buildLis().

    Returns:
        pandas.DataFrame with one row per matching file and columns
        repo / directory / url (empty frame for an empty repo_list).
    """
    rows = []
    for repo in repo_list:
        records = scanJson(repo["name"], repo["url"], repo["langURL"])
        # Named `path`/`file_url` — the original shadowed the builtin dir()
        # and rebound `data` from list to DataFrame.
        for path, file_url in records.items():
            rows.append({"repo": repo["name"], "directory": path, "url": file_url})
    return pd.DataFrame(rows)
src/github/languages.json ADDED
The diff for this file is too large to render. See raw diff
 
src/github/relevance_filter.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

import requests
from datasets import load_dataset, Dataset
import pandas as pd
import base64

# SECURITY FIX: a real personal-access token was previously hardcoded here
# and committed to source control — revoke it on GitHub and supply the token
# via the environment instead.
token = os.environ.get("GITHUB_TOKEN", "")
# BUG FIX: the token already includes its "ghp_" prefix; the old code built
# f"token ghp_{token}", duplicating the prefix and yielding an invalid
# Authorization header. Fall back to unauthenticated requests without a token.
headers = {"Authorization": f"token {token}"} if token else {}
def parse_repo(df, tokenizer, gen_pipeline, role):
    """Build LLM chat prompts describing each repository and the skills it
    showcases, tailored to a target job role or posting.

    Args:
        df: DataFrame with one row per repo file; columns repo / directory / url
            (as produced by RepoParse.buildDataset).
        tokenizer: HF tokenizer exposing apply_chat_template / encode / decode.
        gen_pipeline: text-generation pipeline — currently unused; kept in the
            signature for the planned generation step.
        role: job title or full job-posting text.

    Returns:
        DataFrame with columns repo / description / skills, where the last two
        hold lists of prompts.
    """

    def return_role_prompts(role, task, files, tokenize=False):
        # Build one chat prompt for `task` ("describe" -> project blurb,
        # "skill" -> numbered skills list) over the accumulated file text.
        task_prompt = {"describe" : "return a one paragraph project description that shows how this project displays skills that make me good for this job role or posting:", "skill" : "return a list of skills that were showcased in this project and that recruiters look for when looking people for this job role or posting in this format 1. :"}
        assistant_prompt = {"describe":"Suggested project title: ", "skill":"1. "}
        prompts = tokenizer.apply_chat_template([{"role":"system", "content":"You are a CV coach with expertise in editing CVs for people who work in tech"},
                  {"role":"user", "content":"These are some (or all) of the files for a particular project I worked on (written in the format Directory:File_1_directory||Content:File_1_content, Directory:File_2_directory||Content:File_2_content,...):"+files+ task_prompt[task] +role+" and would be a good thing to put on my CV"},
                  {"role":"assistant", "content":assistant_prompt[task]}], tokenize = tokenize, continue_final_message=True)
        return prompts

    def chunk(text, max_length):
        # Split `text` into decoded pieces of at most max_length tokens.
        chunks = []
        tokens = tokenizer.encode(text)
        current = 0
        for _ in range(len(tokens) // max_length):
            chunks.append(tokenizer.decode(tokens[current : current + max_length]))
            current += max_length
        chunks.append(tokenizer.decode(tokens[current:]))  # remainder (may be empty)
        return chunks

    def collate(df, role):
        # Greedily pack each repo's file contents into prompts that fit the
        # token budget.
        projects = set(df.repo)
        res = []
        for project in projects:
            current_files = ""
            skills, describe = [], []
            for _, row in df[df["repo"] == project].iterrows():
                # If the accumulated prompt is already very large, split it
                # into 2500-token chunks and restart the accumulator.
                if len(return_role_prompts(role, "skill", current_files, True)) >= 7000:
                    skills.extend([return_role_prompts(role, "skill", string, False) for string in chunk(current_files, 2500)])
                    describe.extend([return_role_prompts(role, "describe", string, False) for string in chunk(current_files, 2500)])
                    current_files = ""
                doc = requests.get(row["url"], headers=headers).json()
                doc = base64.b64decode(doc["content"])  # contents API returns base64
                new = "Directory:{}||Content:{}".format(row["directory"], doc.decode("utf-8"))
                if len(return_role_prompts(role, "skill", current_files + new, True)) <= 2500:
                    current_files += new
                else:
                    skills.append(return_role_prompts(role, "skill", current_files, False))
                    describe.append(return_role_prompts(role, "describe", current_files, False))
                    current_files = new
            # BUG FIX: flush the final accumulated batch — previously a small
            # repo that never crossed a threshold yielded no prompts at all.
            if current_files:
                skills.append(return_role_prompts(role, "skill", current_files, False))
                describe.append(return_role_prompts(role, "describe", current_files, False))
            res.append({"repo": project, "description": describe, "skills": skills})
        return pd.DataFrame(res)

    # BUG FIX: the original evaluated collate(df, role) and discarded the
    # result, so the function always returned None. (A large block of dead,
    # commented-out generation code was also removed here.)
    return collate(df, role)
src/resume/CVParse.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import spacy
from spacy_layout import spaCyLayout

# Shared NLP pipeline (the medium English model ships word vectors, which
# the similarity-based section matching below relies on) and the PDF layout
# parser built on top of it.
nlp = spacy.load("en_core_web_md")
layout = spaCyLayout(nlp)

# Parse the resume into a spaCy Doc with layout spans.
# NOTE(review): hardcoded Colab sample path — the PDF should be supplied by
# the caller instead of baked in at import time.
doc = layout("/content/EbenezerMLEResume (1).pdf")

# (Removed a stray no-op list-comprehension expression left over from a
# notebook cell — it computed the section headers and discarded them.)
def extract(string, doc):
    """Return the layout spans of the resume section whose header best
    matches `string` by word-vector similarity.

    Args:
        string: query text, e.g. "skills" or "work experience".
        doc: spaCy Doc produced by spaCyLayout, carrying doc.spans["layout"].

    Returns:
        List of layout spans strictly between the matched section header and
        the next header — or through the end of the document when the match
        is the final header.

    Raises:
        ValueError: if the document contains no section headers.
    """
    headings = [s for s in doc.spans["layout"] if s.label_ == "section_header"]
    query = nlp(string)
    # Most similar heading wins; max() keeps the first on ties, matching the
    # original strict-greater comparison. Each similarity is computed once
    # (the original recomputed the current best's score every iteration and
    # printed debug output).
    match_ = max(headings, key=lambda h: h.similarity(query))
    ind = headings.index(match_)
    start = match_.id  # span.id indexes into doc.spans["layout"] here
    # BUG FIX: the original indexed headings[ind + 1] unconditionally and
    # raised IndexError whenever the best match was the LAST section header.
    if ind + 1 < len(headings):
        stop = headings[ind + 1].id
    else:
        stop = len(doc.spans["layout"])
    return [doc.spans["layout"][j] for j in range(start + 1, stop)]
src/score.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from resume import CVParse
2
+ from github import relevance_filter
3
+ from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
4
+ import torch
5
+
def init():
    """Load the code-generation model once and build a text-generation pipeline.

    Returns:
        (tokenizer, gen_pipeline) tuple. BUG FIX: the original bound
        everything to locals and returned nothing, so all the loading work
        was discarded and callers could never reach the model.
    """
    model_id = "deepseek-ai/deepseek-coder-1.3b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)

    # Run on GPU when available — the original computed `device` but never
    # passed it to the pipeline.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    gen_pipeline = pipeline(task="text-generation", model=model,
                            tokenizer=tokenizer, device=device)
    return tokenizer, gen_pipeline

def run(CV, Repo_link):
    """Placeholder entry point for the scoring pipeline.

    Args:
        CV: resume input (ignored — stub).
        Repo_link: GitHub repository link (ignored — stub).

    Returns:
        A fixed greeting string until the real pipeline is wired in.
    """
    greeting = "hello world"
    return greeting