File size: 7,191 Bytes
8de5910
 
 
 
 
2571271
8de5910
 
 
0f709ec
8de5910
5b9bace
 
 
 
745617e
e95d73d
8e3121f
8de5910
 
5cbefa7
 
8de5910
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a01bfa9
 
5cbefa7
8de5910
 
a01bfa9
 
8de5910
 
 
 
 
513961e
f2bc9af
8de5910
a03114f
5c7d67b
 
1fe0fb7
5c7d67b
fa66265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c7d67b
 
fa66265
 
5c7d67b
fa66265
 
 
 
 
 
 
5c7d67b
1fe0fb7
5c7d67b
 
 
 
 
1fe0fb7
5c7d67b
 
1fe0fb7
 
5c7d67b
1fe0fb7
 
 
 
 
 
 
a01bfa9
 
1fe0fb7
a01bfa9
 
 
3f31391
9ce69b5
 
3f31391
 
 
 
 
 
808b505
a01bfa9
 
 
a03114f
a01bfa9
5cbefa7
8de5910
 
 
 
a95e421
1a00369
8de5910
ba1506c
8de5910
 
a01bfa9
 
5cbefa7
8de5910
f3c0445
8de5910
 
3fe49a8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
from tqdm import tqdm
import re
import gradio as gr
import os
import accelerate
# import spaces
import subprocess
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from docling.document_converter import DocumentConverter

from huggingface_hub import login

# Authenticate against the Hugging Face Hub using the HF_TOKEN environment
# variable; required because the model repo below is presumably gated/private
# — TODO confirm.
login(token = os.getenv('HF_TOKEN'))

# Fine-tuned GGUF model used by pdf_to_text() further down.
repo_id = "SyntheticIAI/CVCRaft"
model_id = "fine_tuned_llama.gguf"


# Download the model file at import time into ./models so that
# Llama(model_path="models/" + model_id) can load it later.
hf_hub_download(
    repo_id=repo_id,
    filename=model_id,
    local_dir = "./models"
)

def process_document(pdf_path):
    """Return a mapping of page id -> extracted text for the PDF at *pdf_path*.

    NOTE(review): relies on ``extract_pages`` (pdfminer.six) which is not
    imported anywhere in this file — confirm the missing import before use.
    """
    return {
        page.pageid: process_page(page)
        for page in tqdm(extract_pages(pdf_path))
    }


def process_page(extracted_page):
    """Return the normalized text of one pdfminer page, in top-to-bottom order.

    NOTE(review): ``LTTextContainer`` (pdfminer.six) is not imported in this
    file — confirm the missing import before use.
    """
    # Layout objects are sorted by their top y-coordinate, descending, which
    # corresponds to reading order down the page.
    ordered = sorted(extracted_page._objs, key=lambda obj: obj.y1, reverse=True)
    pieces = [
        extract_text_and_normalize(obj)
        for obj in ordered
        if isinstance(obj, LTTextContainer)
    ]
    # Collapse runs of consecutive newlines into a single newline.
    return re.sub(r'\n+', '\n', ''.join(pieces))


def extract_text_and_normalize(element):
    """Return the text of a pdfminer text element with whitespace normalized.

    Each physical line is stripped. Blank lines become a single newline.
    Non-blank lines have internal whitespace collapsed to single spaces and
    are joined to the next line with a space when they end in a word, comma
    or hyphen character (a soft line wrap) or with a newline otherwise.
    """
    norm_text = ''
    for line_text in element.get_text().split('\n'):
        line_text = line_text.strip()
        if not line_text:
            norm_text += '\n'
            continue
        # Raw strings: '\s' and '\,' are invalid escape sequences in plain
        # string literals and raise SyntaxWarning on modern CPython.
        line_text = re.sub(r'\s+', ' ', line_text)
        if re.search(r'[\w\d,\-]', line_text[-1]):
            # Line ends mid-phrase: treat as a soft wrap, join with a space.
            norm_text += line_text + ' '
        else:
            norm_text += line_text + '\n'
    return norm_text


def txt_to_html(text):
    """Wrap each line of *text* in a ``<p>`` tag inside a minimal HTML document.

    Builds the body with ``str.join`` instead of repeated ``+=`` (which is
    quadratic in the number of lines).

    NOTE(review): lines are not HTML-escaped — confirm the input is trusted
    before rendering this anywhere user-controlled text can reach.
    """
    paragraphs = ''.join(
        "<p>{}</p>".format(line.strip()) for line in text.split('\n')
    )
    return "<html><body>" + paragraphs + "</body></html>"

def craft_cv(llm, prompt, maxtokens, temperature, top_probability):
    """Run the chat model on *prompt* and return ``(cv_text, revised_cv)``.

    ``cv_text`` is always ``''`` here; it is kept in the return tuple for
    backward compatibility with callers that unpack two values.

    Fixes: llama-cpp-python's ``create_chat_completion`` expects OpenAI-style
    ``"role"``/``"content"`` message keys, not the ShareGPT-style
    ``"from"``/``"value"`` used previously; and ``top_probability`` was
    accepted but never forwarded to the model.
    """
    output = llm.create_chat_completion(
        messages=[
            {"role": "user", "content": prompt},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
        top_p=top_probability,
    )
    output = output['choices'][0]['message']['content']
    cv_text = ''
    return cv_text, output

def convert_to_json(llm, cv_text, maxtokens, temperature, top_probability):
    """Ask the chat model to restructure *cv_text* into the app's JSON schema.

    Returns the model's raw text response (expected to be JSON, but not
    validated here — TODO confirm downstream consumers parse/validate it).

    Fixes: message dict now uses the OpenAI-style ``"role"``/``"content"``
    keys required by llama-cpp-python's ``create_chat_completion`` (the
    previous ``"from"``/``"value"`` keys are the ShareGPT training format),
    and ``top_probability`` is now actually forwarded as ``top_p``.
    """
    # Prompt template (kept byte-for-byte — it is part of runtime behavior).
    json_format = """
            You are an expert at structuring resumes in JSON format. Given a modified resume text, extract the relevant details and convert them into the following structured JSON format:
            
            {
                  "profileDetails": {
                    "firstName": "",
                    "lastName": "",
                    "email": "",
                    "contact": "",
                    "country": "",
                    "jobTitle": "",
                    "social": "",
                    "profileDesc": "",
                    "address": "",
                    "city": "",
                    "state": "",
                    "zipCode": ""
                  },
                  "professionalExperience": [
                    {
                      "positionTitle": "",
                      "location": "",
                      "company": "",
                      "description": "",
                      "startDate": "",
                      "endDate": ""
                    }
                  ],
                  "education": [
                    {
                      "institute": "",
                      "schoolLocation": "",
                      "degree": "",
                      "field": "",
                      "grade": "",
                      "startDate": "",
                      "endDate": ""
                    }
                  ],
                  "skills": [""],
                  "hobbies": [""],
                  "languages": [""],
                  "certifications": [""],
                  "projects": [
                    {
                      "title": "",
                      "description": ""
                    }
                  ],
                  "jobPreferences": {
                    "compTarget": "",
                    "strength": "",
                    "roleTarget": ""
                  },
                  "jobDescription": ""
                }
            
            Instructions:
            - Extract details accurately from the given resume.
            - Ensure proper structuring of dates, responsibilities, and projects.
            - If a field is missing in the input, leave it as an empty string or an empty list where applicable.
            - Maintain proper formatting and avoid unnecessary additions.
            
            Provide the response in a valid JSON format with no additional explanations.
            """
    output = llm.create_chat_completion(
        messages=[
            {"role": "user", "content": json_format + ' CV text: ' + cv_text},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
        top_p=top_probability,
    )
    output = output['choices'][0]['message']['content']
    return output

def pdf_to_text(prompt, maxtokens=2048, temperature=0, top_probability=0.95):
    """Generate a job-tailored CV (as a JSON string) for the given *prompt*.

    *prompt* is free text from the Gradio textbox; it is expected to contain
    the CV and job description — TODO confirm against the UI wiring below.
    Loads the fine-tuned GGUF model downloaded at module import time.

    NOTE(review): the model is re-loaded on every call, which is expensive;
    consider caching the Llama instance if request latency matters.
    """
    llm = Llama(
        model_path="models/" + model_id,
        flash_attn=True,
        n_gpu_layers=81,
        n_batch=1024,
        n_ctx=8192,
    )
    print('MAX TOKENS IS ', maxtokens)  # fixed typo: was 'MAX TONENS IS '
    # craft_cv returns (cv_text, revised_cv); cv_text is always '' here.
    _, crafted_cv = craft_cv(llm, prompt, maxtokens, temperature, top_probability)
    # Second pass: restructure the revised CV into the app's JSON schema.
    crafted_cv = convert_to_json(llm, crafted_cv, maxtokens, temperature, top_probability)
    return crafted_cv

# Gradio UI wiring. NOTE(review): the widgets below (sliders, file upload,
# radio, textboxes) are defined but NOT used by the Interface — it takes a
# single plain 'text' input. Either wire them into `inputs=` or remove them.
temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
max_tokens = gr.Number(value=600, label="Max Tokens")
cv_file = gr.File(label='Upload the CV')
prompt_text = gr.Textbox(label='Enter the job description')
output_text = gr.Textbox()
llm_type = gr.Radio(["Fine tuned Llama3"])
iface = gr.Interface(
    fn=pdf_to_text,
    # inputs=[cv_file, prompt_text, llm_type],
    inputs=['text'],
    outputs=['text'],
    title='Craft CV',
    description="This application assists to customize CV based on input job description",
    theme=gr.themes.Soft(),
)
# Blocks here serving the web app; module-level side effect on import.
iface.launch()