# Hugging Face Space (status: Sleeping)
| from tqdm import tqdm | |
| import re | |
| import gradio as gr | |
| import os | |
| import accelerate | |
| # import spaces | |
| import subprocess | |
| from huggingface_hub import hf_hub_download | |
| from llama_cpp import Llama | |
| from docling.document_converter import DocumentConverter | |
| from huggingface_hub import login | |
# Authenticate with the Hugging Face Hub only when a token is configured.
# Calling login(token=None) falls back to an interactive prompt, which cannot
# be answered when the app runs headless.
_hf_token = os.getenv('HF_TOKEN')
if _hf_token:
    login(token=_hf_token)

# Repository and file name of the GGUF model used by this app.
repo_id = "SyntheticIAI/CVCRaft"
model_id = "fine_tuned_llama.gguf"

# Download the model file into ./models; pdf_to_text() later opens it from
# "models/" + model_id.
hf_hub_download(
    repo_id=repo_id,
    filename=model_id,
    local_dir="./models",
)
def process_document(pdf_path):
    """Build a ``{page id: page text}`` mapping for the document at ``pdf_path``.

    Every page yielded by ``extract_pages(pdf_path)`` is converted with
    ``process_page`` and stored under the page's ``pageid``. The iteration is
    wrapped in ``tqdm`` so progress is displayed while pages are processed.

    NOTE(review): ``extract_pages`` is not imported in the visible part of
    this file (presumably ``pdfminer.high_level.extract_pages``) - confirm.
    """
    return {
        page.pageid: process_page(page)
        for page in tqdm(extract_pages(pdf_path))
    }
def process_page(extracted_page):
    """Return the text of one extracted page as a single string.

    The page's objects are ordered by descending ``y1``, only
    ``LTTextContainer`` instances are kept, each one is normalized with
    ``extract_text_and_normalize``, and the pieces are concatenated. Runs of
    consecutive newlines in the result are collapsed to a single newline.

    NOTE(review): ``LTTextContainer`` is not imported in the visible part of
    this file (presumably ``pdfminer.layout.LTTextContainer``) - confirm.
    """
    # Descending y1; presumably this puts the top of the page first - verify
    # against the layout library's coordinate system.
    ordered_objs = sorted(
        extracted_page._objs,
        key=lambda obj: obj.y1,
        reverse=True,
    )
    fragments = [
        extract_text_and_normalize(obj)
        for obj in ordered_objs
        if isinstance(obj, LTTextContainer)
    ]
    return re.sub('\n+', '\n', ''.join(fragments))
def extract_text_and_normalize(element):
    """Flatten the text of a layout element into normalized running text.

    The element's text is split on newlines and each piece is handled
    independently:

    * a piece that is empty after stripping becomes a single newline;
    * any other piece has its whitespace runs collapsed to one space and is
      followed by a space when its last character is a word character,
      comma or hyphen, or by a newline when it ends in anything else.

    Args:
        element: Any object whose ``get_text()`` method returns a string.

    Returns:
        The concatenation of all normalized pieces.
    """
    pieces = []
    for raw_line in element.get_text().split('\n'):
        line = raw_line.strip()
        if not line:
            pieces.append('\n')
            continue
        # Raw strings: "\s" and "\w" are regex escapes, not string escapes,
        # and trigger an invalid-escape warning in a plain string literal.
        line = re.sub(r'\s+', ' ', line)
        ends_mid_sentence = re.search(r'[\w\d\,\-]', line[-1])
        pieces.append(line + (' ' if ends_mid_sentence else '\n'))
    # Join once instead of growing a string inside the loop.
    return ''.join(pieces)
def txt_to_html(text):
    """Wrap plain text in a minimal HTML document, one ``<p>`` per line.

    Each line of ``text`` is stripped of surrounding whitespace and escaped
    so that ``<``, ``>`` and ``&`` in the input are shown literally instead
    of being interpreted as markup.

    Args:
        text: Plain text; lines are separated by ``"\\n"``.

    Returns:
        A string of the form ``<html><body><p>...</p>...</body></html>``.
        An empty input yields a document with one empty paragraph.
    """
    import html  # local import: the module header does not import it

    paragraphs = "".join(
        f"<p>{html.escape(line.strip(), quote=False)}</p>"
        for line in text.split('\n')
    )
    return f"<html><body>{paragraphs}</body></html>"
def craft_cv(llm, prompt, maxtokens, temperature, top_probability):
    """Send ``prompt`` to the model as a single user message and return its reply.

    The prompt is sent unchanged; no instruction text is prepended here, so
    the caller is responsible for including any instructions in ``prompt``.

    Args:
        llm: Object exposing ``create_chat_completion`` (``pdf_to_text``
            passes a ``llama_cpp.Llama`` instance).
        prompt: Text of the user message.
        maxtokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        top_probability: Forwarded as ``top_p``. It was previously accepted
            but never passed to the model.

    Returns:
        A ``(cv_text, output)`` tuple. ``cv_text`` is always an empty string
        (kept so existing callers can still unpack two values); ``output``
        is the content of the first completion choice.
    """
    response = llm.create_chat_completion(
        messages=[
            # NOTE(review): the keys are "from"/"value" rather than the
            # "role"/"content" pair documented by llama-cpp-python;
            # presumably they match the chat template stored in the
            # fine-tuned GGUF - confirm before changing.
            {"from": "user", "value": prompt},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
        top_p=top_probability,
    )
    output = response['choices'][0]['message']['content']
    cv_text = ''
    return cv_text, output
# Prompt sent ahead of the CV text in convert_to_json(). The text is part of
# the model input, so keep it unchanged when editing the surrounding code.
_JSON_FORMAT_PROMPT = """
You are an expert at structuring resumes in JSON format. Given a modified resume text, extract the relevant details and convert them into the following structured JSON format:
{
"profileDetails": {
"firstName": "",
"lastName": "",
"email": "",
"contact": "",
"country": "",
"jobTitle": "",
"social": "",
"profileDesc": "",
"address": "",
"city": "",
"state": "",
"zipCode": ""
},
"professionalExperience": [
{
"positionTitle": "",
"location": "",
"company": "",
"description": "",
"startDate": "",
"endDate": ""
}
],
"education": [
{
"institute": "",
"schoolLocation": "",
"degree": "",
"field": "",
"grade": "",
"startDate": "",
"endDate": ""
}
],
"skills": [""],
"hobbies": [""],
"languages": [""],
"certifications": [""],
"projects": [
{
"title": "",
"description": ""
}
],
"jobPreferences": {
"compTarget": "",
"strength": "",
"roleTarget": ""
},
"jobDescription": ""
}
Instructions:
- Extract details accurately from the given resume.
- Ensure proper structuring of dates, responsibilities, and projects.
- If a field is missing in the input, leave it as an empty string or an empty list where applicable.
- Maintain proper formatting and avoid unnecessary additions.
Provide the response in a valid JSON format with no additional explanations.
"""


def convert_to_json(llm, cv_text, maxtokens, temperature, top_probability):
    """Ask the model to restructure ``cv_text`` into the JSON resume schema.

    The user message is the JSON-format prompt followed by ``' CV text: '``
    and ``cv_text``.

    Args:
        llm: Object exposing ``create_chat_completion`` (``pdf_to_text``
            passes a ``llama_cpp.Llama`` instance).
        cv_text: Resume text to convert.
        maxtokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        top_probability: Forwarded as ``top_p``. It was previously accepted
            but never passed to the model.

    Returns:
        The content of the first completion choice, exactly as produced by
        the model. It is not parsed or validated as JSON here.
    """
    response = llm.create_chat_completion(
        messages=[
            # NOTE(review): "from"/"value" keys presumably match the chat
            # template stored in the fine-tuned GGUF - confirm before
            # switching to "role"/"content".
            {"from": "user", "value": _JSON_FORMAT_PROMPT + ' CV text: ' + cv_text},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
        top_p=top_probability,
    )
    return response['choices'][0]['message']['content']
def pdf_to_text(prompt, maxtokens=2048, temperature=0, top_probability=0.95):
    """Gradio handler: generate a revised CV from ``prompt`` and return it as JSON text.

    A ``Llama`` model is constructed from ``"models/" + model_id`` on every
    call. The prompt goes through ``craft_cv`` and the model's reply is then
    passed to ``convert_to_json``; the string returned by that second call is
    the result.

    Args:
        prompt: Text passed to ``craft_cv`` as the user message.
        maxtokens: Token limit handed to both model calls.
        temperature: Sampling temperature handed to both model calls.
        top_probability: Handed to both model calls.

    Returns:
        The string produced by ``convert_to_json``.
    """
    llama_options = {
        "model_path": "models/" + model_id,
        "flash_attn": True,
        "n_gpu_layers": 81,
        "n_batch": 1024,
        "n_ctx": 8192,
    }
    model = Llama(**llama_options)
    print('MAX TONENS IS ', maxtokens)

    # Both stages share the same generation settings.
    generation_args = (maxtokens, temperature, top_probability)
    _, revised_cv = craft_cv(model, prompt, *generation_args)
    return convert_to_json(model, revised_cv, *generation_args)
# Standalone widgets. They are constructed here but are not passed to the
# Interface below, which currently takes a single plain text input.
temp_slider = gr.Slider(label="Temperature Value", minimum=0, maximum=2, value=0.9)
prob_slider = gr.Slider(label="Max Probability Value", minimum=0, maximum=1, value=0.95)
max_tokens = gr.Number(label="Max Tokens", value=600)
cv_file = gr.File(label='Upload the CV')
prompt_text = gr.Textbox(label='Enter the job description')
output_text = gr.Textbox()
llm_type = gr.Radio(["Fine tuned Llama3"])

# One text box in, one text box out; pdf_to_text receives only the prompt and
# therefore runs with its default generation settings.
iface = gr.Interface(
    title='Craft CV',
    description="This application assists to customize CV based on input job description",
    theme=gr.themes.Soft(),
    fn=pdf_to_text,
    inputs=['text'],
    outputs=['text'],
)

# Start the web app as soon as the module is executed.
iface.launch()