# Streamlit Space: CV completeness analysis with DocTR (OCR) and Mistral-7B (llama.cpp).
import streamlit as st
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import concurrent.futures
# Download the model (do this only once, outside of any function)
@st.cache_resource
def load_model():
    """Fetch the quantized Mistral-7B-Instruct GGUF weights and build the Llama runtime.

    Wrapped in st.cache_resource so the download and model construction
    happen once per server process, not on every Streamlit rerun.
    """
    weights = hf_hub_download(
        "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    )
    # n_ctx=32768: large context so a full CV fits in one prompt.
    # n_gpu_layers=2: offload a couple of layers — presumably tuned for the
    # Space's hardware; TODO confirm against deployment target.
    return Llama(model_path=weights, n_ctx=32768, n_gpu_layers=2)
# Initialize models
# Module-level singletons shared by the functions below; load_model() is
# cached via st.cache_resource (see above), so reruns reuse one Llama instance.
llm = load_model()
ocr_model = ocr_predictor(pretrained=True)
@st.cache_data
def extract_text(pdf_bytes):
    """OCR a PDF given as raw bytes and return every recognized word, space-joined.

    Cached by st.cache_data, so re-uploading the same file skips the OCR pass.
    """
    document = DocumentFile.from_pdf(pdf_bytes)
    ocr_result = ocr_model(document)
    words = []
    # Walk the DocTR hierarchy: page -> block -> line -> word.
    for page in ocr_result.pages:
        for block in page.blocks:
            for line in block.lines:
                words.extend(word.value for word in line.words)
    return " ".join(words)
def check_cv_section(section, text):
    """Ask the LLM whether a named section is present in the CV text.

    Parameters:
        section: human-readable section title (e.g. "Education").
        text: full OCR-extracted CV text.

    Returns:
        ({section: bool}, {section: str}) — a presence flag and the model's
        explanation (empty string when the response is malformed).
    """
    prompt = f"""Analyze the following CV text and determine if the "{section}" section exists.
Respond with 'true' if it exists, or 'false' if it doesn't.
Be aware of synonyms and variations in section titles.
CV text:
{text}
Respond in the format:
{section}: true/false
Explanation: Briefly explain your reasoning, mentioning any relevant keywords or phrases found.
"""
    response = llm(prompt, max_tokens=200)
    result = response['choices'][0]['text'].strip()
    parts = result.split('\n')
    # The LLM output is free-form: guard against replies that do not follow
    # the "<section>: true/false" format instead of raising IndexError.
    first = parts[0].split(':', 1)
    presence = len(first) > 1 and first[1].strip().lower() == 'true'
    if len(parts) > 1 and ':' in parts[1]:
        explanation = parts[1].split(':', 1)[1].strip()
    else:
        explanation = ""
    return {section: presence}, {section: explanation}
def check_cv_sections(text):
    """Check the CV text for every expected section concurrently.

    Returns:
        (results, explanations): two dicts keyed by section name — a bool
        presence flag and the LLM's textual explanation.
    """
    section_names = (
        "Personal Information",
        "Summary and objective (About / profile)",
        "Education",
        "Work Experience",
        "Skills",
        "Languages",
        "Certificates",
        "Interests",
        "References (optional)",
    )
    results, explanations = {}, {}
    # Fan the per-section LLM calls out over a thread pool and merge each
    # result as soon as it completes.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = [pool.submit(check_cv_section, name, text) for name in section_names]
        for done in concurrent.futures.as_completed(pending):
            found, why = done.result()
            results.update(found)
            explanations.update(why)
    return results, explanations
def calculate_cv_score(sections):
    """Score a CV from the per-section presence flags.

    Parameters:
        sections: dict mapping section name -> bool (present or not),
            as produced by check_cv_sections().

    Returns:
        int total of the weights of matched essential sections.

    Bug fixed: the original checked whether a whole section NAME was a
    substring of the essentials KEY, so e.g. "Skills" could never match
    "Skill|Expertise|Competencies" and "Summary and objective (About /
    profile)" could never match "Profile | Summary" — only "Education" and
    "Languages" ever scored. We now split each key on '|' into keywords and
    match each keyword against present section names in both directions,
    case-insensitively.
    """
    essentials_sections = {
        "Profile | Summary": 1,
        "Skill|Expertise|Competencies": 4,
        "Education": 5,
        "Projects": 5,
        "Professional experience": 5,
        "Languages": 2,
    }
    present = [name.lower() for name, found in sections.items() if found]
    score = 0
    for essential, value in essentials_sections.items():
        keywords = [kw.strip().lower() for kw in essential.split("|")]
        # Substring match in either direction so "skill" matches "skills"
        # and "summary" matches "summary and objective (about / profile)".
        # NOTE(review): "Professional experience" still cannot match the
        # checker's "Work Experience" label — the keyword lists disagree.
        if any(kw in name or name in kw for kw in keywords for name in present):
            score += value
    return score
def main():
    """Streamlit entry point: upload a PDF CV, OCR it, and report completeness."""
    st.title('Analyse de CV avec DocTR et Mistral')
    uploaded_file = st.file_uploader("Uploader un fichier PDF", type="pdf")
    if uploaded_file is None:
        # Nothing uploaded yet — render only the uploader.
        return
    pdf_bytes = uploaded_file.read()
    text = extract_text(pdf_bytes)
    if st.checkbox("Afficher le texte extrait du CV"):
        st.text_area("Texte extrait du CV", text, height=200)
    sections, explanations = check_cv_sections(text)
    cv_score = calculate_cv_score(sections)
    st.header("CV Completeness")
    for name, found in sections.items():
        st.write(f"{name}: {found}")
        if explanations[name]:
            st.write(f"Explanation: {explanations[name]}")
        st.write("---")
    st.header(f'CV Score: {cv_score}')
# Script entry point. (Removed the trailing " |" scrape artifact, which was
# a syntax error.)
if __name__ == '__main__':
    main()