Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from doctr.models import ocr_predictor
|
| 3 |
+
from doctr.io import DocumentFile
|
| 4 |
+
from huggingface_hub import hf_hub_download
|
| 5 |
+
from llama_cpp import Llama
|
| 6 |
+
import concurrent.futures
|
| 7 |
+
|
| 8 |
+
# Download the model (do this only once, outside of any function)
@st.cache_resource
def load_model(repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
               filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf"):
    """Download a GGUF checkpoint from the Hub and load it with llama.cpp.

    st.cache_resource makes this a process-wide singleton, so the multi-GB
    download and model load only happen on the first call.

    Args:
        repo_id: Hugging Face Hub repository holding the GGUF file.
        filename: Quantized weight file inside that repository.

    Returns:
        A ready-to-call ``Llama`` instance.
    """
    model_path = hf_hub_download(repo_id, filename=filename)
    # n_ctx=32768 matches Mistral v0.2's context window; n_gpu_layers=2
    # offloads only two layers — presumably sized for the Space's small
    # GPU. NOTE(review): confirm against the deployment hardware.
    return Llama(model_path=model_path, n_ctx=32768, n_gpu_layers=2)
|
| 13 |
+
|
| 14 |
+
# Initialize models at import time so every Streamlit rerun reuses them.
# load_model() is cached via st.cache_resource, so this is cheap after the
# first run; ocr_predictor downloads pretrained DocTR OCR weights.
llm = load_model()
ocr_model = ocr_predictor(pretrained=True)
|
| 17 |
+
|
| 18 |
+
@st.cache_data
def extract_text(pdf_bytes):
    """OCR a PDF given as raw bytes and return every recognized word
    joined into one space-separated string.

    Cached by st.cache_data, keyed on the PDF bytes, so re-uploading the
    same file skips the OCR pass.
    """
    document = DocumentFile.from_pdf(pdf_bytes)
    ocr_result = ocr_model(document)
    # Walk the DocTR hierarchy: page -> block -> line -> word.
    words = []
    for page in ocr_result.pages:
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    words.append(word.value)
    return " ".join(words)
|
| 23 |
+
|
| 24 |
+
def check_cv_section(section, text):
    """Ask the LLM whether *section* appears in the CV *text*.

    Args:
        section: Human-readable section title to look for.
        text: Full OCR-extracted CV text.

    Returns:
        A pair of single-entry dicts:
        ``({section: presence_bool}, {section: explanation_str})``.
    """
    prompt = f"""Analyze the following CV text and determine if the "{section}" section exists.
Respond with 'true' if it exists, or 'false' if it doesn't.
Be aware of synonyms and variations in section titles.
CV text:
{text}
Respond in the format:
{section}: true/false
Explanation: Briefly explain your reasoning, mentioning any relevant keywords or phrases found.
"""
    response = llm(prompt, max_tokens=200)
    result = response['choices'][0]['text'].strip()
    parts = result.split('\n')
    # The model's reply is free-form text; it may ignore the requested
    # "<section>: true/false" format. Guard the parsing instead of raising
    # IndexError on a missing ':' (the original crashed in that case).
    _, sep, verdict = parts[0].partition(':')
    presence = sep != '' and verdict.strip().lower().startswith('true')
    if len(parts) > 1 and ':' in parts[1]:
        explanation = parts[1].split(':', 1)[1].strip()
    else:
        explanation = ""
    return {section: presence}, {section: explanation}
|
| 40 |
+
|
| 41 |
+
def check_cv_sections(text):
    """Query the LLM once per expected CV section, fanning out over threads.

    Args:
        text: Full OCR-extracted CV text.

    Returns:
        ``(results, explanations)`` — two dicts keyed by section title,
        mapping to a presence boolean and an explanation string.
    """
    section_titles = (
        "Personal Information",
        "Summary and objective (About / profile)",
        "Education",
        "Work Experience",
        "Skills",
        "Languages",
        "Certificates",
        "Interests",
        "References (optional)",
    )

    presence_by_section = {}
    explanation_by_section = {}

    # Per-section LLM calls are independent and I/O-bound, so run them in a
    # thread pool; completion order is irrelevant because results land in
    # dicts keyed by section title.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = [pool.submit(check_cv_section, title, text)
                   for title in section_titles]
        for done in concurrent.futures.as_completed(pending):
            found, why = done.result()
            presence_by_section.update(found)
            explanation_by_section.update(why)

    return presence_by_section, explanation_by_section
|
| 65 |
+
|
| 66 |
+
def calculate_cv_score(sections):
    """Score CV completeness (0–22) from detected sections.

    Args:
        sections: Mapping of section title -> presence flag, as produced
            by ``check_cv_sections``.

    Returns:
        Sum of the weights of every essential whose name — or one of its
        '|'-separated synonyms — matches a present section title.

    Bug fixed: the original compared each whole key (e.g.
    "Skill|Expertise|Competencies") against section titles with a
    one-directional substring test, so "Skills", "Work Experience" and
    "Summary and objective (About / profile)" could never earn points.
    Synonyms are now split on '|' and matched case-insensitively in both
    directions.
    """
    # Weight per essential; '|' separates accepted synonyms for a section.
    essentials_sections = {
        "Profile | Summary": 1,
        "Skill|Expertise|Competencies": 4,
        "Education": 5,
        "Projects": 5,
        "Professional experience|Work Experience": 5,
        "Languages": 2
    }

    present = [title.lower() for title, found in sections.items() if found]

    def _matches(essential):
        # A synonym matches when it contains, or is contained in, a present
        # section title ("skill" matches "skills", "profile" matches
        # "summary and objective (about / profile)").
        for synonym in (alt.strip().lower() for alt in essential.split('|')):
            if any(synonym in title or title in synonym for title in present):
                return True
        return False

    return sum(weight for essential, weight in essentials_sections.items()
               if _matches(essential))
|
| 78 |
+
|
| 79 |
+
def main():
    """Streamlit entry point: upload a CV PDF, OCR it, then report which
    sections the LLM detects plus an overall completeness score."""
    st.title('Analyse de CV avec DocTR et Mistral')
    uploaded_file = st.file_uploader("Uploader un fichier PDF", type="pdf")

    # Nothing to do until the user provides a file.
    if uploaded_file is None:
        return

    pdf_bytes = uploaded_file.read()
    text = extract_text(pdf_bytes)

    # Optional debug view of the raw OCR output.
    if st.checkbox("Afficher le texte extrait du CV"):
        st.text_area("Texte extrait du CV", text, height=200)

    sections, explanations = check_cv_sections(text)
    cv_score = calculate_cv_score(sections)

    st.header("CV Completeness")
    for section, present in sections.items():
        st.write(f"{section}: {present}")
        if explanations[section]:
            st.write(f"Explanation: {explanations[section]}")
        st.write("---")

    st.header(f'CV Score: {cv_score}')
|
| 101 |
+
|
| 102 |
+
# Run the Streamlit app when executed as a script.
if __name__ == '__main__':
    main()
|