Spaces:

Nassiraaa
/

COMPLETNESS

Sleeping

File size: 3,640 Bytes

911e595

import streamlit as st
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import concurrent.futures

@st.cache_resource
def load_model():
    """Download the quantized Mistral GGUF weights and build the llama.cpp model.

    The function is wrapped in ``st.cache_resource``, so the download and the
    model construction are meant to happen once and the resulting object is
    reused afterwards instead of being rebuilt.

    Returns:
        The ``Llama`` instance created from the downloaded GGUF file.
    """
    gguf_file = hf_hub_download(
        "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    )
    model = Llama(model_path=gguf_file, n_ctx=32768, n_gpu_layers=2)
    return model

# Initialize models
@st.cache_resource
def _load_ocr_model():
    """Build the pretrained docTR OCR predictor.

    Streamlit re-executes the whole script on every widget interaction, so a
    bare module-level ``ocr_predictor(pretrained=True)`` call would rebuild
    the predictor each time. Caching it as a resource keeps a single
    instance alive across reruns, the same way ``load_model`` does for the
    language model.
    """
    return ocr_predictor(pretrained=True)


llm = load_model()
ocr_model = _load_ocr_model()

@st.cache_data
def extract_text(pdf_bytes):
    """Run OCR over a PDF and return every recognised word as one string.

    Args:
        pdf_bytes: The PDF content handed to ``DocumentFile.from_pdf``.

    Returns:
        All word values found by the OCR model, in page / block / line / word
        order, separated by single spaces.
    """
    document = DocumentFile.from_pdf(pdf_bytes)
    ocr_output = ocr_model(document)

    # Walk the OCR result hierarchy explicitly and collect the raw word text.
    words = []
    for page in ocr_output.pages:
        for block in page.blocks:
            for line in block.lines:
                words.extend(word.value for word in line.words)

    return " ".join(words)

def _parse_section_reply(reply):
    """Turn a raw model reply into a ``(presence, explanation)`` pair.

    The expected shape is two lines, ``<section>: true/false`` followed by
    ``Explanation: ...``, but model output is free text, so the parser
    tolerates an empty reply, blank lines between the two parts, and lines
    without a colon instead of raising ``IndexError``.
    """
    lines = [line.strip() for line in reply.splitlines() if line.strip()]
    if not lines:
        # Nothing usable came back: report the section as absent.
        return False, ""

    # Verdict: the text after the first colon, or the whole line when the
    # model dropped the "<section>:" prefix.
    fields = lines[0].split(':')
    verdict = fields[1] if len(fields) > 1 else fields[0]
    # Ignore quotes or a trailing period around the verdict word.
    presence = verdict.strip(" \t'\".").lower() == 'true'

    explanation = ""
    if len(lines) > 1:
        label, sep, body = lines[1].partition(':')
        # Keep the whole line when there is no "Explanation:" style label.
        explanation = body.strip() if sep else label
    return presence, explanation


def check_cv_section(section, text):
    """Ask the language model whether ``section`` is present in the CV text.

    Args:
        section: Title of the CV section to look for.
        text: Full text of the CV.

    Returns:
        A pair of single-entry dicts keyed by ``section``: the first maps to
        a bool (section found or not), the second to the model's explanation
        (empty string when none could be extracted).
    """
    prompt = f"""Analyze the following CV text and determine if the "{section}" section exists.
Respond with 'true' if it exists, or 'false' if it doesn't.
Be aware of synonyms and variations in section titles.
CV text:
{text}
Respond in the format:
{section}: true/false
Explanation: Briefly explain your reasoning, mentioning any relevant keywords or phrases found.
"""
    response = llm(prompt, max_tokens=200)
    result = response['choices'][0]['text'].strip()
    presence, explanation = _parse_section_reply(result)
    return {section: presence}, {section: explanation}

def check_cv_sections(text):
    """Check the CV text for each expected section, one worker task per section.

    Args:
        text: Full text of the CV.

    Returns:
        ``(results, explanations)``: two dicts keyed by section title, in the
        order the sections are declared below. ``results`` holds the
        presence flags and ``explanations`` the matching explanations, as
        returned by ``check_cv_section``.
    """
    sections = [
        "Personal Information",
        "Summary and objective (About / profile)",
        "Education",
        "Work Experience",
        "Skills",
        "Languages",
        "Certificates",
        "Interests",
        "References (optional)",
    ]

    results = {}
    explanations = {}

    # NOTE(review): every task goes through the same model object inside
    # check_cv_section -- confirm that object tolerates concurrent calls.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(check_cv_section, section, text)
            for section in sections
        ]
        # Collect in submission order rather than completion order so the
        # returned dicts always list the sections in the same sequence.
        for future in futures:
            section_result, section_explanation = future.result()
            results.update(section_result)
            explanations.update(section_explanation)

    return results, explanations

def calculate_cv_score(sections):
    """Compute a completeness score from the detected CV sections.

    Each entry of the weight table is a ``|``-separated list of alternative
    keywords. An entry contributes its weight once when any section marked
    as present matches one of its keywords, case-insensitively, in either
    direction (keyword inside the title, or title inside the keyword). The
    earlier check only tested whether the title was contained in the whole
    key, so titles such as "Skills" or "Work Experience" never scored.

    Args:
        sections: Mapping of section title to a truthy/falsy presence flag.

    Returns:
        The sum of the weights of all matched essential entries.
    """
    essentials_sections = {
        "Profile | Summary": 1,
        "Skill|Expertise|Competencies": 4,
        "Education": 5,
        "Projects": 5,
        "Professional experience|Work experience": 5,
        "Languages": 2,
    }

    # Normalised titles of the sections flagged as present. Empty titles are
    # dropped because an empty string is a substring of every keyword.
    present = [name.strip().lower() for name, found in sections.items() if found]
    present = [name for name in present if name]

    score = 0
    for essential, weight in essentials_sections.items():
        keywords = [kw.strip().lower() for kw in essential.split('|') if kw.strip()]
        if any(kw in name or name in kw for kw in keywords for name in present):
            score += weight
    return score

def main():
    """Render the Streamlit page: upload a PDF CV, analyse it, show the results.

    The uploaded PDF is OCR'd with ``extract_text``, each expected section is
    checked with ``check_cv_sections``, and the per-section results plus the
    score from ``calculate_cv_score`` are written to the page.
    """
    st.title('Analyse de CV avec DocTR et Mistral')
    uploaded_file = st.file_uploader("Uploader un fichier PDF", type="pdf")

    if uploaded_file is None:
        return

    # getvalue() returns the full buffer regardless of the current stream
    # position, so it stays correct even if the file was already read once.
    pdf_bytes = uploaded_file.getvalue()
    text = extract_text(pdf_bytes)

    if st.checkbox("Afficher le texte extrait du CV"):
        st.text_area("Texte extrait du CV", text, height=200)

    # Every widget interaction reruns this function. Keep the analysis of the
    # current text in the session so toggling the checkbox does not trigger
    # another full round of model calls for the same CV.
    cached = st.session_state["cv_analysis"] if "cv_analysis" in st.session_state else None
    if cached is None or cached["text"] != text:
        sections, explanations = check_cv_sections(text)
        cached = {"text": text, "sections": sections, "explanations": explanations}
        st.session_state["cv_analysis"] = cached
    sections = cached["sections"]
    explanations = cached["explanations"]

    cv_score = calculate_cv_score(sections)

    st.header("CV Completeness")
    for section, present in sections.items():
        st.write(f"{section}: {present}")
        explanation = explanations.get(section)
        if explanation:
            st.write(f"Explanation: {explanation}")
        st.write("---")

    st.header(f'CV Score: {cv_score}')

# Run the app only when this file is executed as the main script.
if __name__ == '__main__':
    main()