"""Streamlit app: OCR an uploaded CV (PDF) with DocTR, ask a local Mistral
model which CV sections are present, and compute a completeness score."""

import concurrent.futures
import re

import streamlit as st
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Patterns used to pull the verdict and the explanation out of the raw LLM reply.
_VERDICT_RE = re.compile(r"\b(true|false)\b", re.IGNORECASE)
_EXPLANATION_RE = re.compile(r"explanation\s*:", re.IGNORECASE)


# Download the model (do this only once, outside of any function)
@st.cache_resource
def load_model():
    """Download the quantized Mistral GGUF file and return a ``Llama`` instance.

    Cached with ``st.cache_resource`` so the download/load is not repeated on
    every Streamlit rerun.
    """
    model_path = hf_hub_download(
        "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    )
    return Llama(model_path=model_path, n_ctx=32768, n_gpu_layers=2)


@st.cache_resource
def load_ocr_model():
    """Build the pretrained DocTR OCR predictor once and reuse it across reruns.

    Streamlit re-executes the whole script on every interaction; without the
    cache the predictor would be rebuilt each time.
    """
    return ocr_predictor(pretrained=True)


# Initialize models
llm = load_model()
ocr_model = load_ocr_model()


@st.cache_data
def extract_text(pdf_bytes):
    """Run OCR on the PDF bytes and return all recognized words joined by spaces."""
    doc = DocumentFile.from_pdf(pdf_bytes)
    result = ocr_model(doc)
    return " ".join(
        word.value
        for page in result.pages
        for block in page.blocks
        for line in block.lines
        for word in line.words
    )


def _parse_section_response(raw: str) -> tuple[bool, str]:
    """Extract ``(present, explanation)`` from the model's raw reply.

    Tolerates replies that are empty, lack a colon, put the verdict and the
    explanation on one line, insert blank lines, or decorate the verdict with
    punctuation/capitalization (e.g. ``True.``). Anything without an explicit
    ``true`` verdict is treated as "not present".
    """
    text = raw.strip()
    marker = _EXPLANATION_RE.search(text)
    if marker:
        head = text[:marker.start()]
        tail = text[marker.end():].strip()
    else:
        # No "Explanation:" label: first line is the verdict, the next
        # non-empty line (after its label, if any) is the explanation.
        head, _, rest = text.partition("\n")
        tail = rest.strip()
        first_line = tail.splitlines()[0] if tail else ""
        _, sep, after = first_line.partition(":")
        if sep:
            tail = after.strip()

    verdict = _VERDICT_RE.search(head)
    present = verdict is not None and verdict.group(1).lower() == "true"
    # Keep only the first line so trailing model chatter is not displayed.
    explanation = tail.splitlines()[0].strip() if tail else ""
    return present, explanation


def check_cv_section(section, text):
    """Ask the LLM whether ``section`` appears in the CV ``text``.

    Returns:
        A pair ``({section: bool}, {section: str})`` holding the presence
        verdict and the model's short explanation (possibly empty).
    """
    prompt = (
        f'Analyze the following CV text and determine if the "{section}" section exists. '
        "Respond with 'true' if it exists, or 'false' if it doesn't. "
        "Be aware of synonyms and variations in section titles. "
        f"CV text: {text} "
        f"Respond in the format: {section}: true/false "
        "Explanation: Briefly explain your reasoning, "
        "mentioning any relevant keywords or phrases found. \n"
    )
    response = llm(prompt, max_tokens=200)
    presence, explanation = _parse_section_response(response['choices'][0]['text'])
    return {section: presence}, {section: explanation}


def check_cv_sections(text):
    """Check every known CV section against ``text``.

    Returns:
        ``(results, explanations)``: two dicts keyed by section name, in the
        same order as the section list below.
    """
    sections = [
        "Personal Information",
        "Summary and objective (About / profile)",
        "Education",
        "Work Experience",
        "Skills",
        "Languages",
        "Certificates",
        "Interests",
        "References (optional)",
    ]
    results = {}
    explanations = {}
    # NOTE(review): all workers call the same module-level `llm` instance;
    # confirm llama-cpp-python tolerates concurrent calls on one Llama object.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(check_cv_section, section, text)
            for section in sections
        ]
        # Collect in submission order (not completion order) so the displayed
        # section order is stable from run to run.
        for future in futures:
            section_result, section_explanation = future.result()
            results.update(section_result)
            explanations.update(section_explanation)
    return results, explanations


def calculate_cv_score(sections):
    """Sum the weights of the essential sections found in the CV.

    Args:
        sections: mapping of detected section name -> bool (present or not).

    Each essential entry lists ``|``-separated keyword alternatives; it counts
    when any alternative occurs in a present section's name (case-insensitive),
    or when the section name is itself a substring of the entry.
    """
    essentials_sections = {
        "Profile | Summary": 1,
        "Skill|Expertise|Competencies": 4,
        "Education": 5,
        # NOTE(review): no "Projects" section is queried by check_cv_sections,
        # so this weight is only reachable for callers passing their own names.
        "Projects": 5,
        "Professional experience|Work experience": 5,
        "Languages": 2,
    }
    present = [
        name.strip().lower()
        for name, found in sections.items()
        if found and name.strip()
    ]
    score = 0
    for essential, weight in essentials_sections.items():
        essential_lower = essential.lower()
        keywords = [kw.strip() for kw in essential_lower.split("|") if kw.strip()]
        matched = any(
            name in essential_lower or any(kw in name for kw in keywords)
            for name in present
        )
        if matched:
            score += weight
    return score


def main():
    """Streamlit entry point: upload a PDF, show detected sections and the score."""
    st.title('Analyse de CV avec DocTR et Mistral')

    uploaded_file = st.file_uploader("Uploader un fichier PDF", type="pdf")
    if uploaded_file is None:
        return

    # getvalue() returns the full buffer regardless of the stream position.
    pdf_bytes = uploaded_file.getvalue()
    text = extract_text(pdf_bytes)

    if st.checkbox("Afficher le texte extrait du CV"):
        st.text_area("Texte extrait du CV", text, height=200)

    sections, explanations = check_cv_sections(text)
    cv_score = calculate_cv_score(sections)

    st.header("CV Completeness")
    for section, present in sections.items():
        st.write(f"{section}: {present}")
        explanation = explanations.get(section)
        if explanation:
            st.write(f"Explanation: {explanation}")
        st.write("---")

    st.header(f'CV Score: {cv_score}')


if __name__ == '__main__':
    main()