Spaces:
Sleeping
Sleeping
| import logging | |
| from typing import Optional | |
| from pydantic import BaseModel, Field | |
| from langchain.prompts import ChatPromptTemplate | |
| from langchain_groq import ChatGroq | |
| import os | |
| import tempfile | |
| import streamlit as st | |
| from langchain_community.document_loaders import PDFPlumberLoader, TextLoader | |
# Logging setup: request DEBUG-and-above output formatted as
# "<timestamp> - <level> - <message>", then create this module's logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)
# Structured-output schema describing a single CV.
# Every field is optional and defaults to None, so an attribute that is not
# present in the source text can simply be left unset.
class cv(BaseModel):
    # Candidate's name.
    name: Optional[str] = Field(
        default=None,
        description="Name of candidate",
    )
    # Skills listed for the candidate.
    skills: Optional[list[str]] = Field(
        default=None,
        description="Skills of candidate",
    )
    # Certifications / certificates held by the candidate.
    certifications: Optional[list[str]] = Field(
        default=None,
        description="Certificates of candidate",
    )
    # Whole number of years of experience.
    years_of_exp: Optional[int] = Field(
        default=None,
        description="Years of experience",
    )
# Top-level structured-output schema: a container holding the list of CVs
# extracted from one input text.
class data(BaseModel):
    # One `cv` entry per candidate; required (no default value).
    candidates: list[cv]
# System instructions for the extraction model. The pieces are adjacent string
# literals, which Python concatenates with NO separator, so every bullet and
# sentence carries its own trailing newline or space to keep them from fusing.
_CV_EXTRACTION_SYSTEM_PROMPT = (
    "You are an expert extraction algorithm. Your job is to extract the following specific information from the given text:\n"
    "- Name of the candidate\n"
    "- Skills\n"
    "- Certifications (Look for terms such as 'Certified,' 'Certification,' 'Certificate')\n"
    "- years_of_exp (Extract only the number of years. If an approximation is given (e.g., '5+ years'), return the lower bound (e.g., '5').)\n"
    "If you cannot find the value for a specific attribute, return null for that attribute's value.\n"
    "The 'years of experience' can be mentioned in various formats (e.g., '5+ years', '5 years', 'since 2010'). "
    "Extract it accurately, even if it's mentioned in different contexts like a professional summary or work experience. "
    "If multiple jobs are listed, you can calculate the experience from the work history.\n"
    "Certifications are usually found under headers like 'Certifications,' 'Professional Certificates,' or similar. They might include phrases like 'AWS Certified Developer,' 'MongoDB Developer Associate,' etc."
)


def create_prompt_template() -> ChatPromptTemplate:
    """Create the prompt template for CV extraction.

    Returns:
        A chat prompt with a fixed system message describing what to extract
        and a human message that is filled from the ``text`` input variable.
    """
    logger.info("Creating the prompt template for CV extraction")
    return ChatPromptTemplate.from_messages(
        [
            ("system", _CV_EXTRACTION_SYSTEM_PROMPT),
            ("human", "{text}"),
        ]
    )
def initialize_llm() -> ChatGroq:
    """Initialize the language model.

    The API key is taken from the ``GROQ_API_KEY`` environment variable and,
    when that is unset or empty, from Streamlit secrets.

    Returns:
        A ``ChatGroq`` client for the ``llama-3.3-70b-versatile`` model with
        temperature 0.6.

    Raises:
        ValueError: If no non-empty API key is found in either location.
    """
    logger.info("Initializing LLM")

    groq_api_key = os.getenv("GROQ_API_KEY")
    # Treat an empty environment variable like a missing one, so the Streamlit
    # secrets fallback still gets a chance before we give up.
    if not groq_api_key:
        try:
            groq_api_key = st.secrets["GROQ_API_KEY"]
        except Exception:
            # Deliberately broad: the lookup may fail in more than one way
            # (e.g. missing key or missing secrets file). Keep the cause in the
            # debug log; the ValueError below reports the failure to callers.
            logger.debug(
                "GROQ_API_KEY could not be read from Streamlit secrets",
                exc_info=True,
            )
            st.error("GROQ_API_KEY is not set in the environment variables or Streamlit secrets.")
            groq_api_key = None

    if not groq_api_key:
        logger.error("GROQ_API_KEY is not set")
        raise ValueError("GROQ_API_KEY environment variable is missing.")

    return ChatGroq(
        groq_api_key=groq_api_key,
        model_name="llama-3.3-70b-versatile",
        temperature=0.6,
    )
def extract_cv_data(text: str) -> list[cv]:
    """Extract data from the text using the language model.

    Args:
        text: Raw CV text to analyse.

    Returns:
        The list of candidates found in ``text``; an empty list when the
        chain produces no structured result.
    """
    logger.info("Extracting CV data from text")
    prompt = create_prompt_template()
    llm = initialize_llm()

    # Chain the prompt into the model, constraining its output to the `data` schema.
    runnable = prompt | llm.with_structured_output(schema=data)
    response = runnable.invoke({"text": text})

    # NOTE(review): the structured-output parser can yield None when the model
    # reply contains nothing parsable; guard so we do not fail on `.candidates`.
    if response is None:
        logger.warning("Model returned no structured output; no candidates extracted")
        return []

    candidates = response.candidates
    logger.info("Extracted %d candidate(s) from the text", len(candidates))
    return candidates
def process_file(uploaded_files) -> str:
    """Process the uploaded file and return the text.

    The upload is written to a temporary file (keeping its extension), loaded
    with ``PDFPlumberLoader`` for ``.pdf`` files (any letter case) or
    ``TextLoader`` otherwise, and the temporary file is always removed.

    Args:
        uploaded_files: A single uploaded file object exposing ``.name`` and
            ``.getvalue()``.

    Returns:
        The page contents of all loaded documents joined with single spaces.
    """
    logger.info("Processing file: %s", uploaded_files.name)

    suffix = os.path.splitext(uploaded_files.name)[1]
    # delete=False: the loaders reopen the file by path after it is closed;
    # removal is handled by the `finally` clause below.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    tmp_path = tmp_file.name
    try:
        # Writing happens inside the try so a failed write still cleans up.
        with tmp_file:
            tmp_file.write(uploaded_files.getvalue())

        # Compare case-insensitively so "CV.PDF" is handled as a PDF too.
        if suffix.lower() == '.pdf':
            loader = PDFPlumberLoader(tmp_path)
            logger.info("Loaded PDF file: %s", tmp_path)
        else:
            loader = TextLoader(tmp_path)
            logger.info("Loaded text file: %s", tmp_path)

        documents = loader.load()
        text_content = " ".join(doc.page_content for doc in documents)
        logger.info("Extracted text from file: %s", uploaded_files.name)
        return text_content
    finally:
        logger.info("Deleting temporary file: %s", tmp_path)
        os.unlink(tmp_path)
def display_candidates_info(candidates_list: list[cv]) -> None:
    """Display the extracted candidates' information in a table.

    Args:
        candidates_list: Candidates to render, one table row each.
    """
    logger.info("Displaying information for %d candidate(s)", len(candidates_list))
    logger.debug("Candidate list: %s", candidates_list)

    # Named `rows` rather than `data` to avoid shadowing the module-level
    # `data` schema class.
    rows = [
        {
            "Name": candidate.name,
            "Skills": ", ".join(candidate.skills) if candidate.skills else 'None',
            "Certifications": ", ".join(candidate.certifications) if candidate.certifications else 'None',
            # Explicit None check: 0 years is a real value and must not be
            # rendered as 'None'.
            "Years of Experience": candidate.years_of_exp if candidate.years_of_exp is not None else 'None',
        }
        for candidate in candidates_list
    ]

    st.write("### Candidates Information")
    st.table(rows)
    logger.debug("Displayed candidates' information in table")
| # print(candidates_list) | |
| # Uncomment the block below to try the extraction end to end | |
| # Streamlit file uploader and extraction logic | |
| # uploaded_files = st.file_uploader(" Upload the CV: ", type=['pdf', 'txt'],key="unique_cv_upload") | |
| # if uploaded_files is not None: | |
| # text = process_file(uploaded_files) | |
| # # text = ep.text | |
| # candidates_list = extract_cv_data(text) | |
| # display_candidates_info(candidates_list) | |