Spaces:

SID2702
/

CV_Process

Sleeping

CV_Process / extraction.py

bsiddhharth

added try except to groq_api_key

45fe6a4 over 1 year ago

5.97 kB

	import logging
	from typing import Optional
	from pydantic import BaseModel, Field
	from langchain.prompts import ChatPromptTemplate
	from langchain_groq import ChatGroq
	import os
	import tempfile
	import streamlit as st
	from langchain_community.document_loaders import PDFPlumberLoader, TextLoader


	# Logging setup, executed when this module is imported: the root logger is
	# configured at DEBUG level with records formatted as
	# "<timestamp> - <level> - <message>".
	logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
	# Module-scoped logger used by the functions defined below.
	logger = logging.getLogger(__name__)

	# Schema for a single candidate's CV, declared with Pydantic for structured output.
	# Every field is optional and defaults to None when it is not supplied.
	# NOTE(review): documented with `#` comments rather than a class docstring on
	# purpose -- presumably a docstring would be surfaced in the schema generated
	# from this model; confirm before converting these comments.
	class cv(BaseModel):
	    # Candidate's name.
	    name: Optional[str] = Field(default=None, description="Name of candidate")
	    # Candidate's skills, one string per skill.
	    skills: Optional[list[str]] = Field(default=None, description="Skills of candidate")
	    # Candidate's certifications, one string per certificate.
	    certifications: Optional[list[str]] = Field(default=None, description="Certificates of candidate")
	    # Whole number of years of experience.
	    years_of_exp: Optional[int] = Field(default=None, description="Years of experience")

	# Top-level extraction result: a wrapper model holding a list of `cv` entries.
	# It is the schema passed to `with_structured_output` in `extract_cv_data`.
	class data(BaseModel):
	    # Required field (no default is declared): the extracted candidates.
	    candidates: list[cv]

	def create_prompt_template() -> ChatPromptTemplate:
	    """Create the prompt template for CV extraction.

	    The template has two messages: a fixed system instruction describing which
	    attributes to extract, and a human message whose ``{text}`` placeholder is
	    filled with the CV text at invocation time.

	    Returns:
	        The ``ChatPromptTemplate`` built from those two messages.
	    """
	    logger.info("Creating the prompt template for CV extraction")

	    # Adjacent string literals are concatenated with nothing in between, so each
	    # bullet and each paragraph needs an explicit separator; without them the
	    # model receives "...given text:- Name of the candidate- Skills- ...".
	    system_prompt = (
	        "You are an expert extraction algorithm. Your job is to extract the following specific information from the given text:\n"
	        "- Name of the candidate\n"
	        "- Skills\n"
	        "- Certifications (Look for terms such as 'Certified,' 'Certification,' 'Certificate')\n"
	        "- years_of_exp (Extract only the number of years. If an approximation is given (e.g., '5+ years'), return the lower bound (e.g., '5').)\n"
	        "If you cannot find the value for a specific attribute, return null for that attribute's value.\n"
	        "The 'years of experience' can be mentioned in various formats (e.g., '5+ years', '5 years', 'since 2010'). "
	        "Extract it accurately, even if it's mentioned in different contexts like a professional summary or work experience. "
	        "If multiple jobs are listed, you can calculate the experience from the work history.\n"
	        "Certifications are usually found under headers like 'Certifications,' 'Professional Certificates,' or similar. They might include phrases like 'AWS Certified Developer,' 'MongoDB Developer Associate,' etc."
	    )

	    # Only the human message carries a placeholder; the system text has none.
	    return ChatPromptTemplate.from_messages(
	        [
	            ("system", system_prompt),
	            ("human", "{text}"),
	        ]
	    )

	def initialize_llm() -> ChatGroq:
	    """Initialize the Groq chat model used for extraction.

	    The API key is read from the ``GROQ_API_KEY`` environment variable first
	    and, when that is unset or empty, from Streamlit secrets.

	    Returns:
	        A ``ChatGroq`` instance for the ``llama-3.3-70b-versatile`` model with
	        temperature 0.6.

	    Raises:
	        ValueError: If no API key is found in either location.
	    """
	    logger.info("Initializing LLM")

	    groq_api_key = os.getenv("GROQ_API_KEY")
	    if not groq_api_key:
	        # Unset *or* empty: an empty variable is as unusable as a missing one,
	        # so it must not block the fallback to Streamlit secrets.
	        try:
	            groq_api_key = st.secrets["GROQ_API_KEY"]
	        except Exception:
	            # Best-effort lookup, kept broad deliberately. NOTE(review): the
	            # exception raised for a missing secrets file or key presumably
	            # differs between Streamlit versions -- confirm before narrowing.
	            st.error("GROQ_API_KEY is not set in the environment variables or Streamlit secrets.")
	            groq_api_key = None

	    if not groq_api_key:
	        logger.error("GROQ_API_KEY is not set")
	        raise ValueError("GROQ_API_KEY environment variable is missing.")

	    return ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.3-70b-versatile", temperature=0.6)


	def extract_cv_data(text: str) -> list[cv]:
	    """Extract candidate data from CV text using the language model.

	    Builds the extraction prompt, pipes it into the model constrained to the
	    ``data`` schema, and invokes the chain on ``text``.

	    Args:
	        text: Raw text of one or more CVs.

	    Returns:
	        The list of extracted ``cv`` entries; an empty list when the model
	        returns no structured output.

	    Raises:
	        ValueError: Propagated from ``initialize_llm`` when no API key is set.
	    """
	    logger.info("Extracting CV data from text")

	    prompt = create_prompt_template()
	    llm = initialize_llm()

	    # Chain the prompt into the model; the output is structured by the `data` schema.
	    runnable = prompt | llm.with_structured_output(schema=data)
	    response = runnable.invoke({"text": text})

	    # NOTE(review): the structured-output chain can presumably yield None when
	    # the model produces no usable structured result; guard so callers get an
	    # empty list instead of an AttributeError on `.candidates`.
	    if response is None:
	        logger.warning("Model returned no structured output; no candidates extracted")
	        return []

	    logger.info(f"Extracted {len(response.candidates)} candidate(s) from the text")

	    return response.candidates

	def process_file(uploaded_files) -> str:
	    """Process the uploaded file and return its text.

	    The upload is copied to a temporary file that keeps the original
	    extension, loaded with ``PDFPlumberLoader`` for ``.pdf`` files (matched
	    case-insensitively) or ``TextLoader`` for anything else, and the temporary
	    file is removed afterwards whether or not loading succeeded.

	    Args:
	        uploaded_files: A single uploaded file exposing ``.name`` and
	            ``.getvalue()`` (presumably a Streamlit upload object -- confirm
	            against the caller).

	    Returns:
	        The ``page_content`` of every loaded document, joined by single spaces.
	    """
	    logger.info(f"Processing file: {uploaded_files.name}")

	    suffix = os.path.splitext(uploaded_files.name)[1]
	    tmp_path = None
	    try:
	        # delete=False so the file outlives the `with` and can be reopened by
	        # the loader; it is removed explicitly in the `finally` below. Creating
	        # it inside the `try` ensures a failed write does not leak the file.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
	            tmp_path = tmp_file.name
	            tmp_file.write(uploaded_files.getvalue())

	        # Compare the extension case-insensitively so "CV.PDF" is treated as a PDF.
	        if suffix.lower() == '.pdf':
	            loader = PDFPlumberLoader(tmp_path)
	            logger.info(f"Loaded PDF file: {tmp_path}")
	        else:
	            loader = TextLoader(tmp_path)
	            logger.info(f"Loaded text file: {tmp_path}")

	        documents = loader.load()
	        text_content = " ".join(doc.page_content for doc in documents)
	        logger.info(f"Extracted text from file: {uploaded_files.name}")
	        return text_content

	    finally:
	        # tmp_path stays None only if the temporary file was never created.
	        if tmp_path is not None:
	            logger.info(f"Deleting temporary file: {tmp_path}")
	            os.unlink(tmp_path)

	def display_candidates_info(candidates_list: list[cv]):
	    """Display the extracted candidates' information in a table.

	    Each candidate becomes one row with the columns "Name", "Skills",
	    "Certifications" and "Years of Experience". List fields are joined with
	    ", "; missing skills, certifications or years are shown as the string
	    'None'.

	    Args:
	        candidates_list: The candidates to render.
	    """
	    logger.info(f"Displaying information for {len(candidates_list)} candidate(s)")
	    logger.debug(f"Candidate list: {candidates_list}")

	    # Named `rows` so the local does not shadow the module-level `data` model.
	    rows = [
	        {
	            "Name": candidate.name,
	            "Skills": ", ".join(candidate.skills) if candidate.skills else 'None',
	            "Certifications": ", ".join(candidate.certifications) if candidate.certifications else 'None',
	            # Explicit None check: 0 years is a real value and must not be
	            # rendered as 'None'.
	            "Years of Experience": candidate.years_of_exp if candidate.years_of_exp is not None else 'None',
	        }
	        for candidate in candidates_list
	    ]

	    st.write("### Candidates Information")
	    st.table(rows)
	    logger.debug("Displayed candidates' information in table")

	# To see the extraction working end to end, uncomment the block below.
	# Streamlit file uploader and extraction logic
	# uploaded_files = st.file_uploader(" Upload the CV: ", type=['pdf', 'txt'],key="unique_cv_upload")
	# if uploaded_files is not None:
	# text = process_file(uploaded_files)
	# # text = ep.text
	# candidates_list = extract_cv_data(text)
	# display_candidates_info(candidates_list)