Spaces:
Sleeping
Sleeping
| import os | |
| import tempfile | |
| import fitz | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from dotenv import load_dotenv | |
| import google.generativeai as genai | |
# Read key/value pairs from a local .env file into the process environment
# before any of them are looked up below.
load_dotenv()

# Hand the Gemini client its API key; ``None`` is passed through unchanged
# when GOOGLE_API_KEY is not set.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
def extract_text_from_pdf(pdf_content):
    """
    Extracts text content from a PDF file.

    The bytes are written to a temporary file, opened with PyMuPDF (``fitz``),
    and the text of every page is concatenated in page order. The temporary
    file is removed and the document closed even when parsing fails.

    Parameters:
    - pdf_content (bytes): Bytes-like object containing the content of the PDF file.

    Returns:
    - str: Extracted text content from the PDF file, with non-breaking
      spaces (U+00A0) removed.
    """
    # delete=False: the file has to be closed (and therefore flushed) before
    # fitz reopens it by path, so its removal is handled manually below.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_path = temp_file.name
    try:
        with temp_file:
            temp_file.write(pdf_content)

        pdf_document = fitz.open(temp_path)
        try:
            page_texts = [
                pdf_document[page_number].get_text()
                for page_number in range(pdf_document.page_count)
            ]
        finally:
            # Release the file handle before the file is deleted.
            pdf_document.close()
    finally:
        # Runs on success and on failure so no temp files pile up.
        os.remove(temp_path)

    return "".join(page_texts).replace("\xa0", "")
def generate_gemini_content(transcript_text):
    """
    Summarizes a CV with Google's Gemini Pro model.

    A fixed instruction prompt is prepended to the supplied text and the
    combined string is sent to the model in a single request.

    Parameters:
    - transcript_text (str): CV text to be summarized.

    Returns:
    - str: The text of the model's response.
    """
    instructions = """
Instructions:
Please provide a concise summary of your relevant experience, skills,
and qualifications in the field of programming and technology.
Highlight your practical experience, technological proficiencies, technical skills, soft skills,
proficiency in programming languages and frameworks, as well as any other skills relevant to programming fields.
Additionally, include your location of residence and any other relevant details related to the programming industry
to facilitate accurate matching with job descriptions.
Example summary:
"Experienced software engineer with proficiency in Python, JavaScript, and Java.
Skilled in developing web applications using React.js and Django frameworks.
Strong problem-solving and communication skills. Located in New York City,
seeking opportunities in full-stack development to leverage my skills and contribute to innovative projects."
CV is :
"""
    full_prompt = instructions + transcript_text
    gemini = genai.GenerativeModel("gemini-pro")
    reply = gemini.generate_content(full_prompt)
    return reply.text
def git_indices(data, cv_vect, df_vect):
    """
    Ranks job descriptions by cosine similarity to a single CV vector.

    Parameters:
    - data (str): Input data. Not used in the computation; kept so existing
      callers that pass it (positionally or by keyword) keep working.
    - cv_vect (numpy.ndarray or scipy sparse matrix): Vector representation of
      the input data, with the CV in its first row.
    - df_vect (scipy.sparse.csr_matrix): Vector representations of job descriptions.

    Returns:
    - numpy.ndarray: Indices of job descriptions sorted in descending order of similarity.
    """
    # Slice rather than index: ``[:1]`` keeps the first row two-dimensional for
    # dense arrays as well as sparse matrices, which cosine_similarity requires.
    cv_row = cv_vect[:1]
    similarities = cosine_similarity(cv_row, df_vect).flatten()
    # argsort is ascending, so reverse it to put the best match first.
    return np.argsort(similarities)[::-1]
def fit_data(csv_path: str):
    """
    Reads job description data from a CSV file and creates TF-IDF vectors.

    The text in the ``concatenated_column`` column is used to fit the
    vectorizer; that column is then absent from the returned DataFrame.

    Parameters:
    - csv_path (str): Path to the CSV file containing job descriptions.

    Returns:
    - pandas.DataFrame: The CSV contents without ``concatenated_column``.
    - sklearn.feature_extraction.text.TfidfVectorizer: Fitted TF-IDF vectorizer
      (English stop words removed).
    - scipy.sparse.csr_matrix: TF-IDF vectors of the job descriptions, one row
      per DataFrame row.

    Raises:
    - KeyError: If the CSV has no ``concatenated_column`` column.
    """
    df = pd.read_csv(csv_path)

    # pop() returns the text column and removes it from the frame in one step.
    # Empty CSV cells are read as NaN, which TfidfVectorizer rejects, so they
    # are treated as empty documents instead.
    documents = df.pop("concatenated_column").fillna("")

    vectorizer = TfidfVectorizer(stop_words='english')
    # Learn the vocabulary and vectorize the corpus in a single pass.
    df_vect = vectorizer.fit_transform(documents)
    return df, vectorizer, df_vect
# Load the job corpus and fit the vectorizer once, at import time. The CSV is
# looked up in the current working directory, not next to this module.
df, vectorizer, df_vect = fit_data(csv_path=os.path.join(os.getcwd(), "all.csv"))
def git_most_similar_job(cv_summarize: str, number_of_jobs: int):
    """
    Returns the job rows that best match a CV summary.

    The summary is vectorized with the module-level TF-IDF vectorizer, ranked
    against the module-level job vectors, and the top rows are taken from the
    module-level DataFrame.

    Parameters:
    - cv_summarize (str): Summary of the CV.
    - number_of_jobs (int): Number of similar job descriptions to return.

    Returns:
    - pandas.DataFrame: Rows of the job DataFrame, most similar first.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    summary_vector = vectorizer.transform([cv_summarize])
    ranking = git_indices(data=cv_summarize, cv_vect=summary_vector, df_vect=df_vect)
    top_positions = ranking[:number_of_jobs]
    best_matches = df.iloc[top_positions]
    # Progress marker written to stdout once the lookup is complete.
    print("ALL Done \n\n")
    return best_matches