Anupam007 committed on
Commit
5049b63
·
verified ·
1 Parent(s): 9dd0e86

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -161
app.py CHANGED
@@ -46,164 +46,6 @@ def initialize_model():
46
 
47
  model = initialize_model()
48
 
49
- # Function to extract text from a PDF resume
50
- def extract_resume_text(pdf_file_path):
51
- logging.info("Extracting resume text")
52
- try:
53
- with open(pdf_file_path, 'rb') as f:
54
- pdf_reader = PdfReader(f)
55
- text = ""
56
- for page in pdf_reader.pages:
57
- extracted = page.extract_text()
58
- if extracted:
59
- text += extracted
60
- if not text.strip():
61
- raise Exception("No text extracted from PDF. Ensure the PDF is not image-based.")
62
- logging.info(f"Extracted resume text (first 200 chars): {text[:200]}")
63
- return text
64
- except Exception as e:
65
- logging.error(f"Error extracting text from PDF: {str(e)}")
66
- raise Exception(f"Error extracting text from PDF: {str(e)}")
67
-
68
- # Function to parse resume and extract key information
69
- def parse_resume(resume_text):
70
- logging.info("Parsing resume")
71
- parsed_info = {
72
- "skills": [],
73
- "education": [],
74
- "experience": [],
75
- "personal_info": {},
76
- "react_experience": "0",
77
- "redux_experience": "0",
78
- "javascript_experience": "0",
79
- "education_details": [],
80
- "work_history": []
81
- }
82
-
83
- # Split resume into sections based on candidate headers
84
- candidate_pattern = r'(IM A\. SAMPLE [IVX]+)\s*'
85
- candidate_sections = re.split(candidate_pattern, resume_text, flags=re.IGNORECASE)
86
- candidates = []
87
- for i in range(1, len(candidate_sections), 2):
88
- candidates.append((candidate_sections[i], candidate_sections[i+1]))
89
-
90
- if not candidates:
91
- candidates = [("Unknown Candidate", resume_text)]
92
-
93
- candidate_name, candidate_text = candidates[0]
94
- parsed_info["personal_info"]["name"] = candidate_name.strip()
95
- logging.info(f"Parsed candidate name: {candidate_name}")
96
-
97
- # Extract email
98
- email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
99
- email_matches = re.findall(email_pattern, candidate_text, re.IGNORECASE)
100
- if email_matches:
101
- parsed_info["personal_info"]["email"] = email_matches[0]
102
- else:
103
- logging.warning("No email found in resume")
104
-
105
- # Extract phone number
106
- phone_pattern = r'\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}'
107
- phone_matches = re.findall(phone_pattern, candidate_text)
108
- if phone_matches:
109
- parsed_info["personal_info"]["phone"] = phone_matches[0]
110
- else:
111
- logging.warning("No phone number found in resume")
112
-
113
- # Extract address
114
- address_pattern = r'(\d+\s+[A-Za-z\s]+,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})'
115
- address_matches = re.findall(address_pattern, candidate_text, re.IGNORECASE)
116
- if address_matches:
117
- parsed_info["personal_info"]["address"] = address_matches[0]
118
- else:
119
- parsed_info["personal_info"]["address"] = "Not found"
120
- logging.warning("No address found in resume")
121
-
122
- # Extract skills (expanded list and more permissive matching)
123
- skill_keywords = [
124
- "python", "java", "javascript", "html", "css", "sql", "react",
125
- "node", "aws", "azure", "docker", "git", "c++", "visual basic",
126
- "perl", "asp", "php", "cobol", "xml", "asp.net", "quickbooks",
127
- "ms office", "ms access", "spss", "typescript", "angular", "vue",
128
- "mysql", "mongodb", "linux", "bash", "kubernetes", "jenkins"
129
- ]
130
- resume_lower = candidate_text.lower()
131
- for skill in skill_keywords:
132
- if skill.lower() in resume_lower or f"{skill.lower()} " in resume_lower:
133
- parsed_info["skills"].append(skill)
134
- if not parsed_info["skills"]:
135
- logging.warning("No skills extracted from resume")
136
-
137
- # Extract specific experience
138
- patterns = {
139
- "react_experience": r'(\d+)[\s\+]*(years?|yrs?)[\s\+]*(?:of)?[\s\+]*(?:experience)?[\s\+]*(?:with|in)?[\s\+]*React',
140
- "redux_experience": r'(\d+)[\s\+]*(years?|yrs?)[\s\+]*(?:of)?[\s\+]*(?:experience)?[\s\+]*(?:with|in)?[\s\+]*Redux',
141
- "javascript_experience": r'(\d+)[\s\+]*(years?|yrs?)[\s\+]*(?:of)?[\s\+]*(?:experience)?[\s\+]*(?:with|in)?[\s\+]*(?:JavaScript|JS)'
142
- }
143
-
144
- for key, pattern in patterns.items():
145
- matches = re.findall(pattern, candidate_text, re.IGNORECASE)
146
- ifर्म
147
-
148
- System: It looks like the provided code was cut off. I'll complete the `app.py` code, ensuring the fix for the `ImportError` related to `cached_download` by pinning compatible versions of `sentence-transformers` and `huggingface_hub` in the `setup_and_run` function. The rest of the code will remain consistent with the previous version, including the fix for the `IndentationError` (correcting `utput` to `output`). I'll also ensure the code is complete and properly formatted for use in a Hugging Face Space or similar environment.
149
-
150
- ### Explanation of Changes
151
- 1. **Pinned Dependencies**: In the `setup_and_run` function, I updated the `pip install` command to explicitly install `sentence-transformers==2.2.2` and `huggingface_hub==0.7.0`. These versions are compatible, as `huggingface_hub==0.7.0` still includes the `cached_download` function required by `sentence-transformers==2.2.2`.
152
- 2. **Retained Previous Fix**: The `format_results` function retains the correction from `utput` to `output` to prevent the `IndentationError`.
153
- 3. **Complete Code**: The code is provided in full to ensure no truncation occurs, covering all functions from your original `app.py`.
154
- 4. **Environment Considerations**: The code includes logic for running in Google Colab (e.g., `files.download`), but it should work in a Hugging Face Space with the pinned dependencies. If running outside Colab, you may need to adjust the `files.download` logic or mock it.
155
-
156
- ### Updated Code
157
-
158
- <xaiArtifact artifact_id="44e9cd70-9153-4e94-9962-aa9dfcd076ae" artifact_version_id="abe337a8-8ff0-4f13-bf78-329d64463346" title="app.py" contentType="text/python">
159
- import os
160
- import io
161
- import re
162
- import json
163
- import random
164
- import time
165
- import smtplib
166
- import requests
167
- import numpy as np
168
- import pandas as pd
169
- from email.mime.text import MIMEText
170
- from email.mime.multipart import MIMEMultipart
171
- from email.mime.application import MIMEApplication
172
- from datetime import datetime, timedelta
173
- from PyPDF2 import PdfReader
174
- from bs4 import BeautifulSoup
175
- from sentence_transformers import SentenceTransformer
176
- from sklearn.metrics.pairwise import cosine_similarity
177
- import torch
178
- import logging
179
- import gradio as gr
180
-
181
- # Set up logging
182
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
183
- log_file = os.path.join(os.getcwd(), "application_log.txt") # Relative path
184
- logging.getLogger().addHandler(logging.FileHandler(log_file))
185
-
186
- # Set up GPU if available
187
- if torch.cuda.is_available():
188
- device = torch.device("cuda")
189
- logging.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
190
- else:
191
- device = torch.device("cpu")
192
- logging.info("GPU not available, using CPU instead")
193
-
194
- # Initialize the sentence transformer model
195
- @torch.no_grad()
196
- def initialize_model():
197
- logging.info("Initializing sentence transformer model")
198
- try:
199
- model = SentenceTransformer('paraphrase-MiniLM-L6-v2', device=device)
200
- return model
201
- except Exception as e:
202
- logging.error(f"Failed to initialize model: {str(e)}")
203
- raise
204
-
205
- model = initialize_model()
206
-
207
  # Function to extract text from a PDF resume
208
  def extract_resume_text(pdf_file_path):
209
  logging.info("Extracting resume text")
@@ -317,7 +159,7 @@ def parse_resume(resume_text):
317
  if not parsed_info["education"]:
318
  logging.warning("No education details extracted from resume")
319
 
320
- # Extract experience periods
321
  experience_pattern = r'(?i)(\d{4})\s*(?:-|to)\s*(present|\d{4})'
322
  experience_matches = re.findall(experience_pattern, candidate_text)
323
  parsed_info["experience"] = [f"{start}-{end}" for start, end in experience_matches]
@@ -446,7 +288,7 @@ def calculate_match_score(resume_text, job_description):
446
  ])])
447
  if not skills_section:
448
  skills_section = resume_text.lower()
449
- logging.warning("No specific skills section found, using full resume text to match")
450
 
451
  resume_embedding = model.encode(skills_section, convert_to_tensor=True)
452
  job_embedding = model.encode(job_description, convert_to_tensor=True)
@@ -786,7 +628,7 @@ def format_results(results):
786
  if job.get("requires_form", False):
787
  output += f"- Form: {job.get('form_filename', 'Generated')}\n"
788
  if result["application_status"] == "error":
789
- output += f"- Error: {result['application_message']}\n"
790
  output += f"- Email: {job['email']}\n"
791
  output += f"- Description: {job['description']}\n"
792
  output += f"- Applied: {datetime.now().strftime('%Y-%m-%d')}\n\n"
 
46
 
47
  model = initialize_model()
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # Function to extract text from a PDF resume
50
  def extract_resume_text(pdf_file_path):
51
  logging.info("Extracting resume text")
 
159
  if not parsed_info["education"]:
160
  logging.warning("No education details extracted from resume")
161
 
162
+ # Compress experience periods
163
  experience_pattern = r'(?i)(\d{4})\s*(?:-|to)\s*(present|\d{4})'
164
  experience_matches = re.findall(experience_pattern, candidate_text)
165
  parsed_info["experience"] = [f"{start}-{end}" for start, end in experience_matches]
 
288
  ])])
289
  if not skills_section:
290
  skills_section = resume_text.lower()
291
+ logging.warning("No specific skills section found, using full resume text for matching")
292
 
293
  resume_embedding = model.encode(skills_section, convert_to_tensor=True)
294
  job_embedding = model.encode(job_description, convert_to_tensor=True)
 
628
  if job.get("requires_form", False):
629
  output += f"- Form: {job.get('form_filename', 'Generated')}\n"
630
  if result["application_status"] == "error":
631
+ output += f"- Errorendan: {result['application_message']}\n"
632
  output += f"- Email: {job['email']}\n"
633
  output += f"- Description: {job['description']}\n"
634
  output += f"- Applied: {datetime.now().strftime('%Y-%m-%d')}\n\n"