intelliCV

Sleeping

Arghya Ghosh

Update app.py

9a16da9 verified 11 months ago

9.04 kB

	import streamlit as st
	import pickle
	import docx
	import PyPDF2
	import re
	from huggingface_hub import hf_hub_download
	from io import BytesIO

	# Browser-tab title, icon, and layout settings for the app page.
	_PAGE_SETTINGS = {
	    "page_title": "IntelliCV: AI Resume Analyzer",
	    "page_icon": "📄",
	    "layout": "centered",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**_PAGE_SETTINGS)

	# Inline stylesheet: the result card, the category label, and spinner centering.
	_CUSTOM_CSS = """
	<style>
	.result-card {
	padding: 20px;
	border-radius: 10px;
	background-color: #f0f2f6;
	margin-bottom: 20px;
	}
	.category {
	font-size: 24px;
	font-weight: bold;
	color: #2e86c1;
	}
	.stSpinner > div {
	text-align: center;
	align-items: center;
	}
	</style>
	"""
	st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

	# Cache model loading with improved error handling
	@st.cache_resource(show_spinner="Loading AI models...")
	def _load_models_cached():
	    """Download and unpickle the classifier, TF-IDF vectorizer, and label encoder.

	    Only a successful load is cached: any exception propagates to the caller,
	    so a transient download failure is retried on the next run instead of
	    being memoized as a permanent failure.

	    Returns:
	        Tuple ``(svc_model, tfidf, le)`` in that order.
	    """
	    repo_id = "psychomita/intellicv-models"
	    artifacts = []
	    for filename in ("clf.pkl", "tfidf.pkl", "encoder.pkl"):
	        path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
	        # SECURITY: pickle.load can execute arbitrary code from the file.
	        # Only keep this if the model repository above is fully trusted.
	        with open(path, 'rb') as fh:
	            artifacts.append(pickle.load(fh))
	    return tuple(artifacts)


	def load_models():
	    """Return ``(svc_model, tfidf, le)``, or ``(None, None, None)`` on failure.

	    Any error raised while downloading or unpickling is reported with
	    ``st.error`` and turned into the all-``None`` triple.
	    """
	    try:
	        svc_model, tfidf, le = _load_models_cached()
	    except Exception as e:
	        st.error(f"Failed to load models: {str(e)}")
	        return None, None, None
	    return svc_model, tfidf, le


	# Load models
	svc_model, tfidf, le = load_models()

	def cleanResume(txt):
	    """Strip URLs, e-mails, social-media tokens, punctuation and numbers from text.

	    Args:
	        txt: Raw resume text. Any non-string value is treated as empty.

	    Returns:
	        The cleaned text with runs of whitespace collapsed to single spaces
	        and no leading/trailing whitespace; ``""`` for non-string input.
	    """
	    if not isinstance(txt, str):
	        return ""

	    # Order matters: URLs and e-mail addresses must be removed while their
	    # punctuation ('@', '.', '/') is still present, and whitespace is
	    # collapsed last so earlier substitutions cannot leave space runs behind.
	    clean_patterns = [
	        (r'http\S+|www\S+', ' '),  # URLs (http\S+ also covers https)
	        (r'\b[\w\.-]+@[\w\.-]+\.\w+\b', ' '),  # Email addresses
	        (r'\bRT\b|\bretweet\b', ' '),  # Retweets
	        (r'#\S+', ' '),  # Hashtags
	        (r'@\S+', ' '),  # Mentions
	        (r'[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' '),  # Punctuation
	        (r'[^\x00-\x7f]', ' '),  # Non-ASCII chars
	        (r'\b\d+\b', ' '),  # Standalone numbers
	        (r'\s+', ' '),  # Extra whitespace
	    ]

	    cleanText = txt
	    for pattern, repl in clean_patterns:
	        cleanText = re.sub(pattern, repl, cleanText)

	    return cleanText.strip()

	def extract_text_from_pdf(file):
	    """Return the text of every page of a PDF, joined with single spaces.

	    A page whose extraction fails is skipped with a Streamlit warning.
	    Any other failure is re-raised as ``ValueError``.
	    """
	    try:
	        reader = PyPDF2.PdfReader(BytesIO(file.read()))
	        chunks = []
	        for pdf_page in reader.pages:
	            try:
	                # extract_text() may yield nothing; normalise that to ''.
	                chunks.append(pdf_page.extract_text() or '')
	            except Exception as page_err:
	                st.warning(f"Could not extract text from one page: {str(page_err)}")
	        return ' '.join(chunks)
	    except Exception as read_err:
	        raise ValueError(f"Failed to read PDF file: {str(read_err)}")

	def extract_text_from_docx(file):
	    """Return the non-blank paragraphs of a DOCX file, one per line.

	    Any failure while opening or reading the document is re-raised as
	    ``ValueError``.
	    """
	    try:
	        document = docx.Document(BytesIO(file.read()))
	        non_blank = [para.text for para in document.paragraphs if para.text.strip()]
	        return '\n'.join(non_blank)
	    except Exception as err:
	        raise ValueError(f"Failed to read DOCX file: {str(err)}")

	def extract_text_from_txt(file):
	    """Decode an uploaded text file, trying several encodings in turn.

	    The bytes are read once. UTF-8 is tried first ('utf-8-sig' also drops a
	    leading byte-order mark), then Windows-1252, then Latin-1. Latin-1 maps
	    every byte, so it must come last or the encodings after it are never
	    reached.

	    Args:
	        file: Binary file-like object; its ``read()`` must return ``bytes``.

	    Returns:
	        The decoded text.

	    Raises:
	        ValueError: If no encoding can decode the content (kept as a
	            safeguard; the Latin-1 fallback accepts any byte sequence).
	    """
	    raw = file.read()
	    for encoding in ('utf-8-sig', 'windows-1252', 'latin-1'):
	        try:
	            return raw.decode(encoding)
	        except UnicodeDecodeError:
	            continue
	    raise ValueError("Failed to decode text file with common encodings")

	def handle_file_upload(uploaded_file):
	    """Extract text from an uploaded file, chosen by its filename extension.

	    Returns ``None`` when no file is given. An unsupported extension or any
	    extraction failure is raised as ``ValueError``.
	    """
	    if not uploaded_file:
	        return None

	    suffix = uploaded_file.name.split('.')[-1].lower()
	    # Lambdas defer both lookup and call of the extractor until it is chosen.
	    extractors = {
	        'pdf': lambda: extract_text_from_pdf(uploaded_file),
	        'docx': lambda: extract_text_from_docx(uploaded_file),
	        'txt': lambda: extract_text_from_txt(uploaded_file),
	    }
	    try:
	        extract = extractors.get(suffix)
	        if extract is None:
	            raise ValueError("Unsupported file type. Please upload PDF, DOCX, or TXT.")
	        return extract()
	    except Exception as err:
	        raise ValueError(f"Error processing file: {str(err)}")

	def predict_category(input_resume):
	    """Predict the job category label for a resume text.

	    Returns "Unknown" for empty or non-string input, "Unknown (insufficient
	    text)" when cleaning leaves nothing, and "Unknown (prediction failed)"
	    after reporting the error if any step raises.
	    """
	    if not (input_resume and isinstance(input_resume, str)):
	        return "Unknown"

	    try:
	        text = cleanResume(input_resume)
	        if not text.strip():
	            return "Unknown (insufficient text)"

	        # Vectorize, classify, then map the encoded label back to its name.
	        features = tfidf.transform([text]).toarray()
	        encoded_label = svc_model.predict(features)
	        decoded_labels = le.inverse_transform(encoded_label)
	        return decoded_labels[0]
	    except Exception as err:
	        st.error(f"Prediction error: {str(err)}")
	        return "Unknown (prediction failed)"

	def display_results(resume_text, category):
	    """Render the predicted category card and text statistics.

	    Args:
	        resume_text: Raw text extracted from the uploaded resume.
	        category: Category label to display; labels without a known emoji
	            fall back to a magnifying-glass icon.
	    """
	    st.subheader("Analysis Results")

	    # Category display with emoji
	    category_emojis = {
	        "Data Science": "📊",
	        "HR": "👥",
	        "Design": "🎨",
	        "Information Technology": "💻",
	        "Education": "📚",
	        "Business Development": "📈",
	        "Marketing": "📢",
	        "Sales": "💰",
	        "Health and Fitness": "💪",
	        "Engineering": "⚙️",
	    }

	    emoji = category_emojis.get(category, "🔍")
	    # NOTE(review): category is interpolated into raw HTML unescaped; escape
	    # it if it can ever come from anything but the model's label set.
	    st.markdown(f"""
	<div class="result-card">
	<h3>Predicted Job Category</h3>
	<p class="category">{emoji} {category}</p>
	</div>
	""", unsafe_allow_html=True)

	    # Text analysis section
	    with st.expander("Text Analysis Details"):
	        st.markdown("Cleaned Text Excerpt:")
	        cleaned_text = cleanResume(resume_text)
	        if len(cleaned_text) > 500:
	            excerpt = cleaned_text[:500] + "..."
	        else:
	            excerpt = cleaned_text
	        st.text(excerpt)

	        # Empty input means nothing was removed: report 0%, not 100%.
	        if resume_text:
	            reduction = 100 - len(cleaned_text) / len(resume_text) * 100
	        else:
	            reduction = 0.0

	        st.markdown("Statistics:")
	        col1, col2, col3 = st.columns(3)
	        col1.metric("Original Length", f"{len(resume_text):,} chars")
	        col2.metric("Cleaned Length", f"{len(cleaned_text):,} chars")
	        col3.metric("Reduction", f"{reduction:.1f}%")

	def main():
	    """Render the page: intro, sidebar, uploader, and analysis results.

	    For an uploaded file of at most 10MB, the text is extracted, classified,
	    and displayed, and the cleaned text is offered as a download. Any error
	    along the way is shown to the user instead of being raised.
	    """
	    st.title("📄 IntelliCV: AI-Powered Resume Analyzer")
	    st.markdown("""
	Upload your resume and discover which job category it best matches with our AI analysis.
	Supported formats: PDF, DOCX, and TXT.
	""")

	    # Sidebar with additional info
	    with st.sidebar:
	        st.header("About")
	        st.markdown("""
	IntelliCV uses machine learning to analyze resume content and predict the most suitable job category.

	How it works:
	1. Upload your resume
	2. AI extracts and cleans the text
	3. Our model predicts the job category
	4. View detailed analysis
	""")

	        st.markdown("---")
	        st.markdown("""
	Common Categories:
	- Data Science
	- HR
	- Design
	- Information Technology
	- Education
	- And more...
	""")

	    # File upload section
	    uploaded_file = st.file_uploader(
	        "Upload your resume",
	        type=["pdf", "docx", "txt"],
	        help="Supported formats: PDF, DOCX, TXT (max 10MB)"
	    )

	    if uploaded_file is not None:
	        # Check file size
	        if uploaded_file.size > 10 * 1024 * 1024:  # 10MB limit
	            st.error("File size exceeds 10MB limit. Please upload a smaller file.")
	            return

	        with st.spinner("🔍 Analyzing your resume..."):
	            try:
	                resume_text = handle_file_upload(uploaded_file)
	                if not resume_text.strip():
	                    st.warning("The uploaded file appears to be empty or couldn't be read properly.")
	                    return

	                category = predict_category(resume_text)
	                display_results(resume_text, category)

	                # Download button for cleaned text: ship the cleaned text,
	                # matching the button label and file name.
	                st.download_button(
	                    label="Download Cleaned Text",
	                    data=cleanResume(resume_text),
	                    file_name="cleaned_resume.txt",
	                    mime="text/plain"
	                )

	            except Exception as e:
	                st.error(f"An error occurred: {str(e)}")
	                st.info("Please try another file or check the format.")

	if __name__ == "__main__":
	    # load_models() signals failure with None placeholders, so compare
	    # against None explicitly: a loaded model that defines __len__ or
	    # __bool__ could otherwise be misread as a failed load.
	    if all(model is not None for model in (svc_model, tfidf, le)):
	        main()
	    else:
	        st.error("Failed to load required models. Please try again later.")