intelliCV / app.py
Arghya Ghosh
Update app.py
9a16da9 verified
import streamlit as st
import pickle
import docx
import PyPDF2
import re
from huggingface_hub import hf_hub_download
from io import BytesIO
# Set page configuration: browser-tab title and icon, layout and sidebar state.
st.set_page_config(
    page_title="IntelliCV: AI Resume Analyzer",
    # U+1F4C4 (page facing up). Written as an escape so the icon survives
    # re-encoding of this source file; the literal had degraded to mojibake.
    page_icon="\U0001F4C4",
    layout="centered",
    initial_sidebar_state="expanded",
)
# Inline stylesheet: the result card container, the predicted-category label
# and centring of the spinner text.
_PAGE_CSS = """
<style>
.result-card {
padding: 20px;
border-radius: 10px;
background-color: #f0f2f6;
margin-bottom: 20px;
}
.category {
font-size: 24px;
font-weight: bold;
color: #2e86c1;
}
.stSpinner > div {
text-align: center;
align-items: center;
}
</style>
"""
# Emitted through st.markdown with HTML enabled so the <style> element is
# passed to the page as markup.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
def _read_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file."""
    # NOTE(security): unpickling executes arbitrary code embedded in the file.
    # These artifacts are downloaded from the Hugging Face Hub, so the
    # repository they come from must be trusted.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Only the successful path is cached: this helper raises on failure instead of
# returning a placeholder, so a transient download error is not stored as the
# cached result and the next run can try again.
@st.cache_resource(show_spinner="Loading AI models...")
def _download_and_load_models():
    """Download the three model artifacts and return them unpickled.

    Returns:
        Tuple ``(classifier, tfidf_vectorizer, label_encoder)``.

    Raises:
        Exception: whatever the download or unpickling step raises.
    """
    repo_id = "psychomita/intellicv-models"
    paths = [
        hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
        for filename in ("clf.pkl", "tfidf.pkl", "encoder.pkl")
    ]
    svc_model, tfidf, le = (_read_pickle(path) for path in paths)
    return svc_model, tfidf, le


def load_models():
    """Return ``(svc_model, tfidf, le)``, or three ``None`` values on failure.

    Any error raised while downloading or unpickling is shown to the user via
    ``st.error`` rather than propagated.
    """
    try:
        return _download_and_load_models()
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None, None
# Load models once at module level. Each name is None when loading failed
# (load_models reports the error itself and returns a tuple of three Nones).
svc_model, tfidf, le = load_models()
# Ordered (compiled pattern, replacement) pairs applied by cleanResume.
# Order matters:
#   * e-mail addresses are removed before mentions/punctuation, otherwise
#     "@\S+" eats the domain first and the local part survives as plain words;
#   * whitespace is collapsed last, because every earlier substitution inserts
#     a space and would otherwise leave multi-space runs behind.
_CLEAN_PATTERNS = tuple(
    (re.compile(pattern), repl)
    for pattern, repl in (
        (r'http\S+|www\S+|https\S+', ' '),  # URLs
        (r'\b[\w\.-]+@[\w\.-]+\.\w+\b', ' '),  # Email addresses
        (r'\bRT\b|\bretweet\b', ' '),  # Retweets
        (r'#\S+', ' '),  # Hashtags
        (r'@\S+', ' '),  # Mentions
        (r'[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' '),  # Punctuation
        (r'[^\x00-\x7f]', ' '),  # Non-ASCII chars
        (r'\b\d+\b', ' '),  # Standalone numbers
        (r'\s+', ' '),  # Extra whitespace (must stay last)
    )
)


def cleanResume(txt):
    """Normalise raw resume text before vectorisation.

    Removes URLs, e-mail addresses, retweet markers, hashtags, mentions,
    ASCII punctuation, non-ASCII characters and standalone numbers, then
    collapses whitespace.

    Args:
        txt: Raw text. Any non-string value yields an empty string.

    Returns:
        The cleaned text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    # NOTE(review): the vectoriser was presumably fitted on text cleaned by a
    # similar routine; confirm that the training-time cleaning matches.
    if not isinstance(txt, str):
        return ""
    cleaned = txt
    for pattern, repl in _CLEAN_PATTERNS:
        cleaned = pattern.sub(repl, cleaned)
    return cleaned.strip()
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF and join the pages with spaces.

    Args:
        file: Binary file-like object; its full contents are read into memory.

    Returns:
        The concatenated page texts. A page with no extractable text
        contributes an empty string; a page whose extraction raises is skipped
        after an ``st.warning``.

    Raises:
        ValueError: if the document cannot be opened or iterated. The original
            exception is attached as ``__cause__``.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(BytesIO(file.read()))
        pages_text = []
        for page in pdf_reader.pages:
            # Best effort per page: one unreadable page must not discard the
            # text already collected from the others.
            try:
                pages_text.append(page.extract_text() or '')
            except Exception as e:
                st.warning(f"Could not extract text from one page: {str(e)}")
        return ' '.join(pages_text)
    except Exception as e:
        raise ValueError(f"Failed to read PDF file: {str(e)}") from e
def extract_text_from_docx(file):
    """Extract the non-blank paragraphs of a DOCX file, one per line.

    Args:
        file: Binary file-like object; its full contents are read into memory.

    Returns:
        The paragraph texts joined with newlines. Paragraphs containing only
        whitespace are skipped.

    Raises:
        ValueError: if the document cannot be parsed. The original exception
            is attached as ``__cause__``.
    """
    try:
        doc = docx.Document(BytesIO(file.read()))
        return '\n'.join(
            paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()
        )
    except Exception as e:
        raise ValueError(f"Failed to read DOCX file: {str(e)}") from e
def extract_text_from_txt(file):
    """Decode an uploaded plain-text file to ``str``.

    The bytes are read once and decoded with the first codec that accepts
    them:

    1. ``utf-8-sig`` - UTF-8, dropping a leading byte-order mark if present;
    2. ``windows-1252`` - tried before Latin-1 so bytes 0x80-0x9F become the
       intended characters (curly quotes, dashes) rather than control codes;
    3. ``latin-1`` - maps every byte value, so it always succeeds.

    Args:
        file: Binary file-like object whose ``read()`` returns ``bytes``.

    Returns:
        The decoded text.
    """
    raw = file.read()
    for encoding in ('utf-8-sig', 'windows-1252'):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    # Latin-1 assigns a character to every byte, so this cannot fail.
    return raw.decode('latin-1')
def handle_file_upload(uploaded_file):
    """Extract text from an uploaded resume, dispatching on its extension.

    Args:
        uploaded_file: Uploaded file object exposing ``name`` and ``read()``,
            or a falsy value when nothing was uploaded.

    Returns:
        The extracted text, or ``None`` when ``uploaded_file`` is falsy.

    Raises:
        ValueError: for an unsupported or missing extension, or when the
            matching extractor fails. The message is prefixed with
            "Error processing file: " and the underlying exception is attached
            as ``__cause__``.
    """
    if not uploaded_file:
        return None
    filename = uploaded_file.name
    # A name without a dot has no extension; do not mistake the whole name
    # (e.g. a file literally called "pdf") for one.
    file_extension = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
    try:
        if file_extension == 'pdf':
            return extract_text_from_pdf(uploaded_file)
        if file_extension == 'docx':
            return extract_text_from_docx(uploaded_file)
        if file_extension == 'txt':
            return extract_text_from_txt(uploaded_file)
        raise ValueError("Unsupported file type. Please upload PDF, DOCX, or TXT.")
    except Exception as e:
        raise ValueError(f"Error processing file: {str(e)}") from e
def predict_category(input_resume):
    """Classify resume text and return the predicted category label.

    Returns "Unknown" for empty or non-string input, "Unknown (insufficient
    text)" when nothing is left after cleaning, and "Unknown (prediction
    failed)" (after reporting via ``st.error``) when any step raises.
    """
    if not (input_resume and isinstance(input_resume, str)):
        return "Unknown"
    try:
        normalized = cleanResume(input_resume)
        if normalized.strip() == "":
            return "Unknown (insufficient text)"
        # Vectorise the single document, classify it, then map the encoded
        # class back to its label.
        features = tfidf.transform([normalized]).toarray()
        encoded_prediction = svc_model.predict(features)
        labels = le.inverse_transform(encoded_prediction)
        return labels[0]
    except Exception as err:
        st.error(f"Prediction error: {str(err)}")
        return "Unknown (prediction failed)"
# Emoji shown next to each known category. Written as escapes so they survive
# re-encoding of this source file; the literals had degraded to mojibake.
_CATEGORY_EMOJIS = {
    "Data Science": "\U0001F4CA",  # bar chart
    "HR": "\U0001F465",  # busts in silhouette
    "Design": "\U0001F3A8",  # artist palette
    "Information Technology": "\U0001F4BB",  # laptop
    "Education": "\U0001F4DA",  # books
    "Business Development": "\U0001F4C8",  # chart increasing
    "Marketing": "\U0001F4E2",  # loudspeaker
    "Sales": "\U0001F4B0",  # money bag
    "Health and Fitness": "\U0001F4AA",  # flexed biceps
    "Engineering": "\u2699\ufe0f",  # gear
}
# Fallback for categories without a dedicated emoji (magnifying glass).
_DEFAULT_CATEGORY_EMOJI = "\U0001F50D"


def display_results(resume_text, category):
    """Render the predicted category card and a text-analysis expander.

    Args:
        resume_text: Raw text extracted from the uploaded resume.
        category: Predicted category label to display.
    """
    st.subheader("Analysis Results")
    # Category display with emoji
    emoji = _CATEGORY_EMOJIS.get(category, _DEFAULT_CATEGORY_EMOJI)
    st.markdown(f"""
<div class="result-card">
<h3>Predicted Job Category</h3>
<p class="category">{emoji} {category}</p>
</div>
""", unsafe_allow_html=True)
    # Text analysis section
    with st.expander("Text Analysis Details"):
        st.markdown("**Cleaned Text Excerpt:**")
        cleaned_text = cleanResume(resume_text)
        # Show at most the first 500 characters, marking truncation with "...".
        if len(cleaned_text) > 500:
            excerpt = cleaned_text[:500] + "..."
        else:
            excerpt = cleaned_text
        st.text(excerpt)
        st.markdown("**Statistics:**")
        original_len = len(resume_text)
        cleaned_len = len(cleaned_text)
        # Nothing can be removed from empty input: report 0% instead of
        # dividing by zero.
        if original_len:
            reduction = 100 - cleaned_len / original_len * 100
        else:
            reduction = 0.0
        col1, col2, col3 = st.columns(3)
        col1.metric("Original Length", f"{original_len:,} chars")
        col2.metric("Cleaned Length", f"{cleaned_len:,} chars")
        col3.metric("Reduction", f"{reduction:.1f}%")
# Upload size limit enforced by main(); matches the "max 10MB" uploader hint.
_MAX_UPLOAD_BYTES = 10 * 1024 * 1024


def main():
    """Render the page: intro, sidebar, uploader and analysis results.

    After a file is uploaded, its text is extracted, classified and displayed,
    and the cleaned text is offered as a download. Any processing error is
    reported in the page instead of being raised.
    """
    # Emoji are written as escapes (U+1F4C4 page, U+1F50D magnifying glass) so
    # they survive re-encoding of this source file.
    st.title("\U0001F4C4 IntelliCV: AI-Powered Resume Analyzer")
    st.markdown("""
Upload your resume and discover which job category it best matches with our AI analysis.
Supported formats: PDF, DOCX, and TXT.
""")
    # Sidebar with additional info
    with st.sidebar:
        st.header("About")
        st.markdown("""
IntelliCV uses machine learning to analyze resume content and predict the most suitable job category.
**How it works:**
1. Upload your resume
2. AI extracts and cleans the text
3. Our model predicts the job category
4. View detailed analysis
""")
        st.markdown("---")
        st.markdown("""
**Common Categories:**
- Data Science
- HR
- Design
- Information Technology
- Education
- And more...
""")
    # File upload section
    uploaded_file = st.file_uploader(
        "Upload your resume",
        type=["pdf", "docx", "txt"],
        help="Supported formats: PDF, DOCX, TXT (max 10MB)"
    )
    if uploaded_file is None:
        return
    # Check file size
    if uploaded_file.size > _MAX_UPLOAD_BYTES:
        st.error("File size exceeds 10MB limit. Please upload a smaller file.")
        return
    with st.spinner("\U0001F50D Analyzing your resume..."):
        try:
            resume_text = handle_file_upload(uploaded_file)
            # handle_file_upload may return None; treat that like empty text.
            if not resume_text or not resume_text.strip():
                st.warning("The uploaded file appears to be empty or couldn't be read properly.")
                return
            category = predict_category(resume_text)
            display_results(resume_text, category)
            # Download button for cleaned text: serve the cleaned text the
            # label and file name promise, not the raw extraction.
            st.download_button(
                label="Download Cleaned Text",
                data=cleanResume(resume_text),
                file_name="cleaned_resume.txt",
                mime="text/plain"
            )
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
            st.info("Please try another file or check the format.")
if __name__ == "__main__":
    # load_models() signals failure with None, so test for None explicitly
    # rather than relying on the truthiness of the loaded objects (an object
    # that defines __len__ or __bool__ may be falsy while perfectly usable).
    if all(model is not None for model in (svc_model, tfidf, le)):
        main()
    else:
        st.error("Failed to load required models. Please try again later.")