intelliCV

Sleeping

File size: 9,036 Bytes

import streamlit as st
import pickle
import docx
import PyPDF2
import re
from huggingface_hub import hf_hub_download
from io import BytesIO

# Configure the browser tab (title and icon), a centered layout, and a sidebar
# that starts expanded.
# NOTE(review): Streamlit has historically required set_page_config to be the
# first Streamlit command in the script -- confirm for the deployed version
# before moving any other st.* call above it.
st.set_page_config(
    page_title="IntelliCV: AI Resume Analyzer",
    page_icon="📄",
    layout="centered",
    initial_sidebar_state="expanded"
)

# Inject a small inline stylesheet. The `.result-card` and `.category` classes
# are referenced by the HTML that display_results() renders; the `.stSpinner`
# rule presumably centers Streamlit's spinner (it targets a Streamlit-generated
# class name -- verify after Streamlit upgrades).
# unsafe_allow_html=True is needed so the <style> tag is emitted as HTML.
st.markdown("""
<style>
    .result-card {
        padding: 20px;
        border-radius: 10px;
        background-color: #f0f2f6;
        margin-bottom: 20px;
    }
    .category {
        font-size: 24px;
        font-weight: bold;
        color: #2e86c1;
    }
    .stSpinner > div {
        text-align: center;
        align-items: center;
    }
</style>
""", unsafe_allow_html=True)

# Hugging Face Hub repository that hosts the pickled model artifacts.
_MODEL_REPO_ID = "psychomita/intellicv-models"


def _load_pickle_from_hub(filename):
    """Download ``filename`` from the model repository and unpickle it."""
    local_path = hf_hub_download(
        repo_id=_MODEL_REPO_ID, filename=filename, repo_type="model"
    )
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only keep this pointed at a repository whose contents are trusted.
    with open(local_path, 'rb') as fh:  # context manager closes the handle
        return pickle.load(fh)


@st.cache_resource(show_spinner="Loading AI models...")
def _load_models_cached():
    """Return ``(classifier, tfidf_vectorizer, label_encoder)`` or raise.

    Errors deliberately propagate out of the cached function: only a
    successful return value is stored by the cache, so a transient download
    failure is retried on the next script run instead of being remembered.
    """
    return (
        _load_pickle_from_hub("clf.pkl"),
        _load_pickle_from_hub("tfidf.pkl"),
        _load_pickle_from_hub("encoder.pkl"),
    )


def load_models():
    """Load the classifier, TF-IDF vectorizer and label encoder.

    Returns:
        A ``(svc_model, tfidf, le)`` tuple on success. On any failure an
        error is shown in the UI and ``(None, None, None)`` is returned.
    """
    try:
        return _load_models_cached()
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None, None


# Keep the cache-clearing hook reachable under the public name.
load_models.clear = _load_models_cached.clear

# Load the classifier, TF-IDF vectorizer and label encoder at module level.
# load_models() returns (None, None, None) on failure, so each name may be
# None; predict_category() reads these three module-level names.
svc_model, tfidf, le = load_models()

# Ordered (compiled pattern, replacement) pairs applied by cleanResume().
# Order matters:
#   * URLs and e-mail addresses are removed while their punctuation is still
#     intact; once '@' and '.' have been stripped they can no longer be
#     recognised.
#   * Standalone numbers are removed after punctuation so "2020-2021" is
#     split first.
#   * Whitespace is collapsed last, because every earlier step substitutes a
#     space and can leave runs of blanks behind.
_CLEAN_PATTERNS = tuple(
    (re.compile(pattern), repl)
    for pattern, repl in (
        (r'http\S+|www\S+', ' '),                  # URLs (http\S+ covers https)
        (r'\b[\w\.-]+@[\w\.-]+\.\w+\b', ' '),      # Email addresses
        (r'\bRT\b|\bretweet\b', ' '),              # Retweets
        (r'#\S+', ' '),                            # Hashtags
        (r'@\S+', ' '),                            # Mentions
        # Raw literal: the set contains a backslash, which must not be
        # written as an (invalid) escape sequence.
        (r'[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' '),  # Punctuation
        (r'[^\x00-\x7f]', ' '),                    # Non-ASCII chars
        (r'\b\d+\b', ' '),                         # Standalone numbers
        (r'\s+', ' '),                             # Extra whitespace
    )
)


def cleanResume(txt):
    """Normalise resume text before vectorisation.

    Removes URLs, e-mail addresses, retweet markers, hashtags, mentions,
    punctuation, non-ASCII characters and standalone numbers, then collapses
    whitespace.

    Args:
        txt: Raw resume text.

    Returns:
        The cleaned, stripped text, or ``""`` when ``txt`` is not a string.
    """
    if not isinstance(txt, str):
        return ""

    cleaned = txt
    for pattern, repl in _CLEAN_PATTERNS:
        cleaned = pattern.sub(repl, cleaned)

    return cleaned.strip()

def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Binary file-like object exposing ``read()``.

    Returns:
        The page texts joined with single spaces. A page whose extraction
        fails is skipped and a warning is shown in the UI.

    Raises:
        ValueError: If the PDF cannot be opened or read at all; the original
            exception is attached as ``__cause__``.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(BytesIO(file.read()))
        page_texts = []
        for page in pdf_reader.pages:
            # Best effort per page: one unreadable page must not discard
            # the text already extracted from the others.
            try:
                page_texts.append(page.extract_text() or '')
            except Exception as e:
                st.warning(f"Could not extract text from one page: {str(e)}")
        return ' '.join(page_texts)
    except Exception as e:
        # Chain explicitly so the underlying parser error stays visible.
        raise ValueError(f"Failed to read PDF file: {str(e)}") from e

def extract_text_from_docx(file):
    """Extract the non-blank paragraphs of a DOCX document.

    Args:
        file: Binary file-like object exposing ``read()``.

    Returns:
        The text of every paragraph that is not whitespace-only, joined
        with newlines.

    Raises:
        ValueError: If the document cannot be opened or read; the original
            exception is attached as ``__cause__``.
    """
    try:
        document = docx.Document(BytesIO(file.read()))
        return '\n'.join(
            paragraph.text
            for paragraph in document.paragraphs
            if paragraph.text.strip()
        )
    except Exception as e:
        # Chain explicitly so the underlying parser error stays visible.
        raise ValueError(f"Failed to read DOCX file: {str(e)}") from e

def extract_text_from_txt(file):
    """Decode an uploaded plain-text file, trying several encodings.

    The bytes are read once and decoded with, in order:

    * ``utf-8-sig`` -- UTF-8, dropping a leading byte-order mark if present;
    * ``windows-1252`` -- tried before latin-1 because it rejects a few
      undefined bytes, whereas latin-1 accepts anything;
    * ``latin-1`` -- maps every byte to a character, so it acts as the
      final fallback.

    Args:
        file: Binary file-like object exposing ``read()``.

    Returns:
        The decoded text.

    Raises:
        ValueError: If no encoding succeeds. With latin-1 in the list this
            is only a safeguard in case the list is edited later.
    """
    raw = file.read()  # read once; every attempt decodes the same bytes
    for encoding in ('utf-8-sig', 'windows-1252', 'latin-1'):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    raise ValueError("Failed to decode text file with common encodings")

def handle_file_upload(uploaded_file):
    """Extract text from an uploaded resume based on its file extension.

    Args:
        uploaded_file: Uploaded file object with a ``name`` attribute and a
            ``read()`` method, or a falsy value when nothing was uploaded.

    Returns:
        The extracted text, or ``None`` when ``uploaded_file`` is falsy.

    Raises:
        ValueError: If the extension is not pdf/docx/txt or extraction
            fails; the underlying exception is attached as ``__cause__``.
    """
    if not uploaded_file:
        return None

    # Text after the last dot, lower-cased; a name without a dot yields the
    # whole name and falls through to the "unsupported" branch.
    file_extension = uploaded_file.name.split('.')[-1].lower()
    try:
        if file_extension == 'pdf':
            return extract_text_from_pdf(uploaded_file)
        elif file_extension == 'docx':
            return extract_text_from_docx(uploaded_file)
        elif file_extension == 'txt':
            return extract_text_from_txt(uploaded_file)
        else:
            raise ValueError("Unsupported file type. Please upload PDF, DOCX, or TXT.")
    except Exception as e:
        # Chain explicitly so the root cause is preserved for debugging.
        raise ValueError(f"Error processing file: {str(e)}") from e

def predict_category(input_resume):
    """Predict the job category for a resume.

    Args:
        input_resume: Raw resume text.

    Returns:
        The decoded category label, or one of the "Unknown ..." strings when
        the input is empty / not a string, cleans down to nothing, or the
        prediction pipeline raises (the error is also shown in the UI).
    """
    # Reject empty values and anything that is not a string up front.
    if not (input_resume and isinstance(input_resume, str)):
        return "Unknown"

    try:
        normalized = cleanResume(input_resume)
        if normalized.strip():
            # Vectorise with the module-level TF-IDF model, classify, then
            # map the encoded label back to its category name.
            features = tfidf.transform([normalized]).toarray()
            encoded_label = svc_model.predict(features)
            return le.inverse_transform(encoded_label)[0]
        return "Unknown (insufficient text)"
    except Exception as e:
        st.error(f"Prediction error: {str(e)}")
        return "Unknown (prediction failed)"

def display_results(resume_text, category):
    """Render the predicted category card and text statistics.

    Args:
        resume_text: Raw text extracted from the uploaded resume.
        category: Predicted category label to display.
    """
    st.subheader("Analysis Results")

    # Icon shown next to known categories; anything else gets a magnifier.
    icons_by_category = {
        "Data Science": "📊",
        "HR": "👥",
        "Design": "🎨",
        "Information Technology": "💻",
        "Education": "📚",
        "Business Development": "📈",
        "Marketing": "📢",
        "Sales": "💰",
        "Health and Fitness": "💪",
        "Engineering": "⚙️"
    }
    icon = icons_by_category.get(category, "🔍")

    st.markdown(f"""
    <div class="result-card">
        <h3>Predicted Job Category</h3>
        <p class="category">{icon} {category}</p>
    </div>
    """, unsafe_allow_html=True)

    # Collapsible section with a preview of the cleaned text and size metrics.
    with st.expander("Text Analysis Details"):
        st.markdown("**Cleaned Text Excerpt:**")
        cleaned = cleanResume(resume_text)

        # Show at most the first 500 characters, marking truncation.
        if len(cleaned) > 500:
            excerpt = cleaned[:500] + "..."
        else:
            excerpt = cleaned
        st.text(excerpt)

        st.markdown("**Statistics:**")
        original_col, cleaned_col, reduction_col = st.columns(3)
        original_col.metric("Original Length", f"{len(resume_text):,} chars")
        cleaned_col.metric("Cleaned Length", f"{len(cleaned):,} chars")

        # Share of the original kept after cleaning; 0 for empty input so
        # the division is never attempted on a zero length.
        if resume_text:
            kept_percent = len(cleaned) / len(resume_text) * 100
        else:
            kept_percent = 0
        reduction_col.metric("Reduction", f"{100 - kept_percent:.1f}%")

def main():
    """Render the app: intro, sidebar, upload widget and analysis results.

    When a file is uploaded it is size-checked, its text is extracted and
    classified, the results are displayed, and the cleaned text is offered
    for download. Any exception raised during analysis is shown in the UI
    rather than propagated.
    """
    st.title("📄 IntelliCV: AI-Powered Resume Analyzer")
    st.markdown("""
    Upload your resume and discover which job category it best matches with our AI analysis.
    Supported formats: PDF, DOCX, and TXT.
    """)
    
    # Sidebar with additional info
    with st.sidebar:
        st.header("About")
        st.markdown("""
        IntelliCV uses machine learning to analyze resume content and predict the most suitable job category.
        
        **How it works:**
        1. Upload your resume
        2. AI extracts and cleans the text
        3. Our model predicts the job category
        4. View detailed analysis
        """)
        
        st.markdown("---")
        st.markdown("""
        **Common Categories:**
        - Data Science
        - HR
        - Design
        - Information Technology
        - Education
        - And more...
        """)
    
    # File upload section
    uploaded_file = st.file_uploader(
        "Upload your resume", 
        type=["pdf", "docx", "txt"],
        help="Supported formats: PDF, DOCX, TXT (max 10MB)"
    )
    
    if uploaded_file is None:
        return

    # Enforce the 10MB limit advertised in the uploader's help text.
    if uploaded_file.size > 10 * 1024 * 1024:
        st.error("File size exceeds 10MB limit. Please upload a smaller file.")
        return

    with st.spinner("🔍 Analyzing your resume..."):
        try:
            resume_text = handle_file_upload(uploaded_file)
            # handle_file_upload may return None; guard before .strip().
            if not resume_text or not resume_text.strip():
                st.warning("The uploaded file appears to be empty or couldn't be read properly.")
                return

            category = predict_category(resume_text)
            display_results(resume_text, category)

            # The button is labelled "Cleaned Text" and saves as
            # cleaned_resume.txt, so hand it the cleaned text rather than
            # the raw extraction.
            st.download_button(
                label="Download Cleaned Text",
                data=cleanResume(resume_text),
                file_name="cleaned_resume.txt",
                mime="text/plain"
            )

        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
            st.info("Please try another file or check the format.")

if __name__ == "__main__":
    # Compare against None explicitly instead of relying on truthiness:
    # load_models() signals failure with None, while a successfully loaded
    # object could still evaluate as falsy if its class defines __len__.
    if all(model is not None for model in (svc_model, tfidf, le)):
        main()
    else:
        st.error("Failed to load required models. Please try again later.")