"""Streamlit entry point for the Skills-Based Job Matching System.

On first run the script downloads a zip archive from Google Drive, extracts
it into the current working directory, and then loads the ranking pipeline
and skill extractor that are expected to be inside that archive.
"""

import os
import pickle
import zipfile
from pathlib import Path

import gdown
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st

# Page config
st.set_page_config(
    page_title="Skills-Based Job Matching System",
    page_icon="💼",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS (your existing CSS here)
st.markdown(""" """, unsafe_allow_html=True)

# Help text shown when the archive cannot be downloaded or extracted.
_SETUP_INSTRUCTIONS = """
**Setup Instructions:**
1. Upload your 1.2GB zip file to Google Drive
2. Right-click → Get link → Set to "Anyone with the link"
3. Copy the file ID from the link
4. Update GDRIVE_FILE_ID in the code

Example link: `https://drive.google.com/file/d/1a2b3c4d5e6f7g8h9i0j/view`
File ID: `1a2b3c4d5e6f7g8h9i0j`
"""

# Layout the extracted archive must have; shown when model loading fails.
_EXPECTED_ARCHIVE_LAYOUT = """
├── data/
│   └── processed_jobs.parquet
├── models/
│   └── (your model files)
├── skill_extractor.py
└── ranking_pipeline.py
"""


# ============================================
# GOOGLE DRIVE SETUP
# ============================================


@st.cache_resource
def download_from_gdrive():
    """Download and extract project data from Google Drive.

    Nothing is downloaded when ``data/processed_jobs.parquet`` and the
    ``models`` directory already exist. Otherwise the archive is fetched to
    ``project_data.zip``, extracted member by member into the current
    directory, and the archive is deleted afterwards.

    Returns:
        A ``(success, message)`` tuple. ``success`` is ``False`` when the
        download or extraction failed; ``message`` is a user-facing
        description of the outcome.

    Note:
        The return value is cached by ``st.cache_resource`` -- including a
        failure result. Callers that want a retry on the next rerun must
        call ``download_from_gdrive.clear()`` after a failure.
    """
    # REPLACE THIS with your Google Drive file ID
    GDRIVE_FILE_ID = "1mUUvKpFX1usIpLu-dSiYc-F6Zogfw95o"

    # File paths
    zip_path = "project_data.zip"

    # Check if data already exists
    data_exists = (
        Path("data/processed_jobs.parquet").exists()
        and Path("models").exists()
    )
    if data_exists:
        return True, "Data already loaded"

    try:
        # Download from Google Drive
        st.info("📥 Downloading data from Google Drive (1.2GB)...")
        st.info("This is a one-time download and will be cached. Please wait...")

        url = f"https://drive.google.com/uc?id={GDRIVE_FILE_ID}"

        # The call below takes no progress callback here, so the bar simply
        # moves from 0 to 100 once the download has returned.
        progress_bar = st.progress(0)
        status_text = st.empty()

        output = gdown.download(url, zip_path, quiet=False)
        if output is None:
            return False, "Failed to download. Please check if Google Drive link is public."

        progress_bar.progress(100)
        status_text.text("Download complete!")

        # Extract zip file
        st.info("📦 Extracting files...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            members = zip_ref.infolist()
            total_files = len(members)
            extract_progress = st.progress(0)
            for done, member in enumerate(members, start=1):
                zip_ref.extract(member, ".")
                extract_progress.progress(done / total_files)

        return True, "✓ Data downloaded and ready!"
    except Exception as e:
        # Top-level boundary: report the failure to the UI instead of crashing.
        return False, f"Error: {e}\n\nPlease ensure:\n1. Google Drive link is set to 'Anyone with the link'\n2. File ID is correct\n3. File is not corrupted"
    finally:
        # Clean up the archive on every path so a failed extraction does not
        # leave a large file behind. Best effort: a failed delete must not
        # mask the real result.
        if os.path.exists(zip_path):
            try:
                os.remove(zip_path)
            except OSError:
                pass


# Download data first
download_success, download_message = download_from_gdrive()

if not download_success:
    # Drop the cached failure tuple; otherwise every rerun would replay the
    # same error without attempting the download again.
    download_from_gdrive.clear()
    st.error("❌ Failed to load data")
    st.error(download_message)
    st.info(_SETUP_INSTRUCTIONS)
    st.stop()
else:
    st.success(download_message)

# ============================================
# YOUR EXISTING CODE CONTINUES HERE
# ============================================

# Initialize session state
if 'user_profile' not in st.session_state:
    st.session_state.user_profile = {
        'skills': [],
        'experience_level': 2,
        'min_salary': 0,
        'city': '',
        'state': '',
        'remote_only': False,
        'company_size': -1,
        'benefits': [],
    }

if 'search_results' not in st.session_state:
    st.session_state.search_results = None

if 'selected_job' not in st.session_state:
    st.session_state.selected_job = None

# Import your modules (make sure they're in the zip).
# These imports come after the download step because the modules are
# expected to be delivered by the extracted archive.
try:
    from skill_extractor import SkillExtractor
    from ranking_pipeline import RankingPipeline
except ImportError as e:
    st.error(f"Error importing modules: {e}")
    st.info("Make sure skill_extractor.py and ranking_pipeline.py are in your zip file")
    st.stop()


# Cache data loading
@st.cache_resource
def load_models_and_data():
    """Load all models and data.

    Reads ``data/processed_jobs.parquet``, builds a ``RankingPipeline`` on
    the ``models`` directory, attaches the jobs frame to it, loads its
    indices and LTR model, and creates a ``SkillExtractor``.

    Returns:
        A ``(pipeline, jobs_df, skill_extractor)`` tuple.

    Raises:
        Exception: Any error raised while loading is shown with
            ``st.error`` and then re-raised unchanged.
    """
    try:
        # Load processed data
        jobs_df = pd.read_parquet('data/processed_jobs.parquet')

        # Initialize pipeline
        pipeline = RankingPipeline(model_dir='models')
        pipeline.jobs_df = jobs_df
        pipeline.load_indices()
        pipeline.load_ltr_model()

        # Load skill extractor
        skill_extractor = SkillExtractor()

        return pipeline, jobs_df, skill_extractor
    except Exception as e:
        st.error(f"Error loading models: {e}")
        raise


try:
    pipeline, jobs_df, skill_extractor = load_models_and_data()
    data_loaded = True
except Exception as e:
    st.error(f"Error loading data: {e}")
    st.info("Please check that your zip file contains:")
    st.code(_EXPECTED_ARCHIVE_LAYOUT)
    data_loaded = False
    st.stop()

# ============================================
# REST OF YOUR EXISTING APP CODE
# ============================================

# Sidebar - User Profile
with st.sidebar:
    st.markdown("### 👤 User Profile")

    # Skills input
    st.markdown("#### Skills")
    all_skills = skill_extractor.get_all_skills() if data_loaded else []
    selected_skills = st.multiselect(
        "Select your skills",
        options=all_skills,
        default=st.session_state.user_profile['skills'],
        help="Start typing to search skills",
    )
    st.session_state.user_profile['skills'] = selected_skills

    # ... (rest of your sidebar code)

# Main content
st.markdown(
    """
Find your perfect job match using AI-powered ranking
""",
    unsafe_allow_html=True,
)

# ... (rest of your existing code)