# rbbist's picture
# Update app.py
# a616854 verified
import gradio as gr
import os
from semantic_search import CVSemanticSearch
import logging
# Logging: INFO and above, through a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Google Drive configuration - UPDATE THESE VALUES
# ID of the Drive folder to load CVs from (replace with your folder ID).
FOLDER_ID = "1j1faOlXxoYfPLdzDfGvDbtkENsRoDxXN"
# The API key is read from the GOOGLE_DRIVE_API_KEY environment variable;
# it is None when the variable is not set.
API_KEY = os.environ.get("GOOGLE_DRIVE_API_KEY")

# Module-level state shared between start-up and the UI callbacks.
cv_search = None  # search system instance; stays None until initialisation creates it
file_mapping = dict()  # keyed by CV filename; entries are read for their 'webViewLink'
initialization_status = "Initializing..."  # human-readable start-up status
def initialize_database():
    """
    Initialize the database by loading CVs from the Google Drive folder.

    This runs once when the space starts. It creates the CVSemanticSearch
    instance, asks it to load the CVs from FOLDER_ID using API_KEY, and
    records the outcome in the module-level ``cv_search``, ``file_mapping``
    and ``initialization_status`` globals.

    Returns:
        True if at least one CV was loaded, False if none were loaded or an
        exception was raised (the exception is logged, not propagated).
    """
    global cv_search, initialization_status, file_mapping

    if not API_KEY:
        # Loading is still attempted, but make the most likely cause of a
        # failure visible in the logs up front.
        logger.warning("GOOGLE_DRIVE_API_KEY is not set")

    try:
        logger.info("Initializing CV Semantic Search system...")
        cv_search = CVSemanticSearch()

        logger.info("Loading CVs from Google Drive folder...")
        successful, total, file_map = cv_search.load_cvs_from_google_drive(FOLDER_ID, API_KEY)
        # Keep file_mapping a dict even if the loader hands back None, so
        # later `filename in file_mapping` checks cannot raise.
        file_mapping = file_map or {}

        if successful > 0:
            initialization_status = f"✅ Successfully loaded {successful}/{total} CVs into database"
            logger.info(initialization_status)
            return True

        initialization_status = "❌ Failed to load any CVs from Google Drive. Check API key and folder ID."
        logger.error(initialization_status)
        return False
    except Exception as e:
        initialization_status = f"❌ Error during initialization: {str(e)}"
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(initialization_status)
        return False
def process_job_description(jd_text, jd_file):
    """
    Process job description from either text input or PDF file.

    The PDF takes priority: when ``jd_file`` is given and text can be
    extracted from it, that text is returned. Any failure while reading or
    extracting the PDF is logged and the text input is used instead.

    Args:
        jd_text: Job description as text (may be None or empty)
        jd_file: Job description as PDF file; either an object exposing the
            file path as ``.name`` or a plain path string (may be None)

    Returns:
        Processed (stripped) job description text, or "" if neither input
        yields any text
    """
    # Priority: PDF file over text input
    if jd_file is not None:
        # Accept both an upload object with a `.name` path and a bare path
        # string, instead of failing silently on the latter.
        pdf_path = getattr(jd_file, "name", jd_file)
        try:
            with open(pdf_path, 'rb') as f:
                pdf_content = f.read()
            extracted_text = cv_search.extract_text_from_pdf_bytes(pdf_content)
            if extracted_text and extracted_text.strip():
                return extracted_text.strip()
            logger.warning("No text could be extracted from the JD PDF; falling back to text input")
        except Exception as e:
            # Best effort: keep the traceback in the log and fall through.
            logger.exception("Error processing JD PDF: %s", e)

    # Fallback to text input
    if jd_text and jd_text.strip():
        return jd_text.strip()
    return ""
def _match_quality(similarity_percentage):
    """Map an overall match percentage to an (emoji, label) pair."""
    if similarity_percentage >= 80:
        return "🟢", "Excellent Match"
    if similarity_percentage >= 65:
        return "🟡", "Good Match"
    if similarity_percentage >= 50:
        return "🟠", "Fair Match"
    return "🔴", "Weak Match"


def search_matching_cvs(jd_text, jd_file, num_results):
    """
    Search for CVs matching the job description.

    Args:
        jd_text: Job description as text
        jd_file: Job description as PDF file
        num_results: Number of results to return

    Returns:
        Markdown-formatted search results, or a Markdown error message when
        the system is not initialised, no job description was supplied, the
        database is empty, or nothing matched
    """
    if cv_search is None:
        return f"❌ System not initialized properly.\n\n{initialization_status}\n\nPlease refresh the page or check the configuration."

    # Process job description
    job_description = process_job_description(jd_text, jd_file)
    if not job_description:
        return "❌ Please provide a job description either as text or upload a PDF file."

    # Get database info
    db_info = cv_search.get_database_info()
    if db_info['unique_cvs'] == 0:
        return f"❌ No CVs in database.\n\n{initialization_status}"

    # Perform search; coerce to int so a float coming from the slider is
    # never passed on as top_k.
    results = cv_search.search_cvs(job_description, top_k=int(num_results))
    if not results:
        return "❌ No matching CVs found. Try using different keywords or requirements in your job description."

    # Format results. Blocks are joined with blank lines so that "---" is a
    # rule (not a heading underline) and text after a list starts a new
    # paragraph instead of being absorbed into the last bullet.
    jd_preview = job_description[:150] + "..." if len(job_description) > 150 else job_description
    sections = [
        f"# 🎯 Top {len(results)} Matching CVs",
        f"**Job Description**: {jd_preview}",
        "---",
    ]

    for i, cv in enumerate(results, 1):
        similarity_percentage = cv['weighted_score'] * 100
        filename = cv['filename']
        match_emoji, match_text = _match_quality(similarity_percentage)

        # Only emit a hyperlink when a Drive URL is actually known; otherwise
        # show plain text rather than a link pointing nowhere.
        drive_link = (file_mapping.get(filename) or {}).get('webViewLink')
        if drive_link:
            link_line = f"🔗 **[Open CV in Google Drive]({drive_link})**"
        else:
            link_line = "🔗 **Google Drive link not available**"

        best_section = cv['max_similarity'] * 100
        average = cv['avg_similarity'] * 100
        chunk_count = cv['chunk_count']
        best_match_text = cv['best_match_text']

        sections.extend([
            f"## {i}. {filename}",
            f"**{match_emoji} {match_text}** - **{similarity_percentage:.1f}% Overall Match**",
            "📊 **Detailed Scores:**",
            (
                f"- Best Section Match: {best_section:.1f}%\n"
                f"- Average Match: {average:.1f}%\n"
                f"- CV Sections Analyzed: {chunk_count}"
            ),
            "💡 **Why This CV Matches:**",
            f'*"{best_match_text}"*',
            link_line,
            "---",
        ])

    return "\n\n".join(sections) + "\n"
def get_system_status():
    """
    Get current system status.

    Reads the module-level ``cv_search`` and ``initialization_status`` and
    reports one of three states: search system not created, search system
    created but holding no CVs, or ready (with database statistics).

    Returns:
        System information as a Markdown-formatted string
    """
    if cv_search is None:
        return f"""
## ⚠️ System Status: Not Ready

{initialization_status}

**Possible Issues:**
- Invalid Google Drive API key
- Incorrect folder ID
- Folder is not public
- No PDF files in the folder
"""

    db_info = cv_search.get_database_info()
    cv_count = db_info['unique_cvs']
    if cv_count == 0:
        return f"""
## ⚠️ System Status: No CVs Loaded

{initialization_status}

**Please Check:**
- Google Drive folder contains PDF files
- Folder is publicly accessible
- API key has proper permissions
"""

    chunk_count = db_info['total_chunks']
    filenames = db_info['cv_filenames']
    # Show at most three filenames, with an ellipsis when there are more.
    sample_cvs = ', '.join(filenames[:3]) + ('...' if len(filenames) > 3 else '')

    # cv_count is non-zero here, so the average below cannot divide by zero.
    return f"""
## ✅ System Status: Ready for Search

📊 **Database Statistics:**
- **CVs Loaded**: {cv_count} resumes
- **Text Chunks**: {chunk_count} searchable segments
- **Avg Chunks per CV**: {chunk_count / cv_count:.1f}

🤖 **AI Model**: Sentence Transformers (all-MiniLM-L6-v2)

📁 **Sample CVs**: {sample_cvs}
"""
# Create Gradio interface
def create_interface():
    """
    Create and return the Gradio interface.

    Builds a Blocks layout with a status panel (filled from
    get_system_status() at build time), a job-description input offered as a
    text tab and a PDF-upload tab, a result-count slider, a search button,
    a results panel, a status refresh button and a footer. The interface is
    returned without being launched.

    Returns:
        The configured gr.Blocks instance
    """
    custom_css = """
.main-container {
max-width: 1200px;
margin: auto;
padding: 20px;
}
.search-container {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white !important;
padding: 30px;
border-radius: 20px;
margin: 20px 0;
box-shadow: 0 10px 30px rgba(0,0,0,0.2);
}
.search-container * {
color: white !important;
}
.status-container {
background: #f8f9fa !important;
color: #333 !important;
padding: 25px;
border-radius: 15px;
margin: 20px 0;
border-left: 5px solid #007bff;
box-shadow: 0 5px 15px rgba(0,0,0,0.1);
}
.status-container * {
color: #333 !important;
}
.results-container {
background: #ffffff !important;
color: #333 !important;
padding: 25px;
border-radius: 15px;
border: 1px solid #dee2e6;
margin: 20px 0;
box-shadow: 0 5px 15px rgba(0,0,0,0.1);
}
.results-container * {
color: #333 !important;
}
.header {
text-align: center;
padding: 30px;
background: linear-gradient(135deg, #74b9ff, #0984e3);
color: white !important;
margin: -20px -20px 20px -20px;
border-radius: 15px 15px 0 0;
}
.header * {
color: white !important;
}
.tab-content {
padding: 15px;
}
.markdown-content {
background: #fff !important;
color: #333 !important;
padding: 20px;
border-radius: 10px;
}
.markdown-content * {
color: #333 !important;
}
"""

    def _clear_text_on_upload(uploaded_file):
        # Wipe the text box only when a file was actually provided. When the
        # file input is being cleared, leave the text untouched.
        return "" if uploaded_file is not None else gr.update()

    def _clear_file_on_text(text):
        # Drop the uploaded PDF only when real text was entered. An empty
        # text box (e.g. just cleared by an upload) must not remove the file,
        # otherwise every upload would immediately undo itself.
        return None if text and text.strip() else gr.update()

    with gr.Blocks(
        title="CV Semantic Search - Auto-loaded from Google Drive",
        theme=gr.themes.Soft(),
        css=custom_css
    ) as demo:
        with gr.Column(elem_classes=["main-container"]):
            # Blank lines around the headings let them be parsed as Markdown
            # inside the surrounding HTML block.
            gr.Markdown("""
<div class="header">

# 🚀 CV Semantic Search System
## AI-Powered Resume Matching
### *Automatically synced with Google Drive*

</div>
""")

            # System Status Display
            with gr.Row():
                status_display = gr.Markdown(
                    get_system_status(),
                    elem_classes=["status-container", "markdown-content"]
                )

            # Main Search Interface
            with gr.Row():
                with gr.Column():
                    with gr.Group(elem_classes=["search-container"]):
                        gr.Markdown("## 📋 Job Description Input")

                        with gr.Tab("📝 Text Input"):
                            jd_text = gr.Textbox(
                                label="Paste Job Description",
                                placeholder="""Paste your job description here...
Example:
Senior Software Engineer Position
Requirements:
• 5+ years of experience in Python, JavaScript, and React
• Strong background in machine learning and AI
• Experience with cloud platforms (AWS, Azure, GCP)
• Knowledge of microservices and API development
• Bachelor's degree in Computer Science or related field
• Excellent problem-solving and communication skills
Responsibilities:
• Design and develop scalable web applications
• Lead technical projects and mentor junior developers
• Collaborate with cross-functional teams
• Implement best practices for code quality and testing""",
                                lines=12,
                                max_lines=20,
                                elem_classes=["tab-content"]
                            )

                        with gr.Tab("📄 PDF Upload"):
                            jd_file = gr.File(
                                label="Upload Job Description PDF",
                                file_types=[".pdf"],
                                file_count="single",
                                elem_classes=["tab-content"]
                            )

                        with gr.Row():
                            num_results = gr.Slider(
                                label="Number of Top CVs to Return",
                                minimum=1,
                                maximum=10,
                                value=5,
                                step=1
                            )

                        search_btn = gr.Button(
                            "🔍 Find Best Matching CVs",
                            variant="primary",
                            size="lg"
                        )

            # Search Results
            with gr.Row():
                search_output = gr.Markdown(
                    """
# 📋 How to Use This System:
1. **Enter Job Requirements**: Use the text box or upload a PDF with your job description
2. **Click Search**: The AI will analyze semantic meaning and find the best matches
3. **Review Results**: See ranked CVs with detailed similarity scores and explanations

## 🎯 What Makes This Special:
- **Semantic Understanding**: Finds relevant CVs even if they don't use exact keywords
- **Automatic Sync**: CVs are always up-to-date from your Google Drive folder
- **Smart Ranking**: Combines multiple similarity metrics for accurate results
- **Detailed Analysis**: Shows why each CV matches your requirements

*Enter a job description above to get started!*
""",
                    elem_classes=["results-container", "markdown-content"]
                )

            # Refresh Status Button
            with gr.Row():
                refresh_btn = gr.Button("🔄 Refresh System Status", size="sm")

            # Event handlers
            search_btn.click(
                fn=search_matching_cvs,
                inputs=[jd_text, jd_file, num_results],
                outputs=[search_output]
            )
            refresh_btn.click(
                fn=get_system_status,
                outputs=[status_display]
            )
            # Clear text input when PDF is uploaded
            jd_file.change(
                fn=_clear_text_on_upload,
                inputs=[jd_file],
                outputs=[jd_text]
            )
            # Clear file input when text is entered
            jd_text.change(
                fn=_clear_file_on_text,
                inputs=[jd_text],
                outputs=[jd_file]
            )

            # Footer
            gr.Markdown("""
---
# 🛠️ Technical Details
- **Vector Database**: ChromaDB (rebuilt on each restart)
- **Embedding Model**: SentenceTransformers all-MiniLM-L6-v2
- **Text Extraction**: pdfplumber + OCR fallback for scanned documents
- **CV Source**: Google Drive folder (automatically synced)
- **Search Algorithm**: Cosine similarity with chunk aggregation
## 📞 Support
If no results appear, check that:
- Your Google Drive folder is public
- The folder contains PDF files
- Your API key is valid and has Drive API access
""", elem_classes=["markdown-content"])

    return demo
def main():
    """
    Main function to initialize and run the app.

    Loads the CV database (a failure is logged but does not stop the app, so
    the UI can still show the status), builds the interface and launches the
    server on 0.0.0.0:7860.

    A public share link is only requested when the GRADIO_SHARE environment
    variable is set to 1/true/yes: share links are not supported on Hugging
    Face Spaces, and when run elsewhere they would expose the loaded CVs
    through a public URL.
    """
    logger.info("Starting CV Semantic Search application...")

    # Initialize database at startup
    if initialize_database():
        logger.info("✅ Database initialization successful")
    else:
        logger.error("❌ Database initialization failed")

    share_requested = os.getenv("GRADIO_SHARE", "").strip().lower() in {"1", "true", "yes"}

    # Create and launch interface
    demo = create_interface()
    demo.launch(
        share=share_requested,  # Opt-in public link (see docstring)
        server_name="0.0.0.0",  # Enable access from outside container
        server_port=7860,  # Standard port for Hugging Face Spaces
        show_error=True
    )


if __name__ == "__main__":
    main()