Spaces:

rbbist
/

Semantic_Search_CVs

Sleeping

App Files Files Community

Semantic_Search_CVs / app.py

rbbist

Update app.py

a616854 verified 5 months ago

raw

history blame contribute delete

14.6 kB

	import gradio as gr
	import os
	from semantic_search import CVSemanticSearch
	import logging

	# Set up logging once for the whole app; INFO level so startup progress is logged
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Google Drive configuration.
	# The folder ID can be overridden with the GOOGLE_DRIVE_FOLDER_ID environment
	# variable; when that variable is unset or empty the original folder is used.
	FOLDER_ID = os.getenv("GOOGLE_DRIVE_FOLDER_ID") or "1j1faOlXxoYfPLdzDfGvDbtkENsRoDxXN"
	# The API key is read from the environment (None when the variable is unset),
	# so the secret itself never has to be written into this file.
	API_KEY = os.getenv("GOOGLE_DRIVE_API_KEY")

	# Module-level state shared by the functions in this file
	cv_search = None  # search engine instance; stays None until initialization creates it
	file_mapping = {}  # filename -> Drive metadata dict (read for its 'webViewLink' entry)
	initialization_status = "Initializing..."  # human-readable startup status shown in the UI

	def initialize_database():
	    """
	    Initialize the database by loading CVs from the Google Drive folder.

	    Creates the CVSemanticSearch instance, asks it to load the CVs found in
	    FOLDER_ID using API_KEY, and records the outcome in the module-level
	    ``initialization_status`` string. The file mapping returned by the loader
	    is stored in the module-level ``file_mapping``.

	    Returns:
	        True when at least one CV was loaded; False when none were loaded or
	        when any exception was raised along the way.
	    """
	    global cv_search, initialization_status, file_mapping

	    if not API_KEY:
	        # Do not abort: the loader decides what to do without a key, but make
	        # the most likely cause of a failed start visible in the logs.
	        logger.warning("GOOGLE_DRIVE_API_KEY is not set; loading CVs from Google Drive may fail")

	    try:
	        logger.info("Initializing CV Semantic Search system...")
	        cv_search = CVSemanticSearch()

	        logger.info("Loading CVs from Google Drive folder...")
	        successful, total, file_map = cv_search.load_cvs_from_google_drive(FOLDER_ID, API_KEY)
	        # Never store None here: the search code does membership tests on it
	        file_mapping = file_map or {}

	        if successful > 0:
	            initialization_status = f"✅ Successfully loaded {successful}/{total} CVs into database"
	            logger.info(initialization_status)
	            return True

	        initialization_status = "❌ Failed to load any CVs from Google Drive. Check API key and folder ID."
	        logger.error(initialization_status)
	        return False

	    except Exception as e:
	        # Top-level startup boundary: report the failure in the UI status and
	        # keep the traceback in the log instead of only the message.
	        initialization_status = f"❌ Error during initialization: {e}"
	        logger.exception(initialization_status)
	        return False

	def process_job_description(jd_text, jd_file):
	    """
	    Process job description from either text input or PDF file.

	    The PDF takes priority: when ``jd_file`` is given and text can be
	    extracted from it, that text is returned. If the file cannot be read or
	    yields no text, the error is logged and the typed text is used instead.

	    Args:
	        jd_text: Job description as text (may be None or blank)
	        jd_file: Uploaded job description PDF, or None. Either an object
	            whose ``.name`` attribute is the file path, or the path itself.

	    Returns:
	        The stripped job description text, or "" when neither input
	        provides any.
	    """
	    # Priority: PDF file over text input
	    if jd_file is not None:
	        # NOTE(review): the upload presumably arrives as a tempfile-like
	        # object with ``.name`` or as a plain path string depending on the
	        # Gradio version; accept both rather than assuming one.
	        pdf_path = getattr(jd_file, "name", jd_file)
	        try:
	            with open(pdf_path, 'rb') as pdf_handle:
	                pdf_content = pdf_handle.read()

	            # ``or ""`` guards against an extractor that returns None
	            extracted_text = (cv_search.extract_text_from_pdf_bytes(pdf_content) or "").strip()
	        except Exception as e:
	            # Best effort: a broken PDF must not block the text fallback,
	            # but keep the traceback so the cause can be diagnosed.
	            logger.exception("Error processing JD PDF: %s", e)
	        else:
	            if extracted_text:
	                return extracted_text

	    # Fallback to text input
	    if jd_text and jd_text.strip():
	        return jd_text.strip()

	    return ""

	# Score bands in percent, highest first, with the label shown for each band
	_MATCH_BANDS = (
	    (80, "🟢", "Excellent Match"),
	    (65, "🟡", "Good Match"),
	    (50, "🟠", "Fair Match"),
	)


	def _match_quality(similarity_percentage):
	    """Return the (emoji, label) pair for an overall score given in percent."""
	    for threshold, emoji, label in _MATCH_BANDS:
	        if similarity_percentage >= threshold:
	            return emoji, label
	    return "🔴", "Weak Match"


	def _drive_link_line(filename):
	    """Return the Markdown line linking to the CV, or a plain note when no link is known."""
	    entry = (file_mapping or {}).get(filename) or {}
	    drive_link = entry.get('webViewLink')
	    if drive_link:
	        return f"🔗 [Open CV in Google Drive]({drive_link})"
	    # A Markdown link with a non-URL target would render as a dead link
	    return "🔗 Google Drive link not available"


	def search_matching_cvs(jd_text, jd_file, num_results):
	    """
	    Search for CVs matching the job description.

	    Args:
	        jd_text: Job description as text
	        jd_file: Job description as PDF file
	        num_results: Number of results to return (converted to int before
	            it is passed to the search as ``top_k``)

	    Returns:
	        A Markdown string: either the ranked results, or a message starting
	        with "❌" when the system is not initialized, no job description was
	        given, the database is empty, or nothing matched.
	    """
	    if cv_search is None:
	        return f"❌ System not initialized properly.\n\n{initialization_status}\n\nPlease refresh the page or check the configuration."

	    # Process job description
	    job_description = process_job_description(jd_text, jd_file)
	    if not job_description:
	        return "❌ Please provide a job description either as text or upload a PDF file."

	    # Get database info
	    db_info = cv_search.get_database_info()
	    if db_info['unique_cvs'] == 0:
	        return f"❌ No CVs in database.\n\n{initialization_status}"

	    # Perform search; the UI control may deliver a float, top_k is a count
	    results = cv_search.search_cvs(job_description, top_k=int(num_results))
	    if not results:
	        return "❌ No matching CVs found. Try using different keywords or requirements in your job description."

	    # Format results: show at most the first 150 characters of the description
	    jd_preview = job_description[:150] + "..." if len(job_description) > 150 else job_description

	    header = f"""# 🎯 Top {len(results)} Matching CVs

	Job Description: {jd_preview}

	---

	"""
	    # Collect the pieces and join once instead of growing a string in the loop
	    sections = [header]

	    for rank, cv in enumerate(results, 1):
	        similarity_percentage = cv['weighted_score'] * 100
	        filename = cv['filename']
	        match_emoji, match_text = _match_quality(similarity_percentage)
	        link_line = _drive_link_line(filename)

	        sections.append(f"""## {rank}. {filename}

	{match_emoji} {match_text} - {similarity_percentage:.1f}% Overall Match

	📊 Detailed Scores:
	- Best Section Match: {cv['max_similarity']*100:.1f}%
	- Average Match: {cv['avg_similarity']*100:.1f}%
	- CV Sections Analyzed: {cv['chunk_count']}

	💡 Why This CV Matches:
	"{cv['best_match_text']}"

	{link_line}

	---

	""")

	    return "".join(sections)

	def get_system_status():
	    """
	    Build the Markdown text for the system status panel.

	    Three outcomes are possible: the search system was never created, it
	    exists but holds no CVs, or it is ready (in which case database
	    statistics and up to three sample filenames are listed).

	    Returns:
	        System information as formatted string
	    """
	    engine = cv_search

	    # Case 1: initialization never produced a search system
	    if engine is None:
	        return f"""
	## ⚠️ System Status: Not Ready

	{initialization_status}

	Possible Issues:
	- Invalid Google Drive API key
	- Incorrect folder ID
	- Folder is not public
	- No PDF files in the folder
	"""

	    stats = engine.get_database_info()
	    cv_total = stats['unique_cvs']

	    # Case 2: the system exists but nothing was loaded into it
	    if cv_total == 0:
	        return f"""
	## ⚠️ System Status: No CVs Loaded

	{initialization_status}

	Please Check:
	- Google Drive folder contains PDF files
	- Folder is publicly accessible
	- API key has proper permissions
	"""

	    # Case 3: ready - gather the figures first, then fill in the report
	    chunk_total = stats['total_chunks']
	    chunks_per_cv = chunk_total / cv_total
	    all_names = stats['cv_filenames']
	    sample_names = ', '.join(all_names[:3])
	    more_marker = '...' if len(all_names) > 3 else ''

	    return f"""
	## ✅ System Status: Ready for Search

	📊 Database Statistics:
	- CVs Loaded: {cv_total} resumes
	- Text Chunks: {chunk_total} searchable segments
	- Avg Chunks per CV: {chunks_per_cv:.1f}

	🤖 AI Model: Sentence Transformers (all-MiniLM-L6-v2)

	📁 Sample CVs: {sample_names}{more_marker}
	"""

	# Create Gradio interface
	def create_interface():
	    """
	    Create and return the Gradio interface.

	    Lays out a header, the system status panel, the job description inputs
	    (text box and PDF upload), the result panel, a status refresh button and
	    a footer, and wires the buttons to search_matching_cvs and
	    get_system_status.

	    Returns:
	        The gr.Blocks application; the caller is responsible for launching it.
	    """

	    with gr.Blocks(
	        title="CV Semantic Search - Auto-loaded from Google Drive",
	        theme=gr.themes.Soft(),
	        css="""
	.main-container {
	max-width: 1200px;
	margin: auto;
	padding: 20px;
	}
	.search-container {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	color: white !important;
	padding: 30px;
	border-radius: 20px;
	margin: 20px 0;
	box-shadow: 0 10px 30px rgba(0,0,0,0.2);
	}
	.search-container * {
	color: white !important;
	}
	.status-container {
	background: #f8f9fa !important;
	color: #333 !important;
	padding: 25px;
	border-radius: 15px;
	margin: 20px 0;
	border-left: 5px solid #007bff;
	box-shadow: 0 5px 15px rgba(0,0,0,0.1);
	}
	.status-container * {
	color: #333 !important;
	}
	.results-container {
	background: #ffffff !important;
	color: #333 !important;
	padding: 25px;
	border-radius: 15px;
	border: 1px solid #dee2e6;
	margin: 20px 0;
	box-shadow: 0 5px 15px rgba(0,0,0,0.1);
	}
	.results-container * {
	color: #333 !important;
	}
	.header {
	text-align: center;
	padding: 30px;
	background: linear-gradient(135deg, #74b9ff, #0984e3);
	color: white !important;
	margin: -20px -20px 20px -20px;
	border-radius: 15px 15px 0 0;
	}
	.header * {
	color: white !important;
	}
	.tab-content {
	padding: 15px;
	}
	.markdown-content {
	background: #fff !important;
	color: #333 !important;
	padding: 20px;
	border-radius: 10px;
	}
	.markdown-content * {
	color: #333 !important;
	}
	"""
	    ) as demo:

	        with gr.Column(elem_classes=["main-container"]):

	            gr.Markdown("""
	<div class="header">

	# 🚀 CV Semantic Search System
	## AI-Powered Resume Matching
	### Automatically synced with Google Drive

	</div>
	""")

	            # System Status Display (rendered once when the interface is built;
	            # the refresh button below re-computes it on demand)
	            with gr.Row():
	                status_display = gr.Markdown(
	                    get_system_status(),
	                    elem_classes=["status-container", "markdown-content"]
	                )

	            # Main Search Interface
	            with gr.Row():
	                with gr.Column():
	                    with gr.Group(elem_classes=["search-container"]):
	                        gr.Markdown("## 📋 Job Description Input")

	                        with gr.Tab("📝 Text Input"):
	                            jd_text = gr.Textbox(
	                                label="Paste Job Description",
	                                placeholder="""Paste your job description here...

	Example:
	Senior Software Engineer Position

	Requirements:
	• 5+ years of experience in Python, JavaScript, and React
	• Strong background in machine learning and AI
	• Experience with cloud platforms (AWS, Azure, GCP)
	• Knowledge of microservices and API development
	• Bachelor's degree in Computer Science or related field
	• Excellent problem-solving and communication skills

	Responsibilities:
	• Design and develop scalable web applications
	• Lead technical projects and mentor junior developers
	• Collaborate with cross-functional teams
	• Implement best practices for code quality and testing""",
	                                lines=12,
	                                max_lines=20,
	                                elem_classes=["tab-content"]
	                            )

	                        with gr.Tab("📄 PDF Upload"):
	                            jd_file = gr.File(
	                                label="Upload Job Description PDF",
	                                file_types=[".pdf"],
	                                file_count="single",
	                                elem_classes=["tab-content"]
	                            )

	                        with gr.Row():
	                            num_results = gr.Slider(
	                                label="Number of Top CVs to Return",
	                                minimum=1,
	                                maximum=10,
	                                value=5,
	                                step=1
	                            )

	                        search_btn = gr.Button(
	                            "🔍 Find Best Matching CVs",
	                            variant="primary",
	                            size="lg"
	                        )

	            # Search Results
	            with gr.Row():
	                search_output = gr.Markdown(
	                    """
	# 📋 How to Use This System:

	1. Enter Job Requirements: Use the text box or upload a PDF with your job description
	2. Click Search: The AI will analyze semantic meaning and find the best matches
	3. Review Results: See ranked CVs with detailed similarity scores and explanations

	## 🎯 What Makes This Special:
	- Semantic Understanding: Finds relevant CVs even if they don't use exact keywords
	- Automatic Sync: CVs are always up-to-date from your Google Drive folder
	- Smart Ranking: Combines multiple similarity metrics for accurate results
	- Detailed Analysis: Shows why each CV matches your requirements

	Enter a job description above to get started!
	""",
	                    elem_classes=["results-container", "markdown-content"]
	                )

	            # Refresh Status Button
	            with gr.Row():
	                refresh_btn = gr.Button("🔄 Refresh System Status", size="sm")

	            # Event handlers
	            search_btn.click(
	                fn=search_matching_cvs,
	                inputs=[jd_text, jd_file, num_results],
	                outputs=[search_output]
	            )

	            refresh_btn.click(
	                fn=get_system_status,
	                outputs=[status_display]
	            )

	            # The two inputs clear each other, so each handler must be a no-op
	            # (gr.update() with no arguments) when it is reacting to the other
	            # handler's clearing. Otherwise uploading a PDF while text is
	            # present clears the text, which in turn clears the upload again.

	            # Clear text input when a PDF is present (not when the file is removed)
	            jd_file.change(
	                fn=lambda uploaded: "" if uploaded is not None else gr.update(),
	                inputs=[jd_file],
	                outputs=[jd_text]
	            )

	            # Clear file input when non-blank text is entered (not when text is emptied)
	            jd_text.change(
	                fn=lambda text: None if text and text.strip() else gr.update(),
	                inputs=[jd_text],
	                outputs=[jd_file]
	            )

	            # Footer
	            gr.Markdown("""
	---
	# 🛠️ Technical Details

	- Vector Database: ChromaDB (rebuilt on each restart)
	- Embedding Model: SentenceTransformers all-MiniLM-L6-v2
	- Text Extraction: pdfplumber + OCR fallback for scanned documents
	- CV Source: Google Drive folder (automatically synced)
	- Search Algorithm: Cosine similarity with chunk aggregation

	## 📞 Support
	If no results appear, check that:
	- Your Google Drive folder is public
	- The folder contains PDF files
	- Your API key is valid and has Drive API access
	""", elem_classes=["markdown-content"])

	    return demo

	def main():
	    """Load the CV database, build the interface and start the web server."""

	    logger.info("Starting CV Semantic Search application...")

	    # Load the CVs before the UI is built; a failure is logged but the UI is
	    # still started so the status panel can report what went wrong.
	    database_ready = initialize_database()
	    if not database_ready:
	        logger.error("❌ Database initialization failed")
	    else:
	        logger.info("✅ Database initialization successful")

	    # Build the UI, then serve it
	    app = create_interface()
	    launch_options = {
	        # NOTE(review): requests a public share link; Gradio reportedly does
	        # not support this on Hugging Face Spaces - confirm it is still wanted.
	        "share": True,
	        # Listen on all interfaces so the app is reachable from outside a container
	        "server_name": "0.0.0.0",
	        # Fixed port; presumably the one Hugging Face Spaces expects - verify
	        "server_port": 7860,
	        "show_error": True,
	    }
	    app.launch(**launch_options)


	if __name__ == "__main__":
	    main()