Spaces:

rbbist
/

Semantic_Search_CVs

Sleeping

App Files Files Community

rbbist committed on Aug 29, 2025

Commit

c37fd2f

verified ·

1 Parent(s): 8c7a8ea

Update app.py

Browse files

Files changed (1) hide show

app.py +230 -165

app.py CHANGED Viewed

@@ -2,279 +2,344 @@ import gradio as gr
 import os
 from semantic_search import CVSemanticSearch
 import logging
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Initialize the semantic search system
-cv_search = CVSemanticSearch()
-def upload_cvs(files):
     """
-    Handle CV uploads from Gradio
     Args:
-        files: List of uploaded files from Gradio
     Returns:
-        Status message
     """
-    if not files:
-        return "No files uploaded."
-    successful = 0
-    total = len(files)
-    for file in files:
-        try:
-            # Read file content
-            with open(file.name, 'rb') as f:
-                file_content = f.read()
-            # Get filename from path
-            filename = os.path.basename(file.name)
-            # Add to database
-            if cv_search.add_cv_to_database(file_content, filename):
-                successful += 1
-                logger.info(f"Successfully uploaded: {filename}")
-            else:
-                logger.error(f"Failed to upload: {filename}")
-        except Exception as e:
-            logger.error(f"Error processing file {file.name}: {str(e)}")
-    db_info = cv_search.get_database_info()
-    return f"""
-    Upload Complete!
-    ✅ Successfully processed: {successful}/{total} files
-    📊 Database now contains: {db_info['unique_cvs']} CVs ({db_info['total_chunks']} chunks)
-    CVs in database: {', '.join(db_info['cv_filenames'])}
-    """
-def search_matching_cvs(job_description, num_results):
     """
     Search for CVs matching the job description
     Args:
-        job_description: Job description text
         num_results: Number of results to return
     Returns:
         Formatted search results
     """
-    if not job_description.strip():
-        return "Please enter a job description."
     # Get database info
     db_info = cv_search.get_database_info()
     if db_info['unique_cvs'] == 0:
-        return "No CVs in database. Please upload some CV PDFs first."
     # Perform search
     results = cv_search.search_cvs(job_description, top_k=num_results)
     if not results:
-        return "No matching CVs found."
     # Format results
-    output = f"🎯 **Top {len(results)} Matching CVs:**\n\n"
     for i, cv in enumerate(results, 1):
         similarity_percentage = cv['weighted_score'] * 100
         output += f"""
-**{i}. {cv['filename']}**
-- **Match Score**: {similarity_percentage:.1f}%
-- **Max Similarity**: {cv['max_similarity']*100:.1f}%
-- **Avg Similarity**: {cv['avg_similarity']*100:.1f}%
-- **Chunks Analyzed**: {cv['chunk_count']}
-- **Best Match Preview**: {cv['best_match_text']}
 ---
         """
     return output
-def get_database_status():
     """
-    Get current database status
     Returns:
-        Database information as formatted string
     """
     db_info = cv_search.get_database_info()
     if db_info['unique_cvs'] == 0:
-        return "📁 Database is empty. Upload some CV PDFs to get started!"
     return f"""
-    📊 **Database Status:**
-    - **Total CVs**: {db_info['unique_cvs']}
     - **Total Chunks**: {db_info['total_chunks']}
-    - **CVs in Database**: {', '.join(db_info['cv_filenames'])}
-    """
-def clear_database():
-    """
-    Clear the entire database
-    Returns:
-        Status message
     """
-    if cv_search.clear_database():
-        return "🗑️ Database cleared successfully!"
-    else:
-        return "❌ Error clearing database."
 # Create Gradio interface
 def create_interface():
     """Create and return the Gradio interface"""
     with gr.Blocks(
-        title="CV Semantic Search",
         theme=gr.themes.Soft(),
         css="""
-        .container { max-width: 1200px; margin: auto; }
-        .upload-section { background: #f8f9fa; padding: 20px; border-radius: 10px; margin: 10px 0; }
-        .search-section { background: #e8f5e8; padding: 20px; border-radius: 10px; margin: 10px 0; }
-        .status-section { background: #fff3cd; padding: 15px; border-radius: 8px; margin: 10px 0; }
         """
     ) as demo:
         gr.Markdown("""
-        # 🔍 CV Semantic Search System
-        Upload CV PDFs and search for the best matches based on job descriptions using AI-powered semantic search.
         """)
         with gr.Row():
-            with gr.Column(scale=1):
-                # Upload Section
-                with gr.Group():
-                    gr.Markdown("## 📁 Upload CVs")
-                    cv_files = gr.File(
-                        label="Upload CV PDFs",
-                        file_count="multiple",
-                        file_types=[".pdf"],
-                        elem_classes=["upload-section"]
-                    )
-                    upload_btn = gr.Button(
-                        "Upload CVs to Database",
-                        variant="primary",
-                        size="lg"
-                    )
-                    upload_output = gr.Markdown(
-                        "Upload CVs to build your searchable database.",
-                        elem_classes=["status-section"]
-                    )
-            with gr.Column(scale=1):
-                # Search Section
-                with gr.Group():
-                    gr.Markdown("## 🎯 Search CVs")
-                    job_description = gr.Textbox(
                         label="Job Description",
-                        placeholder="Enter the job description here...\n\nExample: Looking for a senior software engineer with 5+ years experience in Python, React, and cloud technologies. Strong background in microservices and API development required.",
-                        lines=6,
-                        elem_classes=["search-section"]
-                    )
-                    num_results = gr.Slider(
-                        label="Number of Results",
-                        minimum=1,
-                        maximum=10,
-                        value=5,
-                        step=1
                     )
-                    search_btn = gr.Button(
-                        "Search Matching CVs",
-                        variant="secondary",
-                        size="lg"
                     )
         # Search Results
         with gr.Row():
             search_output = gr.Markdown(
-                "Enter a job description and click search to find matching CVs.",
-                elem_classes=["search-section"]
             )
-        # Database Management
         with gr.Row():
-            with gr.Column(scale=2):
-                status_output = gr.Markdown(
-                    get_database_status(),
-                    elem_classes=["status-section"]
-                )
-            with gr.Column(scale=1):
-                with gr.Group():
-                    refresh_btn = gr.Button("🔄 Refresh Status", size="sm")
-                    clear_btn = gr.Button("🗑️ Clear Database", size="sm", variant="stop")
         # Event handlers
-        upload_btn.click(
-            fn=upload_cvs,
-            inputs=[cv_files],
-            outputs=[upload_output]
-        ).then(
-            fn=get_database_status,
-            outputs=[status_output]
-        )
         search_btn.click(
             fn=search_matching_cvs,
-            inputs=[job_description, num_results],
             outputs=[search_output]
         )
         refresh_btn.click(
-            fn=get_database_status,
-            outputs=[status_output]
         )
-        clear_btn.click(
-            fn=clear_database,
-            outputs=[status_output]
-        ).then(
-            fn=get_database_status,
-            outputs=[status_output]
         )
-        # Example usage
         gr.Markdown("""
-        ## 📝 How to Use:
-        1. **Upload CVs**: Use the file upload component to add multiple PDF CVs to the database
-        2. **Enter Job Description**: Paste or type the job requirements you want to match
-        3. **Search**: Click search to find the top matching CVs based on semantic similarity
-        4. **Review Results**: See ranked CVs with similarity scores and preview text
-        ### 💡 Tips for Better Results:
-        - Include specific skills, technologies, and requirements in your job description
-        - The more detailed your job description, the better the matching accuracy
-        - The system analyzes semantic meaning, not just keyword matching
-        - Upload multiple CVs for better comparison and ranking
         """)
     return demo
-# Main function to run the app
-if __name__ == "__main__":
     demo = create_interface()
     demo.launch(
         share=True,  # Enable sharing for Hugging Face Spaces
         server_name="0.0.0.0",  # Enable access from outside container
         server_port=7860,  # Standard port for Hugging Face Spaces
         show_error=True
-    )

 import os
 from semantic_search import CVSemanticSearch
 import logging
+import PyPDF2
+import io
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Google Drive folder URL - UPDATE THIS WITH YOUR ACTUAL FOLDER URL
+# NOTE(review): the value below is still a placeholder ("XXXX..."); it is passed
+# unchanged to cv_search.load_cvs_from_google_drive() by initialize_database().
+GOOGLE_DRIVE_FOLDER_URL = "https://drive.google.com/drive/folders/XXXXXXXXXXXXXXXXX?usp=sharing"
+# Global variable to store the search system
+# Stays None until initialize_database() assigns a CVSemanticSearch instance;
+# search_matching_cvs() and get_system_status() check for None before using it.
+cv_search = None
+def initialize_database():
+    """
+    Initialize the database by loading CVs from Google Drive folder
+    This runs once when the space starts
+    Rebinds the module-level cv_search to a new CVSemanticSearch instance and
+    asks it to load the folder at GOOGLE_DRIVE_FOLDER_URL.
+    Returns:
+        Status message: a success line with the loaded/total counts when at
+        least one CV was loaded, otherwise a failure line
     """
+    global cv_search
+    logger.info("Initializing CV Semantic Search system...")
+    cv_search = CVSemanticSearch()
+    logger.info("Loading CVs from Google Drive folder...")
+    # The loader's result is unpacked as a (successful, total) pair.
+    # NOTE(review): nothing here catches exceptions - presumably a failure inside
+    # the loader propagates to the caller and aborts startup; confirm intended.
+    successful, total = cv_search.load_cvs_from_google_drive(GOOGLE_DRIVE_FOLDER_URL)
+    # A partial load (0 < successful < total) is still reported as success.
+    if successful > 0:
+        logger.info(f"Successfully loaded {successful}/{total} CVs into database")
+        return f"✅ Database initialized with {successful}/{total} CVs"
+    else:
+        logger.error("Failed to load any CVs from Google Drive")
+        return "❌ Failed to load CVs from Google Drive. Check the folder URL and permissions."
+def extract_text_from_jd_file(file) -> str:
+    """
+    Extract text from uploaded JD PDF file
+    The file is read as raw bytes and handed to
+    cv_search.extract_text_from_pdf_bytes(); this function does no parsing itself.
     Args:
+        file: Gradio file object
+            (its .name attribute is opened as a filesystem path; None is allowed)
     Returns:
+        Extracted text
+        ("" when file is None or when any exception is raised; the exception
+        is logged, not re-raised)
     """
+    try:
+        if file is None:
+            return ""
+        with open(file.name, 'rb') as f:
+            pdf_content = f.read()
+        # If cv_search is still None, this attribute access raises and is
+        # handled by the except below, so the caller just receives "".
+        return cv_search.extract_text_from_pdf_bytes(pdf_content)
+    except Exception as e:
+        # Deliberately broad: every failure is reduced to "no text extracted".
+        logger.error(f"Error extracting text from JD file: {str(e)}")
+        return ""
+def process_job_description(jd_text, jd_file):
+    """
+    Process job description from either text input or PDF file
+    Args:
+        jd_text: Job description as text
+        jd_file: Job description as PDF file
+    Returns:
+        Processed job description text
+        (stripped of surrounding whitespace; "" when neither source provides
+        any non-blank text)
+    """
+    # Priority: PDF file over text input
+    if jd_file is not None:
+        extracted_text = extract_text_from_jd_file(jd_file)
+        # A PDF that yields only blank text (including an extraction failure,
+        # which comes back as "") does not return here; the text input below
+        # is tried instead.
+        if extracted_text.strip():
+            return extracted_text.strip()
+    # Fallback to text input
+    if jd_text and jd_text.strip():
+        return jd_text.strip()
+    return ""
+def search_matching_cvs(jd_text, jd_file, num_results):
     """
     Search for CVs matching the job description
     Args:
+        jd_text: Job description as text
+        jd_file: Job description as PDF file
         num_results: Number of results to return
     Returns:
         Formatted search results
+        (Markdown text; each early-exit case below returns a single
+        "❌ ..." message instead of a result list)
     """
+    # cv_search is only read in this function, never reassigned.
+    global cv_search
+    if cv_search is None:
+        return "❌ System not initialized. Please refresh the page."
+    # Process job description
+    # (an uploaded PDF takes priority over the text box; see process_job_description)
+    job_description = process_job_description(jd_text, jd_file)
+    if not job_description:
+        return "❌ Please provide a job description either as text or upload a PDF file."
     # Get database info
     db_info = cv_search.get_database_info()
     if db_info['unique_cvs'] == 0:
+        return "❌ No CVs in database. Please check the Google Drive folder configuration."
     # Perform search
+    # num_results is forwarded unchanged as top_k.
     results = cv_search.search_cvs(job_description, top_k=num_results)
     if not results:
+        return "❌ No matching CVs found. Try adjusting your job description."
     # Format results
+    # The header echoes at most the first 150 characters of the job description,
+    # followed by "..." when it was longer.
+    output = f"## 🎯 Top {len(results)} Matching CVs\n\n"
+    output += f"**Job Description Preview**: {job_description[:150]}{'...' if len(job_description) > 150 else ''}\n\n"
     for i, cv in enumerate(results, 1):
+        # NOTE(review): assumes weighted_score is a 0-1 fraction so that x100 is
+        # a percentage - confirm against semantic_search.
         similarity_percentage = cv['weighted_score'] * 100
+        # Determine match quality
+        # Buckets on the scaled score: >=80 excellent, >=65 good, >=50 fair, else weak.
+        if similarity_percentage >= 80:
+            match_quality = "🟢 Excellent Match"
+        elif similarity_percentage >= 65:
+            match_quality = "🟡 Good Match"
+        elif similarity_percentage >= 50:
+            match_quality = "🟠 Fair Match"
+        else:
+            match_quality = "🔴 Weak Match"
         output += f"""
+### {i}. {cv['filename']} - {match_quality}
+**Overall Score**: {similarity_percentage:.1f}%
+- **Best Match Score**: {cv['max_similarity']*100:.1f}%
+- **Average Score**: {cv['avg_similarity']*100:.1f}%
+- **Sections Analyzed**: {cv['chunk_count']} parts
+**Best Matching Content**:
+"{cv['best_match_text']}"
 ---
         """
     return output
+def get_system_status():
     """
+    Get current system status
+    Reports one of three states: not initialized, initialized with no CVs,
+    or ready with database counts.
     Returns:
+        System information as formatted string
     """
+    # cv_search is only read in this function, never reassigned.
+    global cv_search
+    if cv_search is None:
+        return "❌ System not initialized"
     db_info = cv_search.get_database_info()
     if db_info['unique_cvs'] == 0:
+        return """
+        ⚠️ **System Status**: No CVs loaded
+        Please check:
+        - Google Drive folder URL is correct
+        - Folder is public and accessible
+        - Folder contains PDF files
+        """
+    # unique_cvs is non-zero from here on (the zero case returned above), so the
+    # per-CV average below cannot divide by zero. Only the first five filenames
+    # are listed, with "..." appended when there are more.
     return f"""
+    ✅ **System Status**: Ready
+    📊 **Database Info**:
+    - **Total CVs Loaded**: {db_info['unique_cvs']}
     - **Total Chunks**: {db_info['total_chunks']}
+    - **Average Chunks per CV**: {db_info['total_chunks'] / db_info['unique_cvs']:.1f}
+    📁 **Loaded CVs**: {', '.join(db_info['cv_filenames'][:5])}{'...' if len(db_info['cv_filenames']) > 5 else ''}
     """
 # Create Gradio interface
 def create_interface():
     """Create and return the Gradio interface"""
+    # Layout, top to bottom: header, system status, job-description inputs
+    # (text tab / PDF tab, result-count slider, search button), results panel,
+    # status refresh button, footer. Nothing is launched here; the Blocks
+    # object is only built and returned.
     with gr.Blocks(
+        title="CV Semantic Search - Auto-loaded from Google Drive",
         theme=gr.themes.Soft(),
         css="""
+        .container { max-width: 1000px; margin: auto; }
+        .search-section { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+                         color: white; padding: 25px; border-radius: 15px; margin: 15px 0; }
+        .status-section { background: #f8f9fa; padding: 20px; border-radius: 10px; margin: 15px 0;
+                         border-left: 5px solid #007bff; }
+        .results-section { background: #ffffff; padding: 20px; border-radius: 10px;
+                          border: 1px solid #dee2e6; margin: 15px 0; }
+        .header { text-align: center; padding: 20px; }
         """
     ) as demo:
         gr.Markdown("""
+        <div class="header">
+        # 🚀 CV Semantic Search System
+        ### AI-Powered Resume Matching with Auto-loaded Database
+        *CVs are automatically loaded from Google Drive when the space starts*
+        </div>
         """)
+        # System Status
         with gr.Row():
+            # get_system_status() is called once, while the interface is being
+            # built, so this shows the state at build time; the refresh button
+            # wired below is what updates it afterwards.
+            status_display = gr.Markdown(
+                get_system_status(),
+                elem_classes=["status-section"]
+            )
+        # Main Search Interface
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("## 📋 Enter Job Description", elem_classes=["search-section"])
+                with gr.Tab("📝 Text Input"):
+                    jd_text = gr.Textbox(
                         label="Job Description",
+                        placeholder="""Enter your job description here...
+Example:
+We are looking for a Senior Software Engineer with:
+- 5+ years of experience in Python and JavaScript
+- Strong background in machine learning and data science
+- Experience with cloud platforms (AWS, GCP)
+- Knowledge of microservices architecture
+- Bachelor's degree in Computer Science or related field""",
+                        lines=8,
+                        max_lines=15
                     )
+                with gr.Tab("📄 PDF Upload"):
+                    jd_file = gr.File(
+                        label="Upload Job Description PDF",
+                        file_types=[".pdf"],
+                        file_count="single"
                     )
+                num_results = gr.Slider(
+                    label="Number of Top CVs to Return",
+                    minimum=1,
+                    maximum=10,
+                    value=5,
+                    step=1
+                )
+                search_btn = gr.Button(
+                    "🔍 Find Matching CVs",
+                    variant="primary",
+                    size="lg"
+                )
         # Search Results
         with gr.Row():
             search_output = gr.Markdown(
+                """
+                ## 📋 Instructions:
+                1. **Enter Job Description**: Use text input or upload a PDF
+                2. **Click Search**: Find the best matching CVs from the database
+                3. **Review Results**: See ranked CVs with similarity scores
+                The system automatically analyzes semantic meaning, not just keywords!
+                """,
+                elem_classes=["results-section"]
             )
+        # Refresh button for status
         with gr.Row():
+            refresh_btn = gr.Button("🔄 Refresh Status", size="sm")
         # Event handlers
         search_btn.click(
             fn=search_matching_cvs,
+            inputs=[jd_text, jd_file, num_results],
             outputs=[search_output]
         )
         refresh_btn.click(
+            fn=get_system_status,
+            outputs=[status_display]
         )
+        # Clear inputs when switching tabs
+        # NOTE(review): the handler is bound to the file component's change
+        # event, not to tab selection, and it only clears the text box (the
+        # file input is never cleared). Presumably it also fires when the file
+        # is removed, wiping any typed text - confirm against Gradio's
+        # File.change semantics.
+        jd_file.change(
+            fn=lambda: "",  # Clear text when file is uploaded
+            outputs=[jd_text]
         )
+        # Footer information
         gr.Markdown("""
+        ---
+        ## ℹ️ System Information
+        - **Model**: Sentence Transformers (all-MiniLM-L6-v2)
+        - **Database**: ChromaDB (in-memory, rebuilt on restart)
+        - **CV Source**: Google Drive folder (auto-loaded)
+        - **Search Method**: Semantic similarity matching
+        ### 🎯 How It Works:
+        1. CVs are automatically downloaded from Google Drive and processed into text chunks
+        2. Each chunk is converted to a vector using AI embeddings
+        3. Your job description is compared against all CV chunks using semantic similarity
+        4. Results are ranked by relevance, not just keyword matching
+        ### 💡 Pro Tips:
+        - Be specific about required skills and experience
+        - Include both technical and soft skill requirements
+        - Mention specific tools, technologies, or frameworks
+        - The more detailed your JD, the better the matching accuracy
         """)
     return demo
+def main():
+    """Main function to initialize and run the app"""
+    # Initialize database at startup
+    # Order matters: create_interface() calls get_system_status() while building
+    # the UI, and that reads the cv_search global that initialize_database() sets.
+    logger.info("Starting CV Semantic Search application...")
+    init_status = initialize_database()
+    logger.info(f"Initialization result: {init_status}")
+    # Create and launch interface
+    # A failed load is only logged above; the interface is still launched.
     demo = create_interface()
     demo.launch(
+        # NOTE(review): confirm share=True has any effect when hosted on
+        # Hugging Face Spaces; check the Gradio launch() documentation.
         share=True,  # Enable sharing for Hugging Face Spaces
         server_name="0.0.0.0",  # Enable access from outside container
         server_port=7860,  # Standard port for Hugging Face Spaces
         show_error=True
+    )
+# Run the app only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()