Spaces:

rbbist
/

Semantic_Search_CVs

Sleeping

App Files Community

rbbist committed on Sep 1, 2025

Commit

5e17ee3

verified ·

1 Parent(s): 390b438

Update app.py

Browse files

Files changed (1) hide show

app.py +227 -170

app.py CHANGED Viewed

@@ -2,61 +2,46 @@ import gradio as gr
 import os
 from semantic_search import CVSemanticSearch
 import logging
-import PyPDF2
-import io
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Google Drive folder URL - UPDATE THIS WITH YOUR ACTUAL FOLDER URL
-GOOGLE_DRIVE_FOLDER_URL = "https://drive.google.com/drive/folders/1j1faOlXxoYfPLdzDfGvDbtkENsRoDxXN"
 # Global variable to store the search system
 cv_search = None
 def initialize_database():
     """
     Initialize the database by loading CVs from Google Drive folder
     This runs once when the space starts
     """
-    global cv_search
-    logger.info("Initializing CV Semantic Search system...")
-    cv_search = CVSemanticSearch()
-    logger.info("Loading CVs from Google Drive folder...")
-    successful, total = cv_search.load_cvs_from_google_drive(GOOGLE_DRIVE_FOLDER_URL)
-    if successful > 0:
-        logger.info(f"Successfully loaded {successful}/{total} CVs into database")
-        return f"✅ Database initialized with {successful}/{total} CVs"
-    else:
-        logger.error("Failed to load any CVs from Google Drive")
-        return "❌ Failed to load CVs from Google Drive. Check the folder URL and permissions."
-def extract_text_from_jd_file(file) -> str:
-    """
-    Extract text from uploaded JD PDF file
-    Args:
-        file: Gradio file object
-    Returns:
-        Extracted text
-    """
     try:
-        if file is None:
-            return ""
-        with open(file.name, 'rb') as f:
-            pdf_content = f.read()
-        return cv_search.extract_text_from_pdf_bytes(pdf_content)
     except Exception as e:
-        logger.error(f"Error extracting text from JD file: {str(e)}")
-        return ""
 def process_job_description(jd_text, jd_file):
     """
@@ -71,9 +56,15 @@ def process_job_description(jd_text, jd_file):
     """
     # Priority: PDF file over text input
     if jd_file is not None:
-        extracted_text = extract_text_from_jd_file(jd_file)
-        if extracted_text.strip():
-            return extracted_text.strip()
     # Fallback to text input
     if jd_text and jd_text.strip():
@@ -96,7 +87,7 @@ def search_matching_cvs(jd_text, jd_file, num_results):
     global cv_search
     if cv_search is None:
-        return "❌ System not initialized. Please refresh the page."
     # Process job description
     job_description = process_job_description(jd_text, jd_file)
@@ -108,41 +99,58 @@ def search_matching_cvs(jd_text, jd_file, num_results):
     db_info = cv_search.get_database_info()
     if db_info['unique_cvs'] == 0:
-        return "❌ No CVs in database. Please check the Google Drive folder configuration."
     # Perform search
     results = cv_search.search_cvs(job_description, top_k=num_results)
     if not results:
-        return "❌ No matching CVs found. Try adjusting your job description."
     # Format results
-    output = f"## 🎯 Top {len(results)} Matching CVs\n\n"
-    output += f"**Job Description Preview**: {job_description[:150]}{'...' if len(job_description) > 150 else ''}\n\n"
     for i, cv in enumerate(results, 1):
         similarity_percentage = cv['weighted_score'] * 100
-        # Determine match quality
         if similarity_percentage >= 80:
             match_quality = "🟢 Excellent Match"
         elif similarity_percentage >= 65:
             match_quality = "🟡 Good Match"
         elif similarity_percentage >= 50:
             match_quality = "🟠 Fair Match"
         else:
             match_quality = "🔴 Weak Match"
         output += f"""
-### {i}. {cv['filename']} - {match_quality}
-**Overall Score**: {similarity_percentage:.1f}%
-- **Best Match Score**: {cv['max_similarity']*100:.1f}%
-- **Average Score**: {cv['avg_similarity']*100:.1f}%
-- **Sections Analyzed**: {cv['chunk_count']} parts
-**Best Matching Content**:
-"{cv['best_match_text']}"
 ---
         """
@@ -156,32 +164,46 @@ def get_system_status():
     Returns:
         System information as formatted string
     """
-    global cv_search
     if cv_search is None:
-        return "❌ System not initialized"
     db_info = cv_search.get_database_info()
     if db_info['unique_cvs'] == 0:
-        return """
-        ⚠️ **System Status**: No CVs loaded
-        Please check:
-        - Google Drive folder URL is correct
-        - Folder is public and accessible
-        - Folder contains PDF files
         """
     return f"""
-    ✅ **System Status**: Ready
-    📊 **Database Info**:
-    - **Total CVs Loaded**: {db_info['unique_cvs']}
-    - **Total Chunks**: {db_info['total_chunks']}
-    - **Average Chunks per CV**: {db_info['total_chunks'] / db_info['unique_cvs']:.1f}
-    📁 **Loaded CVs**: {', '.join(db_info['cv_filenames'][:5])}{'...' if len(db_info['cv_filenames']) > 5 else ''}
     """
 # Create Gradio interface
@@ -192,94 +214,124 @@ def create_interface():
         title="CV Semantic Search - Auto-loaded from Google Drive",
         theme=gr.themes.Soft(),
         css="""
-        .container { max-width: 1000px; margin: auto; }
-        .search-section { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-                         color: white; padding: 25px; border-radius: 15px; margin: 15px 0; }
-        .status-section { background: #f8f9fa; padding: 20px; border-radius: 10px; margin: 15px 0;
-                         border-left: 5px solid #007bff; }
-        .results-section { background: #ffffff; padding: 20px; border-radius: 10px;
-                          border: 1px solid #dee2e6; margin: 15px 0; }
-        .header { text-align: center; padding: 20px; }
         """
     ) as demo:
-        gr.Markdown("""
-        <div class="header">
-        # 🚀 CV Semantic Search System
-        ### AI-Powered Resume Matching with Auto-loaded Database
-        *CVs are automatically loaded from Google Drive when the space starts*
-        </div>
-        """)
-        # System Status
-        with gr.Row():
-            status_display = gr.Markdown(
-                get_system_status(),
-                elem_classes=["status-section"]
-            )
-        # Main Search Interface
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown("## 📋 Enter Job Description", elem_classes=["search-section"])
-                with gr.Tab("📝 Text Input"):
-                    jd_text = gr.Textbox(
-                        label="Job Description",
-                        placeholder="""Enter your job description here...
 Example:
-We are looking for a Senior Software Engineer with:
-- 5+ years of experience in Python and JavaScript
-- Strong background in machine learning and data science
-- Experience with cloud platforms (AWS, GCP)
-- Knowledge of microservices architecture
-- Bachelor's degree in Computer Science or related field""",
-                        lines=8,
-                        max_lines=15
-                    )
-                with gr.Tab("📄 PDF Upload"):
-                    jd_file = gr.File(
-                        label="Upload Job Description PDF",
-                        file_types=[".pdf"],
-                        file_count="single"
-                    )
-                num_results = gr.Slider(
-                    label="Number of Top CVs to Return",
-                    minimum=1,
-                    maximum=10,
-                    value=5,
-                    step=1
-                )
-                search_btn = gr.Button(
-                    "🔍 Find Matching CVs",
-                    variant="primary",
-                    size="lg"
                 )
-        # Search Results
-        with gr.Row():
-            search_output = gr.Markdown(
-                """
-                ## 📋 Instructions:
-                1. **Enter Job Description**: Use text input or upload a PDF
-                2. **Click Search**: Find the best matching CVs from the database
-                3. **Review Results**: See ranked CVs with similarity scores
-                The system automatically analyzes semantic meaning, not just keywords!
-                """,
-                elem_classes=["results-section"]
-            )
-        # Refresh button for status
-        with gr.Row():
-            refresh_btn = gr.Button("🔄 Refresh Status", size="sm")
         # Event handlers
         search_btn.click(
@@ -293,33 +345,35 @@ We are looking for a Senior Software Engineer with:
             outputs=[status_display]
         )
-        # Clear inputs when switching tabs
         jd_file.change(
-            fn=lambda: "",  # Clear text when file is uploaded
             outputs=[jd_text]
         )
-        # Footer information
         gr.Markdown("""
         ---
-        ## ℹ️ System Information
-        - **Model**: Sentence Transformers (all-MiniLM-L6-v2)
-        - **Database**: ChromaDB (in-memory, rebuilt on restart)
-        - **CV Source**: Google Drive folder (auto-loaded)
-        - **Search Method**: Semantic similarity matching
-        ### 🎯 How It Works:
-        1. CVs are automatically downloaded from Google Drive and processed into text chunks
-        2. Each chunk is converted to a vector using AI embeddings
-        3. Your job description is compared against all CV chunks using semantic similarity
-        4. Results are ranked by relevance, not just keyword matching
-        ### 💡 Pro Tips:
-        - Be specific about required skills and experience
-        - Include both technical and soft skill requirements
-        - Mention specific tools, technologies, or frameworks
-        - The more detailed your JD, the better the matching accuracy
         """)
     return demo
@@ -327,10 +381,13 @@ We are looking for a Senior Software Engineer with:
 def main():
     """Main function to initialize and run the app"""
-    # Initialize database at startup
     logger.info("Starting CV Semantic Search application...")
-    init_status = initialize_database()
-    logger.info(f"Initialization result: {init_status}")
     # Create and launch interface
     demo = create_interface()

 import os
 from semantic_search import CVSemanticSearch
 import logging
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Google Drive Configuration - UPDATE THESE VALUES
+FOLDER_ID = "1j1faOlXxoYfPLdzDfGvDbtkENsRoDxXN"  # Replace with your folder ID
+API_KEY = os.getenv("YOUR_GOOGLE_DRIVE_API_KEY")  # Replace with your API key
 # Global variable to store the search system
 cv_search = None
+initialization_status = "Initializing..."
 def initialize_database():
     """
     Initialize the database by loading CVs from Google Drive folder
     This runs once when the space starts
     """
+    global cv_search, initialization_status
     try:
+        logger.info("Initializing CV Semantic Search system...")
+        cv_search = CVSemanticSearch()
+        logger.info("Loading CVs from Google Drive folder...")
+        successful, total = cv_search.load_cvs_from_google_drive(FOLDER_ID, API_KEY)
+        if successful > 0:
+            initialization_status = f"✅ Successfully loaded {successful}/{total} CVs into database"
+            logger.info(initialization_status)
+            return True
+        else:
+            initialization_status = "❌ Failed to load any CVs from Google Drive. Check API key and folder ID."
+            logger.error(initialization_status)
+            return False
     except Exception as e:
+        initialization_status = f"❌ Error during initialization: {str(e)}"
+        logger.error(initialization_status)
+        return False
 def process_job_description(jd_text, jd_file):
     """
     """
     # Priority: PDF file over text input
     if jd_file is not None:
+        try:
+            with open(jd_file.name, 'rb') as f:
+                pdf_content = f.read()
+            extracted_text = cv_search.extract_text_from_pdf_bytes(pdf_content)
+            if extracted_text.strip():
+                return extracted_text.strip()
+        except Exception as e:
+            logger.error(f"Error processing JD PDF: {str(e)}")
     # Fallback to text input
     if jd_text and jd_text.strip():
     global cv_search
     if cv_search is None:
+        return f"❌ System not initialized properly.\n\n{initialization_status}\n\nPlease refresh the page or check the configuration."
     # Process job description
     job_description = process_job_description(jd_text, jd_file)
     db_info = cv_search.get_database_info()
     if db_info['unique_cvs'] == 0:
+        return f"❌ No CVs in database.\n\n{initialization_status}"
     # Perform search
     results = cv_search.search_cvs(job_description, top_k=num_results)
     if not results:
+        return "❌ No matching CVs found. Try using different keywords or requirements in your job description."
     # Format results
+    jd_preview = job_description[:150] + "..." if len(job_description) > 150 else job_description
+    output = f"""## 🎯 Top {len(results)} Matching CVs
+**Job Description**: {jd_preview}
+**Search Results**:
+"""
     for i, cv in enumerate(results, 1):
         similarity_percentage = cv['weighted_score'] * 100
+        # Determine match quality and emoji
         if similarity_percentage >= 80:
             match_quality = "🟢 Excellent Match"
+            quality_color = "#28a745"
         elif similarity_percentage >= 65:
             match_quality = "🟡 Good Match"
+            quality_color = "#ffc107"
         elif similarity_percentage >= 50:
             match_quality = "🟠 Fair Match"
+            quality_color = "#fd7e14"
         else:
             match_quality = "🔴 Weak Match"
+            quality_color = "#dc3545"
         output += f"""
+### {i}. **{cv['filename']}**
+<div style="background: linear-gradient(90deg, {quality_color}22, transparent); padding: 15px; border-radius: 8px; border-left: 4px solid {quality_color};">
+**{match_quality}** - **{similarity_percentage:.1f}% Overall Match**
+📊 **Detailed Scores:**
+- Best Section Match: {cv['max_similarity']*100:.1f}%
+- Average Match: {cv['avg_similarity']*100:.1f}%
+- CV Sections Analyzed: {cv['chunk_count']}
+💡 **Why This CV Matches:**
+*"{cv['best_match_text']}"*
+</div>
 ---
         """
     Returns:
         System information as formatted string
     """
+    global cv_search, initialization_status
     if cv_search is None:
+        return f"""
+        ## ⚠️ System Status: Not Ready
+        {initialization_status}
+        **Possible Issues:**
+        - Invalid Google Drive API key
+        - Incorrect folder ID
+        - Folder is not public
+        - No PDF files in the folder
+        """
     db_info = cv_search.get_database_info()
     if db_info['unique_cvs'] == 0:
+        return f"""
+        ## ⚠️ System Status: No CVs Loaded
+        {initialization_status}
+        **Please Check:**
+        - Google Drive folder contains PDF files
+        - Folder is publicly accessible
+        - API key has proper permissions
         """
     return f"""
+    ## ✅ System Status: Ready for Search
+    📊 **Database Statistics:**
+    - **CVs Loaded**: {db_info['unique_cvs']} resumes
+    - **Text Chunks**: {db_info['total_chunks']} searchable segments
+    - **Avg Chunks per CV**: {db_info['total_chunks'] / db_info['unique_cvs']:.1f}
+    🤖 **AI Model**: Sentence Transformers (all-MiniLM-L6-v2)
+    📁 **Sample CVs**: {', '.join(db_info['cv_filenames'][:3])}{'...' if len(db_info['cv_filenames']) > 3 else ''}
     """
 # Create Gradio interface
         title="CV Semantic Search - Auto-loaded from Google Drive",
         theme=gr.themes.Soft(),
         css="""
+        .main-container { max-width: 1200px; margin: auto; padding: 20px; }
+        .search-container {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white; padding: 30px; border-radius: 20px; margin: 20px 0;
+            box-shadow: 0 10px 30px rgba(0,0,0,0.2);
+        }
+        .status-container {
+            background: #f8f9fa; padding: 25px; border-radius: 15px; margin: 20px 0;
+            border-left: 5px solid #007bff; box-shadow: 0 5px 15px rgba(0,0,0,0.1);
+        }
+        .results-container {
+            background: #ffffff; padding: 25px; border-radius: 15px;
+            border: 1px solid #dee2e6; margin: 20px 0; box-shadow: 0 5px 15px rgba(0,0,0,0.1);
+        }
+        .header { text-align: center; padding: 30px; background: linear-gradient(135deg, #74b9ff, #0984e3);
+                 color: white; margin: -20px -20px 20px -20px; border-radius: 15px 15px 0 0; }
+        .tab-content { padding: 15px; }
         """
     ) as demo:
+        with gr.Column(elem_classes=["main-container"]):
+            gr.Markdown("""
+            <div class="header">
+            # 🚀 CV Semantic Search System
+            ## AI-Powered Resume Matching
+            ### *Automatically synced with Google Drive*
+            </div>
+            """)
+            # System Status Display
+            with gr.Row():
+                status_display = gr.Markdown(
+                    get_system_status(),
+                    elem_classes=["status-container"]
+                )
+            # Main Search Interface
+            with gr.Row():
+                with gr.Column():
+                    with gr.Group(elem_classes=["search-container"]):
+                        gr.Markdown("## 📋 Job Description Input")
+                        with gr.Tab("📝 Text Input") as text_tab:
+                            jd_text = gr.Textbox(
+                                label="Paste Job Description",
+                                placeholder="""Paste your job description here...
 Example:
+Senior Software Engineer Position
+Requirements:
+• 5+ years of experience in Python, JavaScript, and React
+• Strong background in machine learning and AI
+• Experience with cloud platforms (AWS, Azure, GCP)
+• Knowledge of microservices and API development
+• Bachelor's degree in Computer Science or related field
+• Excellent problem-solving and communication skills
+Responsibilities:
+• Design and develop scalable web applications
+• Lead technical projects and mentor junior developers
+• Collaborate with cross-functional teams
+• Implement best practices for code quality and testing""",
+                                lines=12,
+                                max_lines=20,
+                                elem_classes=["tab-content"]
+                            )
+                        with gr.Tab("📄 PDF Upload") as pdf_tab:
+                            jd_file = gr.File(
+                                label="Upload Job Description PDF",
+                                file_types=[".pdf"],
+                                file_count="single",
+                                elem_classes=["tab-content"]
+                            )
+                        with gr.Row():
+                            num_results = gr.Slider(
+                                label="Number of Top CVs to Return",
+                                minimum=1,
+                                maximum=10,
+                                value=5,
+                                step=1
+                            )
+                        search_btn = gr.Button(
+                            "🔍 Find Best Matching CVs",
+                            variant="primary",
+                            size="lg"
+                        )
+            # Search Results
+            with gr.Row():
+                search_output = gr.Markdown(
+                    """
+                    ## 📋 How to Use This System:
+                    1. **Enter Job Requirements**: Use the text box or upload a PDF with your job description
+                    2. **Click Search**: The AI will analyze semantic meaning and find the best matches
+                    3. **Review Results**: See ranked CVs with detailed similarity scores and explanations
+                    ### 🎯 What Makes This Special:
+                    - **Semantic Understanding**: Finds relevant CVs even if they don't use exact keywords
+                    - **Automatic Sync**: CVs are always up-to-date from your Google Drive folder
+                    - **Smart Ranking**: Combines multiple similarity metrics for accurate results
+                    - **Detailed Analysis**: Shows why each CV matches your requirements
+                    *Enter a job description above to get started!*
+                    """,
+                    elem_classes=["results-container"]
                 )
+            # Refresh Status Button
+            with gr.Row():
+                refresh_btn = gr.Button("🔄 Refresh System Status", size="sm")
         # Event handlers
         search_btn.click(
             outputs=[status_display]
         )
+        # Clear text input when PDF is uploaded
         jd_file.change(
+            fn=lambda: "",
             outputs=[jd_text]
         )
+        # Clear file input when text is entered
+        jd_text.change(
+            fn=lambda x: None if x.strip() else None,
+            inputs=[jd_text],
+            outputs=[jd_file]
+        )
+        # Footer
         gr.Markdown("""
         ---
+        ## 🛠️ Technical Details
+        - **Vector Database**: ChromaDB (rebuilt on each restart)
+        - **Embedding Model**: SentenceTransformers all-MiniLM-L6-v2
+        - **Text Extraction**: pdfplumber + OCR fallback for scanned documents
+        - **CV Source**: Google Drive folder (automatically synced)
+        - **Search Algorithm**: Cosine similarity with chunk aggregation
+        ### 📞 Support
+        If no results appear, check that:
+        - Your Google Drive folder is public
+        - The folder contains PDF files
+        - Your API key is valid and has Drive API access
         """)
     return demo
 def main():
     """Main function to initialize and run the app"""
     logger.info("Starting CV Semantic Search application...")
+    # Initialize database at startup
+    if initialize_database():
+        logger.info("✅ Database initialization successful")
+    else:
+        logger.error("❌ Database initialization failed")
     # Create and launch interface
     demo = create_interface()