rbbist committed on
Commit
5e17ee3
·
verified ·
1 Parent(s): 390b438

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -170
app.py CHANGED
@@ -2,61 +2,46 @@ import gradio as gr
2
  import os
3
  from semantic_search import CVSemanticSearch
4
  import logging
5
- import PyPDF2
6
- import io
7
 
8
  # Set up logging
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
- # Google Drive folder URL - UPDATE THIS WITH YOUR ACTUAL FOLDER URL
13
- GOOGLE_DRIVE_FOLDER_URL = "https://drive.google.com/drive/folders/1j1faOlXxoYfPLdzDfGvDbtkENsRoDxXN"
 
14
 
15
  # Global variable to store the search system
16
  cv_search = None
 
17
 
18
  def initialize_database():
19
  """
20
  Initialize the database by loading CVs from Google Drive folder
21
  This runs once when the space starts
22
  """
23
- global cv_search
24
-
25
- logger.info("Initializing CV Semantic Search system...")
26
- cv_search = CVSemanticSearch()
27
-
28
- logger.info("Loading CVs from Google Drive folder...")
29
- successful, total = cv_search.load_cvs_from_google_drive(GOOGLE_DRIVE_FOLDER_URL)
30
-
31
- if successful > 0:
32
- logger.info(f"Successfully loaded {successful}/{total} CVs into database")
33
- return f"✅ Database initialized with {successful}/{total} CVs"
34
- else:
35
- logger.error("Failed to load any CVs from Google Drive")
36
- return "❌ Failed to load CVs from Google Drive. Check the folder URL and permissions."
37
-
38
- def extract_text_from_jd_file(file) -> str:
39
- """
40
- Extract text from uploaded JD PDF file
41
 
42
- Args:
43
- file: Gradio file object
44
-
45
- Returns:
46
- Extracted text
47
- """
48
  try:
49
- if file is None:
50
- return ""
51
-
52
- with open(file.name, 'rb') as f:
53
- pdf_content = f.read()
54
 
55
- return cv_search.extract_text_from_pdf_bytes(pdf_content)
 
56
 
 
 
 
 
 
 
 
 
 
57
  except Exception as e:
58
- logger.error(f"Error extracting text from JD file: {str(e)}")
59
- return ""
 
60
 
61
  def process_job_description(jd_text, jd_file):
62
  """
@@ -71,9 +56,15 @@ def process_job_description(jd_text, jd_file):
71
  """
72
  # Priority: PDF file over text input
73
  if jd_file is not None:
74
- extracted_text = extract_text_from_jd_file(jd_file)
75
- if extracted_text.strip():
76
- return extracted_text.strip()
 
 
 
 
 
 
77
 
78
  # Fallback to text input
79
  if jd_text and jd_text.strip():
@@ -96,7 +87,7 @@ def search_matching_cvs(jd_text, jd_file, num_results):
96
  global cv_search
97
 
98
  if cv_search is None:
99
- return "❌ System not initialized. Please refresh the page."
100
 
101
  # Process job description
102
  job_description = process_job_description(jd_text, jd_file)
@@ -108,41 +99,58 @@ def search_matching_cvs(jd_text, jd_file, num_results):
108
  db_info = cv_search.get_database_info()
109
 
110
  if db_info['unique_cvs'] == 0:
111
- return "❌ No CVs in database. Please check the Google Drive folder configuration."
112
 
113
  # Perform search
114
  results = cv_search.search_cvs(job_description, top_k=num_results)
115
 
116
  if not results:
117
- return "❌ No matching CVs found. Try adjusting your job description."
118
 
119
  # Format results
120
- output = f"## 🎯 Top {len(results)} Matching CVs\n\n"
121
- output += f"**Job Description Preview**: {job_description[:150]}{'...' if len(job_description) > 150 else ''}\n\n"
 
 
 
 
 
 
 
122
 
123
  for i, cv in enumerate(results, 1):
124
  similarity_percentage = cv['weighted_score'] * 100
125
 
126
- # Determine match quality
127
  if similarity_percentage >= 80:
128
  match_quality = "🟢 Excellent Match"
 
129
  elif similarity_percentage >= 65:
130
  match_quality = "🟡 Good Match"
 
131
  elif similarity_percentage >= 50:
132
  match_quality = "🟠 Fair Match"
 
133
  else:
134
  match_quality = "🔴 Weak Match"
 
135
 
136
  output += f"""
137
- ### {i}. {cv['filename']} - {match_quality}
 
 
138
 
139
- **Overall Score**: {similarity_percentage:.1f}%
140
- - **Best Match Score**: {cv['max_similarity']*100:.1f}%
141
- - **Average Score**: {cv['avg_similarity']*100:.1f}%
142
- - **Sections Analyzed**: {cv['chunk_count']} parts
143
 
144
- **Best Matching Content**:
145
- "{cv['best_match_text']}"
 
 
 
 
 
 
 
146
 
147
  ---
148
  """
@@ -156,32 +164,46 @@ def get_system_status():
156
  Returns:
157
  System information as formatted string
158
  """
159
- global cv_search
160
 
161
  if cv_search is None:
162
- return "❌ System not initialized"
 
 
 
 
 
 
 
 
 
 
163
 
164
  db_info = cv_search.get_database_info()
165
 
166
  if db_info['unique_cvs'] == 0:
167
- return """
168
- ⚠️ **System Status**: No CVs loaded
 
 
169
 
170
- Please check:
171
- - Google Drive folder URL is correct
172
- - Folder is public and accessible
173
- - Folder contains PDF files
174
  """
175
 
176
  return f"""
177
- ✅ **System Status**: Ready
 
 
 
 
 
178
 
179
- 📊 **Database Info**:
180
- - **Total CVs Loaded**: {db_info['unique_cvs']}
181
- - **Total Chunks**: {db_info['total_chunks']}
182
- - **Average Chunks per CV**: {db_info['total_chunks'] / db_info['unique_cvs']:.1f}
183
 
184
- 📁 **Loaded CVs**: {', '.join(db_info['cv_filenames'][:5])}{'...' if len(db_info['cv_filenames']) > 5 else ''}
185
  """
186
 
187
  # Create Gradio interface
@@ -192,94 +214,124 @@ def create_interface():
192
  title="CV Semantic Search - Auto-loaded from Google Drive",
193
  theme=gr.themes.Soft(),
194
  css="""
195
- .container { max-width: 1000px; margin: auto; }
196
- .search-section { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
197
- color: white; padding: 25px; border-radius: 15px; margin: 15px 0; }
198
- .status-section { background: #f8f9fa; padding: 20px; border-radius: 10px; margin: 15px 0;
199
- border-left: 5px solid #007bff; }
200
- .results-section { background: #ffffff; padding: 20px; border-radius: 10px;
201
- border: 1px solid #dee2e6; margin: 15px 0; }
202
- .header { text-align: center; padding: 20px; }
 
 
 
 
 
 
 
 
 
203
  """
204
  ) as demo:
205
 
206
- gr.Markdown("""
207
- <div class="header">
208
-
209
- # 🚀 CV Semantic Search System
210
- ### AI-Powered Resume Matching with Auto-loaded Database
211
-
212
- *CVs are automatically loaded from Google Drive when the space starts*
213
-
214
- </div>
215
- """)
216
-
217
- # System Status
218
- with gr.Row():
219
- status_display = gr.Markdown(
220
- get_system_status(),
221
- elem_classes=["status-section"]
222
- )
223
-
224
- # Main Search Interface
225
- with gr.Row():
226
- with gr.Column():
227
- gr.Markdown("## 📋 Enter Job Description", elem_classes=["search-section"])
228
-
229
- with gr.Tab("📝 Text Input"):
230
- jd_text = gr.Textbox(
231
- label="Job Description",
232
- placeholder="""Enter your job description here...
 
 
233
 
234
  Example:
235
- We are looking for a Senior Software Engineer with:
236
- - 5+ years of experience in Python and JavaScript
237
- - Strong background in machine learning and data science
238
- - Experience with cloud platforms (AWS, GCP)
239
- - Knowledge of microservices architecture
240
- - Bachelor's degree in Computer Science or related field""",
241
- lines=8,
242
- max_lines=15
243
- )
244
-
245
- with gr.Tab("📄 PDF Upload"):
246
- jd_file = gr.File(
247
- label="Upload Job Description PDF",
248
- file_types=[".pdf"],
249
- file_count="single"
250
- )
251
-
252
- num_results = gr.Slider(
253
- label="Number of Top CVs to Return",
254
- minimum=1,
255
- maximum=10,
256
- value=5,
257
- step=1
258
- )
259
-
260
- search_btn = gr.Button(
261
- "🔍 Find Matching CVs",
262
- variant="primary",
263
- size="lg"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  )
265
-
266
- # Search Results
267
- with gr.Row():
268
- search_output = gr.Markdown(
269
- """
270
- ## 📋 Instructions:
271
- 1. **Enter Job Description**: Use text input or upload a PDF
272
- 2. **Click Search**: Find the best matching CVs from the database
273
- 3. **Review Results**: See ranked CVs with similarity scores
274
-
275
- The system automatically analyzes semantic meaning, not just keywords!
276
- """,
277
- elem_classes=["results-section"]
278
- )
279
-
280
- # Refresh button for status
281
- with gr.Row():
282
- refresh_btn = gr.Button("🔄 Refresh Status", size="sm")
283
 
284
  # Event handlers
285
  search_btn.click(
@@ -293,33 +345,35 @@ We are looking for a Senior Software Engineer with:
293
  outputs=[status_display]
294
  )
295
 
296
- # Clear inputs when switching tabs
297
  jd_file.change(
298
- fn=lambda: "", # Clear text when file is uploaded
299
  outputs=[jd_text]
300
  )
301
 
302
- # Footer information
 
 
 
 
 
 
 
303
  gr.Markdown("""
304
  ---
305
- ## ℹ️ System Information
306
 
307
- - **Model**: Sentence Transformers (all-MiniLM-L6-v2)
308
- - **Database**: ChromaDB (in-memory, rebuilt on restart)
309
- - **CV Source**: Google Drive folder (auto-loaded)
310
- - **Search Method**: Semantic similarity matching
 
311
 
312
- ### 🎯 How It Works:
313
- 1. CVs are automatically downloaded from Google Drive and processed into text chunks
314
- 2. Each chunk is converted to a vector using AI embeddings
315
- 3. Your job description is compared against all CV chunks using semantic similarity
316
- 4. Results are ranked by relevance, not just keyword matching
317
-
318
- ### 💡 Pro Tips:
319
- - Be specific about required skills and experience
320
- - Include both technical and soft skill requirements
321
- - Mention specific tools, technologies, or frameworks
322
- - The more detailed your JD, the better the matching accuracy
323
  """)
324
 
325
  return demo
@@ -327,10 +381,13 @@ We are looking for a Senior Software Engineer with:
327
  def main():
328
  """Main function to initialize and run the app"""
329
 
330
- # Initialize database at startup
331
  logger.info("Starting CV Semantic Search application...")
332
- init_status = initialize_database()
333
- logger.info(f"Initialization result: {init_status}")
 
 
 
 
334
 
335
  # Create and launch interface
336
  demo = create_interface()
 
2
  import os
3
  from semantic_search import CVSemanticSearch
4
  import logging
 
 
5
 
6
  # Set up logging
7
  logging.basicConfig(level=logging.INFO)
8
  logger = logging.getLogger(__name__)
9
 
10
+ # Google Drive Configuration - UPDATE THESE VALUES
11
+ FOLDER_ID = "1j1faOlXxoYfPLdzDfGvDbtkENsRoDxXN" # Replace with your folder ID
12
+ API_KEY = os.getenv("YOUR_GOOGLE_DRIVE_API_KEY") # Replace with your API key
13
 
14
  # Global variable to store the search system
15
  cv_search = None
16
+ initialization_status = "Initializing..."
17
 
18
  def initialize_database():
19
  """
20
  Initialize the database by loading CVs from Google Drive folder
21
  This runs once when the space starts
22
  """
23
+ global cv_search, initialization_status
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
 
 
 
 
 
 
25
  try:
26
+ logger.info("Initializing CV Semantic Search system...")
27
+ cv_search = CVSemanticSearch()
 
 
 
28
 
29
+ logger.info("Loading CVs from Google Drive folder...")
30
+ successful, total = cv_search.load_cvs_from_google_drive(FOLDER_ID, API_KEY)
31
 
32
+ if successful > 0:
33
+ initialization_status = f"✅ Successfully loaded {successful}/{total} CVs into database"
34
+ logger.info(initialization_status)
35
+ return True
36
+ else:
37
+ initialization_status = "❌ Failed to load any CVs from Google Drive. Check API key and folder ID."
38
+ logger.error(initialization_status)
39
+ return False
40
+
41
  except Exception as e:
42
+ initialization_status = f"❌ Error during initialization: {str(e)}"
43
+ logger.error(initialization_status)
44
+ return False
45
 
46
  def process_job_description(jd_text, jd_file):
47
  """
 
56
  """
57
  # Priority: PDF file over text input
58
  if jd_file is not None:
59
+ try:
60
+ with open(jd_file.name, 'rb') as f:
61
+ pdf_content = f.read()
62
+
63
+ extracted_text = cv_search.extract_text_from_pdf_bytes(pdf_content)
64
+ if extracted_text.strip():
65
+ return extracted_text.strip()
66
+ except Exception as e:
67
+ logger.error(f"Error processing JD PDF: {str(e)}")
68
 
69
  # Fallback to text input
70
  if jd_text and jd_text.strip():
 
87
  global cv_search
88
 
89
  if cv_search is None:
90
+ return f"❌ System not initialized properly.\n\n{initialization_status}\n\nPlease refresh the page or check the configuration."
91
 
92
  # Process job description
93
  job_description = process_job_description(jd_text, jd_file)
 
99
  db_info = cv_search.get_database_info()
100
 
101
  if db_info['unique_cvs'] == 0:
102
+ return f"❌ No CVs in database.\n\n{initialization_status}"
103
 
104
  # Perform search
105
  results = cv_search.search_cvs(job_description, top_k=num_results)
106
 
107
  if not results:
108
+ return "❌ No matching CVs found. Try using different keywords or requirements in your job description."
109
 
110
  # Format results
111
+ jd_preview = job_description[:150] + "..." if len(job_description) > 150 else job_description
112
+
113
+ output = f"""## 🎯 Top {len(results)} Matching CVs
114
+
115
+ **Job Description**: {jd_preview}
116
+
117
+ **Search Results**:
118
+
119
+ """
120
 
121
  for i, cv in enumerate(results, 1):
122
  similarity_percentage = cv['weighted_score'] * 100
123
 
124
+ # Determine match quality and emoji
125
  if similarity_percentage >= 80:
126
  match_quality = "🟢 Excellent Match"
127
+ quality_color = "#28a745"
128
  elif similarity_percentage >= 65:
129
  match_quality = "🟡 Good Match"
130
+ quality_color = "#ffc107"
131
  elif similarity_percentage >= 50:
132
  match_quality = "🟠 Fair Match"
133
+ quality_color = "#fd7e14"
134
  else:
135
  match_quality = "🔴 Weak Match"
136
+ quality_color = "#dc3545"
137
 
138
  output += f"""
139
+ ### {i}. **{cv['filename']}**
140
+
141
+ <div style="background: linear-gradient(90deg, {quality_color}22, transparent); padding: 15px; border-radius: 8px; border-left: 4px solid {quality_color};">
142
 
143
+ **{match_quality}** - **{similarity_percentage:.1f}% Overall Match**
 
 
 
144
 
145
+ 📊 **Detailed Scores:**
146
+ - Best Section Match: {cv['max_similarity']*100:.1f}%
147
+ - Average Match: {cv['avg_similarity']*100:.1f}%
148
+ - CV Sections Analyzed: {cv['chunk_count']}
149
+
150
+ 💡 **Why This CV Matches:**
151
+ *"{cv['best_match_text']}"*
152
+
153
+ </div>
154
 
155
  ---
156
  """
 
164
  Returns:
165
  System information as formatted string
166
  """
167
+ global cv_search, initialization_status
168
 
169
  if cv_search is None:
170
+ return f"""
171
+ ## ⚠️ System Status: Not Ready
172
+
173
+ {initialization_status}
174
+
175
+ **Possible Issues:**
176
+ - Invalid Google Drive API key
177
+ - Incorrect folder ID
178
+ - Folder is not public
179
+ - No PDF files in the folder
180
+ """
181
 
182
  db_info = cv_search.get_database_info()
183
 
184
  if db_info['unique_cvs'] == 0:
185
+ return f"""
186
+ ## ⚠️ System Status: No CVs Loaded
187
+
188
+ {initialization_status}
189
 
190
+ **Please Check:**
191
+ - Google Drive folder contains PDF files
192
+ - Folder is publicly accessible
193
+ - API key has proper permissions
194
  """
195
 
196
  return f"""
197
+ ## ✅ System Status: Ready for Search
198
+
199
+ 📊 **Database Statistics:**
200
+ - **CVs Loaded**: {db_info['unique_cvs']} resumes
201
+ - **Text Chunks**: {db_info['total_chunks']} searchable segments
202
+ - **Avg Chunks per CV**: {db_info['total_chunks'] / db_info['unique_cvs']:.1f}
203
 
204
+ 🤖 **AI Model**: Sentence Transformers (all-MiniLM-L6-v2)
 
 
 
205
 
206
+ 📁 **Sample CVs**: {', '.join(db_info['cv_filenames'][:3])}{'...' if len(db_info['cv_filenames']) > 3 else ''}
207
  """
208
 
209
  # Create Gradio interface
 
214
  title="CV Semantic Search - Auto-loaded from Google Drive",
215
  theme=gr.themes.Soft(),
216
  css="""
217
+ .main-container { max-width: 1200px; margin: auto; padding: 20px; }
218
+ .search-container {
219
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
220
+ color: white; padding: 30px; border-radius: 20px; margin: 20px 0;
221
+ box-shadow: 0 10px 30px rgba(0,0,0,0.2);
222
+ }
223
+ .status-container {
224
+ background: #f8f9fa; padding: 25px; border-radius: 15px; margin: 20px 0;
225
+ border-left: 5px solid #007bff; box-shadow: 0 5px 15px rgba(0,0,0,0.1);
226
+ }
227
+ .results-container {
228
+ background: #ffffff; padding: 25px; border-radius: 15px;
229
+ border: 1px solid #dee2e6; margin: 20px 0; box-shadow: 0 5px 15px rgba(0,0,0,0.1);
230
+ }
231
+ .header { text-align: center; padding: 30px; background: linear-gradient(135deg, #74b9ff, #0984e3);
232
+ color: white; margin: -20px -20px 20px -20px; border-radius: 15px 15px 0 0; }
233
+ .tab-content { padding: 15px; }
234
  """
235
  ) as demo:
236
 
237
+ with gr.Column(elem_classes=["main-container"]):
238
+
239
+ gr.Markdown("""
240
+ <div class="header">
241
+
242
+ # 🚀 CV Semantic Search System
243
+ ## AI-Powered Resume Matching
244
+ ### *Automatically synced with Google Drive*
245
+
246
+ </div>
247
+ """)
248
+
249
+ # System Status Display
250
+ with gr.Row():
251
+ status_display = gr.Markdown(
252
+ get_system_status(),
253
+ elem_classes=["status-container"]
254
+ )
255
+
256
+ # Main Search Interface
257
+ with gr.Row():
258
+ with gr.Column():
259
+ with gr.Group(elem_classes=["search-container"]):
260
+ gr.Markdown("## 📋 Job Description Input")
261
+
262
+ with gr.Tab("📝 Text Input") as text_tab:
263
+ jd_text = gr.Textbox(
264
+ label="Paste Job Description",
265
+ placeholder="""Paste your job description here...
266
 
267
  Example:
268
+ Senior Software Engineer Position
269
+
270
+ Requirements:
271
+ • 5+ years of experience in Python, JavaScript, and React
272
+ • Strong background in machine learning and AI
273
+ • Experience with cloud platforms (AWS, Azure, GCP)
274
+ • Knowledge of microservices and API development
275
+ • Bachelor's degree in Computer Science or related field
276
+ • Excellent problem-solving and communication skills
277
+
278
+ Responsibilities:
279
+ • Design and develop scalable web applications
280
+ • Lead technical projects and mentor junior developers
281
+ • Collaborate with cross-functional teams
282
+ • Implement best practices for code quality and testing""",
283
+ lines=12,
284
+ max_lines=20,
285
+ elem_classes=["tab-content"]
286
+ )
287
+
288
+ with gr.Tab("📄 PDF Upload") as pdf_tab:
289
+ jd_file = gr.File(
290
+ label="Upload Job Description PDF",
291
+ file_types=[".pdf"],
292
+ file_count="single",
293
+ elem_classes=["tab-content"]
294
+ )
295
+
296
+ with gr.Row():
297
+ num_results = gr.Slider(
298
+ label="Number of Top CVs to Return",
299
+ minimum=1,
300
+ maximum=10,
301
+ value=5,
302
+ step=1
303
+ )
304
+
305
+ search_btn = gr.Button(
306
+ "🔍 Find Best Matching CVs",
307
+ variant="primary",
308
+ size="lg"
309
+ )
310
+
311
+ # Search Results
312
+ with gr.Row():
313
+ search_output = gr.Markdown(
314
+ """
315
+ ## 📋 How to Use This System:
316
+
317
+ 1. **Enter Job Requirements**: Use the text box or upload a PDF with your job description
318
+ 2. **Click Search**: The AI will analyze semantic meaning and find the best matches
319
+ 3. **Review Results**: See ranked CVs with detailed similarity scores and explanations
320
+
321
+ ### 🎯 What Makes This Special:
322
+ - **Semantic Understanding**: Finds relevant CVs even if they don't use exact keywords
323
+ - **Automatic Sync**: CVs are always up-to-date from your Google Drive folder
324
+ - **Smart Ranking**: Combines multiple similarity metrics for accurate results
325
+ - **Detailed Analysis**: Shows why each CV matches your requirements
326
+
327
+ *Enter a job description above to get started!*
328
+ """,
329
+ elem_classes=["results-container"]
330
  )
331
+
332
+ # Refresh Status Button
333
+ with gr.Row():
334
+ refresh_btn = gr.Button("🔄 Refresh System Status", size="sm")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
 
336
  # Event handlers
337
  search_btn.click(
 
345
  outputs=[status_display]
346
  )
347
 
348
+ # Clear text input when PDF is uploaded
349
  jd_file.change(
350
+ fn=lambda: "",
351
  outputs=[jd_text]
352
  )
353
 
354
+ # Clear file input when text is entered
355
+ jd_text.change(
356
+ fn=lambda x: None if x.strip() else None,
357
+ inputs=[jd_text],
358
+ outputs=[jd_file]
359
+ )
360
+
361
+ # Footer
362
  gr.Markdown("""
363
  ---
364
+ ## 🛠️ Technical Details
365
 
366
+ - **Vector Database**: ChromaDB (rebuilt on each restart)
367
+ - **Embedding Model**: SentenceTransformers all-MiniLM-L6-v2
368
+ - **Text Extraction**: pdfplumber + OCR fallback for scanned documents
369
+ - **CV Source**: Google Drive folder (automatically synced)
370
+ - **Search Algorithm**: Cosine similarity with chunk aggregation
371
 
372
+ ### 📞 Support
373
+ If no results appear, check that:
374
+ - Your Google Drive folder is public
375
+ - The folder contains PDF files
376
+ - Your API key is valid and has Drive API access
 
 
 
 
 
 
377
  """)
378
 
379
  return demo
 
381
  def main():
382
  """Main function to initialize and run the app"""
383
 
 
384
  logger.info("Starting CV Semantic Search application...")
385
+
386
+ # Initialize database at startup
387
+ if initialize_database():
388
+ logger.info("✅ Database initialization successful")
389
+ else:
390
+ logger.error("❌ Database initialization failed")
391
 
392
  # Create and launch interface
393
  demo = create_interface()