rbbist committed on
Commit
c37fd2f
·
verified ·
1 Parent(s): 8c7a8ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +230 -165
app.py CHANGED
@@ -2,279 +2,344 @@ import gradio as gr
2
  import os
3
  from semantic_search import CVSemanticSearch
4
  import logging
 
 
5
 
6
  # Set up logging
7
  logging.basicConfig(level=logging.INFO)
8
  logger = logging.getLogger(__name__)
9
 
10
- # Initialize the semantic search system
11
- cv_search = CVSemanticSearch()
12
 
13
- def upload_cvs(files):
 
 
 
 
 
 
14
  """
15
- Handle CV uploads from Gradio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  Args:
18
- files: List of uploaded files from Gradio
19
 
20
  Returns:
21
- Status message
22
  """
23
- if not files:
24
- return "No files uploaded."
25
-
26
- successful = 0
27
- total = len(files)
28
-
29
- for file in files:
30
- try:
31
- # Read file content
32
- with open(file.name, 'rb') as f:
33
- file_content = f.read()
34
-
35
- # Get filename from path
36
- filename = os.path.basename(file.name)
37
-
38
- # Add to database
39
- if cv_search.add_cv_to_database(file_content, filename):
40
- successful += 1
41
- logger.info(f"Successfully uploaded: {filename}")
42
- else:
43
- logger.error(f"Failed to upload: {filename}")
44
-
45
- except Exception as e:
46
- logger.error(f"Error processing file {file.name}: {str(e)}")
47
 
48
- db_info = cv_search.get_database_info()
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- return f"""
51
- Upload Complete!
52
- ✅ Successfully processed: {successful}/{total} files
53
- 📊 Database now contains: {db_info['unique_cvs']} CVs ({db_info['total_chunks']} chunks)
54
 
55
- CVs in database: {', '.join(db_info['cv_filenames'])}
56
- """
57
 
58
- def search_matching_cvs(job_description, num_results):
59
  """
60
  Search for CVs matching the job description
61
 
62
  Args:
63
- job_description: Job description text
 
64
  num_results: Number of results to return
65
 
66
  Returns:
67
  Formatted search results
68
  """
69
- if not job_description.strip():
70
- return "Please enter a job description."
 
 
 
 
 
 
 
 
71
 
72
  # Get database info
73
  db_info = cv_search.get_database_info()
74
 
75
  if db_info['unique_cvs'] == 0:
76
- return "No CVs in database. Please upload some CV PDFs first."
77
 
78
  # Perform search
79
  results = cv_search.search_cvs(job_description, top_k=num_results)
80
 
81
  if not results:
82
- return "No matching CVs found."
83
 
84
  # Format results
85
- output = f"🎯 **Top {len(results)} Matching CVs:**\n\n"
 
86
 
87
  for i, cv in enumerate(results, 1):
88
  similarity_percentage = cv['weighted_score'] * 100
89
 
 
 
 
 
 
 
 
 
 
 
90
  output += f"""
91
- **{i}. {cv['filename']}**
92
- - **Match Score**: {similarity_percentage:.1f}%
93
- - **Max Similarity**: {cv['max_similarity']*100:.1f}%
94
- - **Avg Similarity**: {cv['avg_similarity']*100:.1f}%
95
- - **Chunks Analyzed**: {cv['chunk_count']}
96
- - **Best Match Preview**: {cv['best_match_text']}
 
 
 
97
 
98
  ---
99
  """
100
 
101
  return output
102
 
103
- def get_database_status():
104
  """
105
- Get current database status
106
 
107
  Returns:
108
- Database information as formatted string
109
  """
 
 
 
 
 
110
  db_info = cv_search.get_database_info()
111
 
112
  if db_info['unique_cvs'] == 0:
113
- return "πŸ“ Database is empty. Upload some CV PDFs to get started!"
 
 
 
 
 
 
 
114
 
115
  return f"""
116
- 📊 **Database Status:**
117
- - **Total CVs**: {db_info['unique_cvs']}
 
 
118
  - **Total Chunks**: {db_info['total_chunks']}
119
- - **CVs in Database**: {', '.join(db_info['cv_filenames'])}
120
- """
121
-
122
- def clear_database():
123
- """
124
- Clear the entire database
125
 
126
- Returns:
127
- Status message
128
  """
129
- if cv_search.clear_database():
130
- return "🗑️ Database cleared successfully!"
131
- else:
132
- return "❌ Error clearing database."
133
 
134
  # Create Gradio interface
135
  def create_interface():
136
  """Create and return the Gradio interface"""
137
 
138
  with gr.Blocks(
139
- title="CV Semantic Search",
140
  theme=gr.themes.Soft(),
141
  css="""
142
- .container { max-width: 1200px; margin: auto; }
143
- .upload-section { background: #f8f9fa; padding: 20px; border-radius: 10px; margin: 10px 0; }
144
- .search-section { background: #e8f5e8; padding: 20px; border-radius: 10px; margin: 10px 0; }
145
- .status-section { background: #fff3cd; padding: 15px; border-radius: 8px; margin: 10px 0; }
 
 
 
 
146
  """
147
  ) as demo:
148
 
149
  gr.Markdown("""
150
- # 🔍 CV Semantic Search System
151
 
152
- Upload CV PDFs and search for the best matches based on job descriptions using AI-powered semantic search.
 
 
 
 
 
153
  """)
154
 
 
155
  with gr.Row():
156
- with gr.Column(scale=1):
157
- # Upload Section
158
- with gr.Group():
159
- gr.Markdown("## 📁 Upload CVs")
160
-
161
- cv_files = gr.File(
162
- label="Upload CV PDFs",
163
- file_count="multiple",
164
- file_types=[".pdf"],
165
- elem_classes=["upload-section"]
166
- )
167
-
168
- upload_btn = gr.Button(
169
- "Upload CVs to Database",
170
- variant="primary",
171
- size="lg"
172
- )
173
-
174
- upload_output = gr.Markdown(
175
- "Upload CVs to build your searchable database.",
176
- elem_classes=["status-section"]
177
- )
178
-
179
- with gr.Column(scale=1):
180
- # Search Section
181
- with gr.Group():
182
- gr.Markdown("## 🎯 Search CVs")
183
-
184
- job_description = gr.Textbox(
185
  label="Job Description",
186
- placeholder="Enter the job description here...\n\nExample: Looking for a senior software engineer with 5+ years experience in Python, React, and cloud technologies. Strong background in microservices and API development required.",
187
- lines=6,
188
- elem_classes=["search-section"]
189
- )
190
-
191
- num_results = gr.Slider(
192
- label="Number of Results",
193
- minimum=1,
194
- maximum=10,
195
- value=5,
196
- step=1
197
  )
198
-
199
- search_btn = gr.Button(
200
- "Search Matching CVs",
201
- variant="secondary",
202
- size="lg"
 
203
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
  # Search Results
206
  with gr.Row():
207
  search_output = gr.Markdown(
208
- "Enter a job description and click search to find matching CVs.",
209
- elem_classes=["search-section"]
 
 
 
 
 
 
 
210
  )
211
 
212
- # Database Management
213
  with gr.Row():
214
- with gr.Column(scale=2):
215
- status_output = gr.Markdown(
216
- get_database_status(),
217
- elem_classes=["status-section"]
218
- )
219
-
220
- with gr.Column(scale=1):
221
- with gr.Group():
222
- refresh_btn = gr.Button("🔄 Refresh Status", size="sm")
223
- clear_btn = gr.Button("🗑️ Clear Database", size="sm", variant="stop")
224
 
225
  # Event handlers
226
- upload_btn.click(
227
- fn=upload_cvs,
228
- inputs=[cv_files],
229
- outputs=[upload_output]
230
- ).then(
231
- fn=get_database_status,
232
- outputs=[status_output]
233
- )
234
-
235
  search_btn.click(
236
  fn=search_matching_cvs,
237
- inputs=[job_description, num_results],
238
  outputs=[search_output]
239
  )
240
 
241
  refresh_btn.click(
242
- fn=get_database_status,
243
- outputs=[status_output]
244
  )
245
 
246
- clear_btn.click(
247
- fn=clear_database,
248
- outputs=[status_output]
249
- ).then(
250
- fn=get_database_status,
251
- outputs=[status_output]
252
  )
253
 
254
- # Example usage
255
  gr.Markdown("""
256
- ## 📝 How to Use:
 
257
 
258
- 1. **Upload CVs**: Use the file upload component to add multiple PDF CVs to the database
259
- 2. **Enter Job Description**: Paste or type the job requirements you want to match
260
- 3. **Search**: Click search to find the top matching CVs based on semantic similarity
261
- 4. **Review Results**: See ranked CVs with similarity scores and preview text
262
 
263
- ### 💡 Tips for Better Results:
264
- - Include specific skills, technologies, and requirements in your job description
265
- - The more detailed your job description, the better the matching accuracy
266
- - The system analyzes semantic meaning, not just keyword matching
267
- - Upload multiple CVs for better comparison and ranking
 
 
 
 
 
 
268
  """)
269
 
270
  return demo
271
 
272
- # Main function to run the app
273
- if __name__ == "__main__":
 
 
 
 
 
 
 
274
  demo = create_interface()
275
  demo.launch(
276
  share=True, # Enable sharing for Hugging Face Spaces
277
  server_name="0.0.0.0", # Enable access from outside container
278
  server_port=7860, # Standard port for Hugging Face Spaces
279
  show_error=True
280
- )
 
 
 
 
2
  import os
3
  from semantic_search import CVSemanticSearch
4
  import logging
5
+ import PyPDF2
6
+ import io
7
 
8
  # Set up logging
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
+ # Google Drive folder URL - UPDATE THIS WITH YOUR ACTUAL FOLDER URL
13
+ GOOGLE_DRIVE_FOLDER_URL = "https://drive.google.com/drive/folders/XXXXXXXXXXXXXXXXX?usp=sharing"
14
 
15
+ # Global variable to store the search system
16
+ cv_search = None
17
+
18
+ def initialize_database():
19
+ """
20
+ Initialize the database by loading CVs from Google Drive folder
21
+ This runs once when the space starts
22
  """
23
+ global cv_search
24
+
25
+ logger.info("Initializing CV Semantic Search system...")
26
+ cv_search = CVSemanticSearch()
27
+
28
+ logger.info("Loading CVs from Google Drive folder...")
29
+ successful, total = cv_search.load_cvs_from_google_drive(GOOGLE_DRIVE_FOLDER_URL)
30
+
31
+ if successful > 0:
32
+ logger.info(f"Successfully loaded {successful}/{total} CVs into database")
33
+ return f"✅ Database initialized with {successful}/{total} CVs"
34
+ else:
35
+ logger.error("Failed to load any CVs from Google Drive")
36
+ return "❌ Failed to load CVs from Google Drive. Check the folder URL and permissions."
37
+
38
+ def extract_text_from_jd_file(file) -> str:
39
+ """
40
+ Extract text from uploaded JD PDF file
41
 
42
  Args:
43
+ file: Gradio file object
44
 
45
  Returns:
46
+ Extracted text
47
  """
48
+ try:
49
+ if file is None:
50
+ return ""
51
+
52
+ with open(file.name, 'rb') as f:
53
+ pdf_content = f.read()
54
+
55
+ return cv_search.extract_text_from_pdf_bytes(pdf_content)
56
+
57
+ except Exception as e:
58
+ logger.error(f"Error extracting text from JD file: {str(e)}")
59
+ return ""
60
+
61
+ def process_job_description(jd_text, jd_file):
62
+ """
63
+ Process job description from either text input or PDF file
 
 
 
 
 
 
 
 
64
 
65
+ Args:
66
+ jd_text: Job description as text
67
+ jd_file: Job description as PDF file
68
+
69
+ Returns:
70
+ Processed job description text
71
+ """
72
+ # Priority: PDF file over text input
73
+ if jd_file is not None:
74
+ extracted_text = extract_text_from_jd_file(jd_file)
75
+ if extracted_text.strip():
76
+ return extracted_text.strip()
77
 
78
+ # Fallback to text input
79
+ if jd_text and jd_text.strip():
80
+ return jd_text.strip()
 
81
 
82
+ return ""
 
83
 
84
+ def search_matching_cvs(jd_text, jd_file, num_results):
85
  """
86
  Search for CVs matching the job description
87
 
88
  Args:
89
+ jd_text: Job description as text
90
+ jd_file: Job description as PDF file
91
  num_results: Number of results to return
92
 
93
  Returns:
94
  Formatted search results
95
  """
96
+ global cv_search
97
+
98
+ if cv_search is None:
99
+ return "❌ System not initialized. Please refresh the page."
100
+
101
+ # Process job description
102
+ job_description = process_job_description(jd_text, jd_file)
103
+
104
+ if not job_description:
105
+ return "❌ Please provide a job description either as text or upload a PDF file."
106
 
107
  # Get database info
108
  db_info = cv_search.get_database_info()
109
 
110
  if db_info['unique_cvs'] == 0:
111
+ return "❌ No CVs in database. Please check the Google Drive folder configuration."
112
 
113
  # Perform search
114
  results = cv_search.search_cvs(job_description, top_k=num_results)
115
 
116
  if not results:
117
+ return "❌ No matching CVs found. Try adjusting your job description."
118
 
119
  # Format results
120
+ output = f"## 🎯 Top {len(results)} Matching CVs\n\n"
121
+ output += f"**Job Description Preview**: {job_description[:150]}{'...' if len(job_description) > 150 else ''}\n\n"
122
 
123
  for i, cv in enumerate(results, 1):
124
  similarity_percentage = cv['weighted_score'] * 100
125
 
126
+ # Determine match quality
127
+ if similarity_percentage >= 80:
128
+ match_quality = "🟢 Excellent Match"
129
+ elif similarity_percentage >= 65:
130
+ match_quality = "🟡 Good Match"
131
+ elif similarity_percentage >= 50:
132
+ match_quality = "🟠 Fair Match"
133
+ else:
134
+ match_quality = "🔴 Weak Match"
135
+
136
  output += f"""
137
+ ### {i}. {cv['filename']} - {match_quality}
138
+
139
+ **Overall Score**: {similarity_percentage:.1f}%
140
+ - **Best Match Score**: {cv['max_similarity']*100:.1f}%
141
+ - **Average Score**: {cv['avg_similarity']*100:.1f}%
142
+ - **Sections Analyzed**: {cv['chunk_count']} parts
143
+
144
+ **Best Matching Content**:
145
+ "{cv['best_match_text']}"
146
 
147
  ---
148
  """
149
 
150
  return output
151
 
152
+ def get_system_status():
153
  """
154
+ Get current system status
155
 
156
  Returns:
157
+ System information as formatted string
158
  """
159
+ global cv_search
160
+
161
+ if cv_search is None:
162
+ return "❌ System not initialized"
163
+
164
  db_info = cv_search.get_database_info()
165
 
166
  if db_info['unique_cvs'] == 0:
167
+ return """
168
+ ⚠️ **System Status**: No CVs loaded
169
+
170
+ Please check:
171
+ - Google Drive folder URL is correct
172
+ - Folder is public and accessible
173
+ - Folder contains PDF files
174
+ """
175
 
176
  return f"""
177
+ ✅ **System Status**: Ready
178
+
179
+ 📊 **Database Info**:
180
+ - **Total CVs Loaded**: {db_info['unique_cvs']}
181
  - **Total Chunks**: {db_info['total_chunks']}
182
+ - **Average Chunks per CV**: {db_info['total_chunks'] / db_info['unique_cvs']:.1f}
 
 
 
 
 
183
 
184
+ 📁 **Loaded CVs**: {', '.join(db_info['cv_filenames'][:5])}{'...' if len(db_info['cv_filenames']) > 5 else ''}
 
185
  """
 
 
 
 
186
 
187
  # Create Gradio interface
188
  def create_interface():
189
  """Create and return the Gradio interface"""
190
 
191
  with gr.Blocks(
192
+ title="CV Semantic Search - Auto-loaded from Google Drive",
193
  theme=gr.themes.Soft(),
194
  css="""
195
+ .container { max-width: 1000px; margin: auto; }
196
+ .search-section { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
197
+ color: white; padding: 25px; border-radius: 15px; margin: 15px 0; }
198
+ .status-section { background: #f8f9fa; padding: 20px; border-radius: 10px; margin: 15px 0;
199
+ border-left: 5px solid #007bff; }
200
+ .results-section { background: #ffffff; padding: 20px; border-radius: 10px;
201
+ border: 1px solid #dee2e6; margin: 15px 0; }
202
+ .header { text-align: center; padding: 20px; }
203
  """
204
  ) as demo:
205
 
206
  gr.Markdown("""
207
+ <div class="header">
208
 
209
+ # 🚀 CV Semantic Search System
210
+ ### AI-Powered Resume Matching with Auto-loaded Database
211
+
212
+ *CVs are automatically loaded from Google Drive when the space starts*
213
+
214
+ </div>
215
  """)
216
 
217
+ # System Status
218
  with gr.Row():
219
+ status_display = gr.Markdown(
220
+ get_system_status(),
221
+ elem_classes=["status-section"]
222
+ )
223
+
224
+ # Main Search Interface
225
+ with gr.Row():
226
+ with gr.Column():
227
+ gr.Markdown("## 📋 Enter Job Description", elem_classes=["search-section"])
228
+
229
+ with gr.Tab("📝 Text Input"):
230
+ jd_text = gr.Textbox(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  label="Job Description",
232
+ placeholder="""Enter your job description here...
233
+
234
+ Example:
235
+ We are looking for a Senior Software Engineer with:
236
+ - 5+ years of experience in Python and JavaScript
237
+ - Strong background in machine learning and data science
238
+ - Experience with cloud platforms (AWS, GCP)
239
+ - Knowledge of microservices architecture
240
+ - Bachelor's degree in Computer Science or related field""",
241
+ lines=8,
242
+ max_lines=15
243
  )
244
+
245
+ with gr.Tab("📄 PDF Upload"):
246
+ jd_file = gr.File(
247
+ label="Upload Job Description PDF",
248
+ file_types=[".pdf"],
249
+ file_count="single"
250
  )
251
+
252
+ num_results = gr.Slider(
253
+ label="Number of Top CVs to Return",
254
+ minimum=1,
255
+ maximum=10,
256
+ value=5,
257
+ step=1
258
+ )
259
+
260
+ search_btn = gr.Button(
261
+ "🔍 Find Matching CVs",
262
+ variant="primary",
263
+ size="lg"
264
+ )
265
 
266
  # Search Results
267
  with gr.Row():
268
  search_output = gr.Markdown(
269
+ """
270
+ ## 📋 Instructions:
271
+ 1. **Enter Job Description**: Use text input or upload a PDF
272
+ 2. **Click Search**: Find the best matching CVs from the database
273
+ 3. **Review Results**: See ranked CVs with similarity scores
274
+
275
+ The system automatically analyzes semantic meaning, not just keywords!
276
+ """,
277
+ elem_classes=["results-section"]
278
  )
279
 
280
+ # Refresh button for status
281
  with gr.Row():
282
+ refresh_btn = gr.Button("🔄 Refresh Status", size="sm")
 
 
 
 
 
 
 
 
 
283
 
284
  # Event handlers
 
 
 
 
 
 
 
 
 
285
  search_btn.click(
286
  fn=search_matching_cvs,
287
+ inputs=[jd_text, jd_file, num_results],
288
  outputs=[search_output]
289
  )
290
 
291
  refresh_btn.click(
292
+ fn=get_system_status,
293
+ outputs=[status_display]
294
  )
295
 
296
+ # Clear inputs when switching tabs
297
+ jd_file.change(
298
+ fn=lambda: "", # Clear text when file is uploaded
299
+ outputs=[jd_text]
 
 
300
  )
301
 
302
+ # Footer information
303
  gr.Markdown("""
304
+ ---
305
+ ## ℹ️ System Information
306
 
307
+ - **Model**: Sentence Transformers (all-MiniLM-L6-v2)
308
+ - **Database**: ChromaDB (in-memory, rebuilt on restart)
309
+ - **CV Source**: Google Drive folder (auto-loaded)
310
+ - **Search Method**: Semantic similarity matching
311
 
312
+ ### 🎯 How It Works:
313
+ 1. CVs are automatically downloaded from Google Drive and processed into text chunks
314
+ 2. Each chunk is converted to a vector using AI embeddings
315
+ 3. Your job description is compared against all CV chunks using semantic similarity
316
+ 4. Results are ranked by relevance, not just keyword matching
317
+
318
+ ### 💡 Pro Tips:
319
+ - Be specific about required skills and experience
320
+ - Include both technical and soft skill requirements
321
+ - Mention specific tools, technologies, or frameworks
322
+ - The more detailed your JD, the better the matching accuracy
323
  """)
324
 
325
  return demo
326
 
327
+ def main():
328
+ """Main function to initialize and run the app"""
329
+
330
+ # Initialize database at startup
331
+ logger.info("Starting CV Semantic Search application...")
332
+ init_status = initialize_database()
333
+ logger.info(f"Initialization result: {init_status}")
334
+
335
+ # Create and launch interface
336
  demo = create_interface()
337
  demo.launch(
338
  share=True, # Enable sharing for Hugging Face Spaces
339
  server_name="0.0.0.0", # Enable access from outside container
340
  server_port=7860, # Standard port for Hugging Face Spaces
341
  show_error=True
342
+ )
343
+
344
+ if __name__ == "__main__":
345
+ main()