csAhmad committed on
Commit
510b721
·
verified ·
1 Parent(s): 0455608

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +509 -136
app.py CHANGED
@@ -1,162 +1,535 @@
1
- import gradio as gr
2
- import zipfile
3
  import os
 
 
 
4
  import pandas as pd
5
- from sentence_transformers import SentenceTransformer
6
- from sklearn.metrics.pairwise import cosine_similarity
7
- from pypdf import PdfReader
8
- import docx
9
-
10
- # -------------------------
11
- # MODEL
12
- # -------------------------
13
# Sentence-embedding model used to encode the job description and each CV.
model = SentenceTransformer("csAhmad/zoraiz-model")

# Folder the uploaded ZIP is extracted into; process_zip deletes the files
# left in it before every run.
EXTRACT_PATH = "temp/extracted"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
- # -------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # TEXT EXTRACTION
20
- # -------------------------
21
def extract_text(file_path):
    """Return the plain text of a PDF or DOCX file.

    Args:
        file_path: Path to the document. The parser is chosen from the
            file extension, compared case-insensitively.

    Returns:
        The extracted text, or "" when the extension is not ``.pdf`` /
        ``.docx`` or the file cannot be parsed.
    """
    path = file_path.lower()

    try:
        if path.endswith(".pdf"):
            reader = PdfReader(file_path)
            # ``or ""`` guards against pages that yield no text.
            return " ".join(page.extract_text() or "" for page in reader.pages)

        if path.endswith(".docx"):
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
    except Exception as e:
        # Best-effort extraction: an unreadable file is reported and skipped.
        # ``except Exception`` (unlike a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate.
        print(f"Error reading {file_path}: {e}")
        return ""

    return ""
37
-
38
-
39
- # -------------------------
40
- # SIMPLE CV FIELD EXTRACTOR (replace with LLM later)
41
- # -------------------------
42
def extract_cv_fields(text):
    """Build the output row for one CV (placeholder extractor).

    Only the name is filled in: it is taken to be the first non-blank line
    of the CV text, stripped of surrounding whitespace. Every other field
    is left empty until a real extractor replaces this function.

    Args:
        text: Plain text of the CV.

    Returns:
        A dict with the twelve output columns, in spreadsheet order.
    """
    # Skip leading blank lines so a CV that starts with whitespace does not
    # produce an empty name.
    first_line = next(
        (line.strip() for line in text.splitlines() if line.strip()),
        "",
    )

    return {
        "Name (Age)": first_line,
        "Contact": "",
        "Current Job": "",
        "Qualification": "",
        "Experience": "",
        "Publications": "",
        "Citation": "",
        "H-index": "",
        "Nationality": "",
        "Other Achievements": "",
        "Area": "",
        "Comments": "",
    }
60
-
61
-
62
- # -------------------------
63
- # MAIN FUNCTION
64
- # -------------------------
65
def process_zip(zip_file, jd_text):
    """Screen the CVs in an uploaded ZIP against a job description.

    Every file in the archive is converted to text, embedded with ``model``
    and compared with the job-description embedding by cosine similarity.
    CVs scoring at least 0.60 are written to an Excel file.

    Args:
        zip_file: The uploaded archive, either a file path or an object
            exposing the path as ``.name``.
        jd_text: Job description text.

    Returns:
        Path of the generated ``output.xlsx``.

    Raises:
        gr.Error: Missing input, invalid ZIP, or no CV passed the threshold.
    """
    import tempfile  # local import: not among this file's top-level imports

    if zip_file is None or not jd_text or not jd_text.strip():
        raise gr.Error("Please upload ZIP and enter Job Description.")

    # Remove files left over from a previous run.
    if os.path.exists(EXTRACT_PATH):
        for root, _, files in os.walk(EXTRACT_PATH):
            for f in files:
                try:
                    os.remove(os.path.join(root, f))
                except OSError:
                    # Best-effort cleanup; a stale file is not fatal.
                    pass

    os.makedirs(EXTRACT_PATH, exist_ok=True)

    zip_path = zip_file if isinstance(zip_file, str) else zip_file.name

    # extract zip
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(EXTRACT_PATH)
    except zipfile.BadZipFile:
        raise gr.Error("Invalid ZIP file.")

    # JD embedding
    jd_embedding = model.encode(jd_text)

    results = []

    # scan CVs
    for root, _, files in os.walk(EXTRACT_PATH):
        for file in files:
            text = extract_text(os.path.join(root, file))

            if not text.strip():
                continue

            try:
                cv_embedding = model.encode(text)
                score = cosine_similarity([cv_embedding], [jd_embedding])[0][0]

                # filter threshold (adjust if needed)
                if score < 0.60:
                    continue

                # extract_cv_fields already returns exactly the output columns.
                results.append(extract_cv_fields(text))
            except Exception as e:
                print(f"Error processing {file}: {e}")

    if not results:
        raise gr.Error("No matching CVs found for this JD.")

    df = pd.DataFrame(results)

    # A fresh directory per call so concurrent requests cannot overwrite
    # each other's spreadsheet.
    output_file = os.path.join(tempfile.mkdtemp(prefix="cv_screen_"), "output.xlsx")
    df.to_excel(output_file, index=False)

    return output_file
 
 
 
146
 
147
 
148
- # -------------------------
149
  # GRADIO UI
150
- # -------------------------
151
# Single-function interface: a ZIP upload and a job-description text box go
# to process_zip, whose returned file path is offered as a download.
demo = gr.Interface(
    fn=process_zip,
    inputs=[
        gr.File(file_types=[".zip"]),
        gr.Textbox(lines=10, label="Job Description (JD)")
    ],
    outputs=gr.File(label="Download Filtered CV Excel"),
    title="AI CV Screening System",
    description="Upload ZIP of CVs + Job Description → Get ranked candidates in Excel"
)

# Start the Gradio server.
demo.launch()
 
 
 
1
  import os
2
+ import re
3
+ import zipfile
4
+ import tempfile
5
  import pandas as pd
6
+ import pdfplumber
7
+ import fitz # PyMuPDF
8
+ import gradio as gr
 
 
 
 
 
 
9
 
10
+ from docx import Document
11
+ from sentence_transformers import SentenceTransformer, util
12
+
13
# =============================================================
# CONFIG
# =============================================================
# Internal candidate dataset. run_pipeline opens it by this relative path,
# so upload this Excel file to the root of your HF Space.
INTERNAL_EXCEL_FILE = "Summary_of_Faculty_Rankig_16th Feb 2025.xlsx"

# Your fine-tuned model on Hugging Face Hub
MODEL_NAME = "csAhmad/zoraiz-model"

# Columns of the exported sheet, in order. The spellings mirror the Excel
# headers on purpose: "Qualifciation" (sic) and "Area " (trailing space)
# are looked up under exactly these names elsewhere in this file.
OUTPUT_COLUMNS = [
    "Rank", "Selection Status", "Match Score",
    "Name (Age)", "Contact", "Current Job", "Qualifciation",
    "Experience", "Publications", "Citation", "H-index",
    "Nationality", "Other Achievements", "Area ", "Comments",
    "Source Folder", "Included Documents"
]

# =============================================================
# LOAD MODEL (once at startup)
# =============================================================
# Loaded at import time so every request reuses the same model instance.
print("Loading model...")
app_model = SentenceTransformer(MODEL_NAME)
print("Model loaded.")
37
+
38
+
39
+ # =============================================================
40
+ # HELPERS
41
+ # =============================================================
42
def normalize_text(text):
    """Lower-case *text* and reduce it to single-spaced ``[a-z0-9]`` words.

    Args:
        text: Any value; it is converted with ``str``. Missing values
            (``None`` / NaN, as detected by ``pd.isna``) give "".

    Returns:
        The normalised string with no leading, trailing or repeated spaces.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()
    # Drop punctuation first: deleting a character that sits between two
    # spaces ("a - b") would otherwise leave a double space behind.
    text = re.sub(r"[^a-z0-9\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()
49
 
50
 
51
def extract_name_only(name_age_value):
    """Strip parenthesised parts from a name: 'John Smith (35)' → 'John Smith'.

    Missing values (``None`` / NaN) give "". Runs of whitespace in the
    result are collapsed to a single space.
    """
    if pd.isna(name_age_value):
        return ""
    without_parens = re.sub(r"\s*\([^)]*\)\s*", " ", str(name_age_value).strip())
    return re.sub(r"\s+", " ", without_parens).strip()
59
+
60
+
61
def name_to_tokens(name):
    """Return the normalised words of *name* that are at least two characters long."""
    words = normalize_text(name).split()
    return list(filter(lambda word: len(word) > 1, words))
64
+
65
+
66
def detect_document_type(file_name):
    """Classify a document by keywords found in its file name.

    The rules are tried top to bottom and the first one with a keyword
    contained in the lower-cased name wins; "other" is the fallback.
    """
    lowered = str(file_name).lower()

    # (keywords, label) pairs — order matters, earlier rules take precedence.
    rules = (
        (("cv", "resume"), "cv"),
        (("cover",), "cover_letter"),
        (("research",), "research_statement"),
        (("teaching",), "teaching_statement"),
        (("publication",), "publication_list"),
        (("reference",), "reference"),
        (("transcript", "degree", "certificate"), "academic_document"),
        (("passport", "visa"), "identity_document"),
    )

    for keywords, label in rules:
        if any(keyword in lowered for keyword in keywords):
            return label
    return "other"
86
+
87
+
88
+ # =============================================================
89
  # TEXT EXTRACTION
90
+ # =============================================================
91
def extract_text_from_pdf(file_path):
    """Extract the text of a PDF, trying pdfplumber first and PyMuPDF second.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The text of all pages, each followed by a newline. PyMuPDF is only
        consulted when pdfplumber produced nothing but whitespace. Returns
        "" (or whitespace) when neither library can read the file; failures
        are printed, never raised.
    """
    chunks = []

    # pdfplumber first
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                try:
                    page_text = page.extract_text()
                except Exception as e:
                    # One bad page must not discard the rest of the document.
                    print(f"[PDF page error] {file_path}: {e}")
                    continue
                if page_text:
                    chunks.append(page_text + "\n")
    except Exception as e:
        # Not fatal: the PyMuPDF fallback below gets a chance.
        print(f"[PDF error] pdfplumber failed on {file_path}: {e}")

    if "".join(chunks).strip():
        return "".join(chunks)

    # PyMuPDF fallback
    try:
        doc = fitz.open(file_path)
    except Exception as e:
        print(f"[PDF error] {file_path}: {e}")
        return "".join(chunks)

    try:
        for page in doc:
            page_text = page.get_text("text")
            if page_text:
                chunks.append(page_text + "\n")
    except Exception as e:
        print(f"[PDF error] {file_path}: {e}")
    finally:
        # Always release the file handle, even if a page failed to parse.
        doc.close()

    return "".join(chunks)
119
+
120
+
121
def extract_text_from_docx(file_path):
    """Return the non-empty paragraphs of a DOCX file, one per line.

    Any error while opening or reading the document is printed and whatever
    text was collected up to that point is returned.
    """
    pieces = []
    try:
        for paragraph in Document(file_path).paragraphs:
            if paragraph.text:
                pieces.append(f"{paragraph.text}\n")
    except Exception as e:
        print(f"[DOCX error] {file_path}: {e}")
    return "".join(pieces)
131
+
132
+
133
def extract_document_text(file_path):
    """Dispatch text extraction on the file extension (case-insensitive).

    ``.pdf`` goes to the PDF extractor, ``.docx`` / ``.doc`` to the DOCX
    extractor, ``.txt`` is read as UTF-8 with undecodable bytes ignored.
    Any other extension, or a text file that cannot be read, yields "".
    """
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == ".pdf":
        return extract_text_from_pdf(file_path)
    if suffix in (".docx", ".doc"):
        return extract_text_from_docx(file_path)
    if suffix != ".txt":
        return ""

    # A missing file raises inside open() and lands in the handler below,
    # so no separate existence check is needed.
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            return handle.read()
    except Exception:
        return ""
148
 
 
 
 
149
 
150
+ # =============================================================
151
+ # MATCHING: CV folder name → Excel row
152
+ # =============================================================
153
def match_by_token_overlap(matching_text, excel_df, min_hits=2):
    """Find the Excel row whose name tokens occur most often in *matching_text*.

    A token counts as a hit when it is a substring of the normalised text.
    Rows are ranked by hit count, ties broken by ``hits + hits / len(tokens)``;
    on a full tie the earlier row is kept.

    Args:
        matching_text: Text describing one candidate folder.
        excel_df: DataFrame with ``candidate_name_tokens`` and
            ``candidate_name_only`` columns.
        min_hits: Minimum number of token hits for a match to be accepted.

    Returns:
        ``(index_label, name)`` of the best row, or ``(None, None)`` when no
        row reaches ``min_hits``.
    """
    haystack = normalize_text(matching_text)

    best_key = (-1, -1)  # (hits, score) of the current winner
    winner_idx = None
    winner_name = None

    for idx, row in excel_df.iterrows():
        tokens = row["candidate_name_tokens"]
        if not tokens:
            continue

        hits = len([token for token in tokens if token in haystack])
        score = hits + hits / max(len(tokens), 1)

        # Lexicographic comparison: more hits wins, then higher score.
        if (hits, score) > best_key:
            best_key = (hits, score)
            winner_idx = idx
            winner_name = row["candidate_name_only"]

    if best_key[0] >= min_hits:
        return (winner_idx, winner_name)
    return (None, None)
175
+
176
+
177
+ # =============================================================
178
+ # BUILD RICH PROFILE TEXT FOR SEMANTIC MODEL
179
+ # =============================================================
180
def build_candidate_profile(row):
    """Render one candidate as a text block for the semantic model.

    The block lists the pre-filled Excel fields as ``Label: value`` lines
    (empty and "nan" values are skipped) followed by the extracted CV
    document text, if any.

    Args:
        row: Mapping-like object exposing ``.get`` (e.g. a DataFrame row).

    Returns:
        The profile text, stripped of surrounding whitespace.
    """
    # (label shown to the model, column name in the sheet). The misspelt
    # "Qualifciation" and the trailing space in "Area " mirror the Excel headers.
    labelled_columns = (
        ("Name", "Name (Age)"),
        ("Current Job", "Current Job"),
        ("Qualification", "Qualifciation"),
        ("Experience", "Experience"),
        ("Publications", "Publications"),
        ("Citations", "Citation"),
        ("H-index", "H-index"),
        ("Nationality", "Nationality"),
        ("Achievements", "Other Achievements"),
        ("Area", "Area "),
        ("Comments", "Comments"),
    )

    lines = []
    for label, column in labelled_columns:
        cell = str(row.get(column, "")).strip()
        if cell and cell.lower() != "nan":
            lines.append(f"{label}: {cell}")

    documents = str(row.get("combined_profile_text", "")).strip()
    if documents:
        lines.append(f"CV Documents:\n{documents}")

    return "\n".join(lines).strip()
213
+
214
+
215
+ # =============================================================
216
+ # MAIN PIPELINE
217
+ # =============================================================
218
def run_pipeline(zip_file_path, job_description_text):
    """Match the CVs in a ZIP to the internal Excel dataset and rank them.

    Steps: load the internal Excel file, extract the ZIP, read every
    PDF/DOCX/DOC file, build one combined profile per folder, match each
    folder to an Excel row by name-token overlap, score every matched
    profile against the job description with ``app_model``, keep the
    candidates whose score is at or above the median, and save them to an
    Excel workbook.

    Args:
        zip_file_path: Path to the uploaded ZIP archive.
        job_description_text: Job description the candidates are scored against.

    Returns:
        ``(final_output, output_path, summary)``: the shortlisted candidates
        as a DataFrame with ``OUTPUT_COLUMNS``, the path of the saved
        workbook, and a multi-line text summary.

    Raises:
        FileNotFoundError: ``INTERNAL_EXCEL_FILE`` does not exist.
        ValueError: The dataset lacks the "Name (Age)" column, the ZIP is
            invalid or holds no readable documents, or no folder could be
            matched to an Excel row.
    """
    # ------ STEP 1: Load internal Excel ------
    # Validated before any temporary directory is created, so a failure
    # here leaves nothing behind on disk.
    if not os.path.exists(INTERNAL_EXCEL_FILE):
        raise FileNotFoundError(
            f"Internal dataset not found: '{INTERNAL_EXCEL_FILE}'. "
            "Please upload it to the root of your HF Space."
        )

    df = pd.read_excel(INTERNAL_EXCEL_FILE)

    # Strip whitespace from all column names
    df.columns = df.columns.str.strip()

    # NOTE: After stripping, "Area " becomes "Area" — re-add trailing space
    # to stay consistent with Excel original
    if "Area" in df.columns and "Area " not in df.columns:
        df = df.rename(columns={"Area": "Area "})

    if "Name (Age)" not in df.columns:
        raise ValueError(
            f"Column 'Name (Age)' not found in '{INTERNAL_EXCEL_FILE}'."
        )

    df["candidate_name_raw"] = df["Name (Age)"].astype(str)
    df["candidate_name_only"] = df["candidate_name_raw"].apply(extract_name_only)
    df["candidate_name_tokens"] = df["candidate_name_only"].apply(name_to_tokens)

    # Fill NaN in key columns
    for col in ["Other Achievements", "Area ", "Comments", "Contact",
                "Current Job", "Qualifciation", "Experience",
                "Publications", "Citation", "H-index", "Nationality"]:
        if col in df.columns:
            df[col] = df[col].fillna("")

    # The extracted documents are only needed until their text has been
    # read, so they live in a directory that is removed automatically —
    # also when one of the errors below is raised.
    with tempfile.TemporaryDirectory(prefix="cv_rank_") as extract_folder:

        # ------ STEP 2: Extract ZIP ------
        try:
            with zipfile.ZipFile(zip_file_path, "r") as z:
                z.extractall(extract_folder)
        except zipfile.BadZipFile:
            raise ValueError("Invalid ZIP file.")

        # ------ STEP 3: Scan documents ------
        valid_ext = {".pdf", ".docx", ".doc"}
        doc_rows = []

        for root, _, files in os.walk(extract_folder):
            for fname in files:
                if fname.startswith(".") or fname.startswith("__"):
                    continue
                ext = os.path.splitext(fname)[1].lower()
                if ext not in valid_ext:
                    continue

                full_path = os.path.join(root, fname)
                rel_path = os.path.relpath(full_path, extract_folder)
                folder_name = os.path.dirname(rel_path)

                # A file at the top level of the ZIP is its own "folder".
                if folder_name in ("", "."):
                    folder_name = os.path.splitext(fname)[0]

                doc_rows.append({
                    "file_name": fname,
                    "full_path": full_path,
                    "folder_name": folder_name,
                    "extension": ext
                })

        if not doc_rows:
            raise ValueError("No valid PDF or DOCX files found in the ZIP.")

        # ------ STEP 4: Extract text ------
        text_rows = []
        for doc in doc_rows:
            text = extract_document_text(doc["full_path"])
            text = text.replace("\x00", " ")
            text = re.sub(r"[ \t]+", " ", text)
            text = re.sub(r"\n{3,}", "\n\n", text).strip()

            text_rows.append({
                "file_name": doc["file_name"],
                "folder_name": doc["folder_name"],
                "text": text,
                "status": "success" if text else "empty",
                "doc_type": detect_document_type(doc["file_name"])
            })

    text_df = pd.DataFrame(text_rows)

    # Keep useful doc types; fall back to all readable
    useful_types = {"cv", "cover_letter", "research_statement", "teaching_statement", "publication_list"}
    useful_df = text_df[(text_df["status"] == "success") & (text_df["doc_type"].isin(useful_types))].copy()

    if useful_df.empty:
        print("[Warning] No files matched standard doc types — using all readable files.")
        useful_df = text_df[text_df["status"] == "success"].copy()

    if useful_df.empty:
        raise ValueError("No readable documents found in the ZIP.")

    # ------ STEP 5: Build one combined profile per folder ------
    doc_priority = {"cv": 1, "research_statement": 2, "teaching_statement": 3,
                    "publication_list": 4, "cover_letter": 5, "other": 99}

    useful_df["priority"] = useful_df["doc_type"].map(doc_priority).fillna(99)
    useful_df = useful_df.sort_values(["folder_name", "priority", "file_name"]).reset_index(drop=True)

    profiles = []
    for folder_name, group in useful_df.groupby("folder_name"):
        parts = []
        included_files = []
        included_types = []

        for _, doc_row in group.iterrows():
            t = str(doc_row["text"]).strip()
            if not t:
                continue
            parts.append(
                f"\n--- {doc_row['doc_type'].upper()} | {doc_row['file_name']} ---\n{t}"
            )
            included_files.append(doc_row["file_name"])
            included_types.append(doc_row["doc_type"])

        profiles.append({
            "folder_name": folder_name,
            "combined_profile_text": "\n".join(parts).strip(),
            "included_files": " | ".join(included_files),
            "included_doc_types": " | ".join(sorted(set(included_types)))
        })

    profiles_df = pd.DataFrame(profiles)

    if profiles_df.empty:
        raise ValueError("No candidate profiles could be built.")

    # Build matching text (folder name + filenames + first 1500 chars of profile)
    profiles_df["matching_text"] = profiles_df.apply(
        lambda r: f"{r['folder_name']}\n{r['included_files']}\n{r['combined_profile_text'][:1500]}",
        axis=1
    )

    # ------ STEP 6: Match folders → Excel rows ------
    matches = []
    for _, row in profiles_df.iterrows():
        matched_idx, matched_name = match_by_token_overlap(
            row["matching_text"], df, min_hits=2
        )
        matches.append({
            "folder_name": row["folder_name"],
            "matched_excel_index": matched_idx,
            "matched_name": matched_name
        })

    matches_df = pd.DataFrame(matches)
    matched_only = matches_df[matches_df["matched_excel_index"].notna()].copy()

    if matched_only.empty:
        raise ValueError(
            "No candidates could be matched between ZIP folder names and the Excel dataset. "
            "Ensure ZIP folder names contain the candidate names from the Excel file."
        )

    # Merge with Excel rows
    merged_df = matched_only.merge(
        df.reset_index().rename(columns={"index": "excel_index"}),
        left_on="matched_excel_index",
        right_on="excel_index",
        how="left"
    )

    # ------ STEP 7: Merge with profile texts ------
    final_df = merged_df.merge(
        profiles_df[["folder_name", "combined_profile_text", "included_files", "included_doc_types"]],
        on="folder_name",
        how="left"
    )

    for col in ["combined_profile_text", "included_files", "included_doc_types"]:
        final_df[col] = final_df[col].fillna("")

    # Build rich profile string for model
    final_df["candidate_profile_for_model"] = final_df.apply(build_candidate_profile, axis=1)

    # ------ STEP 8: Semantic scoring ------
    job_embedding = app_model.encode(
        job_description_text,
        convert_to_tensor=True,
        normalize_embeddings=True
    )

    cand_embeddings = app_model.encode(
        final_df["candidate_profile_for_model"].tolist(),
        convert_to_tensor=True,
        normalize_embeddings=True
    )

    scores = util.cos_sim(job_embedding, cand_embeddings)[0]
    final_df["Match Score"] = scores.cpu().numpy().round(4)

    # ------ STEP 9: Rank and shortlist (at or above median) ------
    ranked_df = final_df.sort_values("Match Score", ascending=False).reset_index(drop=True)
    threshold = ranked_df["Match Score"].median()

    shortlisted = ranked_df[ranked_df["Match Score"] >= threshold].copy().reset_index(drop=True)
    shortlisted["Rank"] = shortlisted.index + 1
    shortlisted["Selection Status"] = "Selected"
    shortlisted["Source Folder"] = shortlisted["folder_name"]
    shortlisted["Included Documents"] = shortlisted["included_doc_types"]

    # ------ STEP 10: Build final output with exact Excel columns ------
    # Columns absent from the dataset are added empty, so every entry of
    # OUTPUT_COLUMNS can be selected below.
    for col in OUTPUT_COLUMNS:
        if col not in shortlisted.columns:
            shortlisted[col] = ""

    final_output = shortlisted[OUTPUT_COLUMNS].copy()

    # ------ STEP 11: Save Excel ------
    # The workbook must outlive this call (it is handed out for download),
    # so it goes into its own directory that is not cleaned up here.
    output_dir = tempfile.mkdtemp(prefix="cv_rank_")
    output_path = os.path.join(output_dir, "shortlisted_ranked_candidates.xlsx")

    with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
        final_output.to_excel(writer, index=False, sheet_name="Shortlisted Candidates")

        # Auto-adjust column widths (capped at 60 characters)
        worksheet = writer.sheets["Shortlisted Candidates"]
        for i, col in enumerate(final_output.columns):
            max_len = max(
                final_output[col].astype(str).map(len).max(),
                len(col)
            )
            worksheet.set_column(i, i, min(max_len + 2, 60))

    summary = (
        f"Total candidates processed : {len(ranked_df)}\n"
        f"Shortlisted (above median) : {len(final_output)}\n"
        f"Match score threshold : {threshold:.4f}\n"
        f"Unmatched folders skipped : {len(matches_df) - len(matched_only)}"
    )

    return final_output, output_path, summary
464
+
465
+
466
+ # =============================================================
467
+ # GRADIO WRAPPER
468
+ # =============================================================
469
def gradio_app(zip_file, job_description_text):
    """Gradio callback: validate the inputs and run the ranking pipeline.

    Args:
        zip_file: Uploaded archive, either a path string or an object
            exposing the path as ``.name``.
        job_description_text: Text pasted into the job-description box.

    Returns:
        The ``(results_df, output_path, summary)`` tuple from ``run_pipeline``.

    Raises:
        gr.Error: For missing inputs and for any failure inside the
            pipeline (the original message is prefixed with "Error: ").
    """
    try:
        if zip_file is None:
            raise gr.Error("Please upload the ZIP file containing candidate CVs.")

        has_description = job_description_text and str(job_description_text).strip()
        if not has_description:
            raise gr.Error("Please provide the job description.")

        if isinstance(zip_file, str):
            archive_path = zip_file
        else:
            archive_path = zip_file.name

        table, workbook_path, report = run_pipeline(archive_path, job_description_text)
        return table, workbook_path, report

    except gr.Error:
        # Already user-facing; pass through untouched.
        raise
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}")
486
 
487
 
488
+ # =============================================================
489
  # GRADIO UI
490
+ # =============================================================
491
# Two-column layout: inputs and the run button on the left, the summary,
# result table and download link on the right.
with gr.Blocks(title="AI CV Matching & Ranking System") as demo:

    gr.Markdown("""
    # AI-Based CV Matching & Ranking System
    Upload a ZIP file of candidate CVs and paste the job description.
    The system matches CVs to the internal candidate dataset, scores them
    with a fine-tuned semantic model, and returns a ranked shortlist Excel file.
    """)

    with gr.Row():
        with gr.Column():
            # type="filepath" hands gradio_app the upload as a path string.
            zip_input = gr.File(
                label="Upload Candidate CV ZIP File",
                file_types=[".zip"],
                type="filepath"
            )
            job_input = gr.Textbox(
                label="Paste Job Description",
                lines=15,
                placeholder="Paste the full job description here..."
            )
            run_button = gr.Button("Match & Rank Candidates", variant="primary")

        with gr.Column():
            summary_output = gr.Textbox(
                label="Processing Summary",
                lines=5,
                interactive=False
            )
            results_output = gr.Dataframe(
                label="Shortlisted Ranked Candidates",
                interactive=False,
                wrap=True
            )
            excel_download = gr.File(
                label="Download Ranked Excel Output"
            )

    # The order of `outputs` must match gradio_app's return tuple:
    # (results DataFrame, Excel path, summary text).
    run_button.click(
        fn=gradio_app,
        inputs=[zip_input, job_input],
        outputs=[results_output, excel_download, summary_output]
    )

# Start the Gradio server.
demo.launch()