csAhmad committed on
Commit
6640ec0
·
verified ·
1 Parent(s): 6c77377

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -12
app.py CHANGED
@@ -19,13 +19,11 @@ INTERNAL_EXCEL_FILE = "Summary_of_Faculty_Rankig_16th Feb 2025.xlsx"
19
  # Your fine-tuned model on Hugging Face Hub
20
  MODEL_NAME = "csAhmad/zoraiz-model"
21
 
22
- # Exact output columns matching your Excel (Area has a trailing space — preserved)
23
  OUTPUT_COLUMNS = [
24
- "Rank", "Selection Status", "Match Score",
25
  "Name (Age)", "Contact", "Current Job", "Qualifciation",
26
  "Experience", "Publications", "Citation", "H-index",
27
- "Nationality", "Other Achievements", "Area ", "Comments",
28
- "Source Folder", "Included Documents"
29
  ]
30
 
31
  # =============================================================
@@ -49,13 +47,32 @@ def normalize_text(text):
49
 
50
 
51
  def extract_name_only(name_age_value):
52
- """'John Smith (35)' → 'John Smith'"""
53
  if pd.isna(name_age_value):
54
  return ""
55
  text = str(name_age_value).strip()
56
- text = re.sub(r"\s*\([^)]*\)\s*", " ", text)
57
- text = re.sub(r"\s+", " ", text).strip()
58
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
 
61
  def name_to_tokens(name):
@@ -421,10 +438,8 @@ def run_pipeline(zip_file_path, job_description_text):
421
  threshold = ranked_df["Match Score"].median()
422
 
423
  shortlisted = ranked_df[ranked_df["Match Score"] >= threshold].copy().reset_index(drop=True)
424
- shortlisted["Rank"] = shortlisted.index + 1
425
- shortlisted["Selection Status"] = "Selected"
426
- shortlisted["Source Folder"] = shortlisted["folder_name"]
427
- shortlisted["Included Documents"] = shortlisted["included_doc_types"]
428
 
429
  # ------ STEP 10: Build final output with exact Excel columns ------
430
  # Ensure all output columns exist
 
19
  # Your fine-tuned model on Hugging Face Hub
20
  MODEL_NAME = "csAhmad/zoraiz-model"
21
 
22
+ # Output columns exactly match the original Excel file
23
  OUTPUT_COLUMNS = [
 
24
  "Name (Age)", "Contact", "Current Job", "Qualifciation",
25
  "Experience", "Publications", "Citation", "H-index",
26
+ "Nationality", "Other Achievements", "Area ", "Comments"
 
27
  ]
28
 
29
  # =============================================================
 
47
 
48
 
49
  def extract_name_only(name_age_value):
50
+ """Strips URLs, age brackets, and returns clean name only."""
51
  if pd.isna(name_age_value):
52
  return ""
53
  text = str(name_age_value).strip()
54
+
55
+ # Remove URLs
56
+ text = re.sub(r'https?://\S+', '', text)
57
+
58
+ # Remove age/date in brackets e.g. (35) or (Date of birth: ...)
59
+ text = re.sub(r'\([^)]*\)', '', text)
60
+
61
+ # Find first line that looks like a real name
62
+ lines = [l.strip() for l in text.split('\n') if l.strip()]
63
+ name = ""
64
+ for line in lines:
65
+ # Skip emails, long lines, pure numbers, known non-name keywords
66
+ if '@' in line or len(line) > 60:
67
+ continue
68
+ if re.match(r'^[\d\s\+\-\(\)]+$', line):
69
+ continue
70
+ if any(kw in line.lower() for kw in ['scholar', 'citation', 'http', 'www', 'email', 'phone', 'mobile']):
71
+ continue
72
+ name = line
73
+ break
74
+
75
+ return re.sub(r'\s+', ' ', name).strip()
76
 
77
 
78
  def name_to_tokens(name):
 
438
  threshold = ranked_df["Match Score"].median()
439
 
440
  shortlisted = ranked_df[ranked_df["Match Score"] >= threshold].copy().reset_index(drop=True)
441
+ # Clean up Name (Age) — strip URLs and show name only
442
+ shortlisted["Name (Age)"] = shortlisted["Name (Age)"].apply(extract_name_only)
 
 
443
 
444
  # ------ STEP 10: Build final output with exact Excel columns ------
445
  # Ensure all output columns exist