Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,13 +19,11 @@ INTERNAL_EXCEL_FILE = "Summary_of_Faculty_Rankig_16th Feb 2025.xlsx"
|
|
| 19 |
# Your fine-tuned model on Hugging Face Hub
|
| 20 |
MODEL_NAME = "csAhmad/zoraiz-model"
|
| 21 |
|
| 22 |
-
# Exact output columns
|
| 23 |
OUTPUT_COLUMNS = [
|
| 24 |
-
"Rank", "Selection Status", "Match Score",
|
| 25 |
"Name (Age)", "Contact", "Current Job", "Qualifciation",
|
| 26 |
"Experience", "Publications", "Citation", "H-index",
|
| 27 |
-
"Nationality", "Other Achievements", "Area ", "Comments"
|
| 28 |
-
"Source Folder", "Included Documents"
|
| 29 |
]
|
| 30 |
|
| 31 |
# =============================================================
|
|
@@ -49,13 +47,32 @@ def normalize_text(text):
|
|
| 49 |
|
| 50 |
|
| 51 |
def extract_name_only(name_age_value):
|
| 52 |
-
"""
|
| 53 |
if pd.isna(name_age_value):
|
| 54 |
return ""
|
| 55 |
text = str(name_age_value).strip()
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
|
| 61 |
def name_to_tokens(name):
|
|
@@ -421,10 +438,8 @@ def run_pipeline(zip_file_path, job_description_text):
|
|
| 421 |
threshold = ranked_df["Match Score"].median()
|
| 422 |
|
| 423 |
shortlisted = ranked_df[ranked_df["Match Score"] >= threshold].copy().reset_index(drop=True)
|
| 424 |
-
|
| 425 |
-
shortlisted["
|
| 426 |
-
shortlisted["Source Folder"] = shortlisted["folder_name"]
|
| 427 |
-
shortlisted["Included Documents"] = shortlisted["included_doc_types"]
|
| 428 |
|
| 429 |
# ------ STEP 10: Build final output with exact Excel columns ------
|
| 430 |
# Ensure all output columns exist
|
|
|
|
| 19 |
# Your fine-tuned model on Hugging Face Hub
|
| 20 |
MODEL_NAME = "csAhmad/zoraiz-model"
|
| 21 |
|
| 22 |
+
# Exact output columns — matches your original Excel exactly
|
| 23 |
OUTPUT_COLUMNS = [
|
|
|
|
| 24 |
"Name (Age)", "Contact", "Current Job", "Qualifciation",
|
| 25 |
"Experience", "Publications", "Citation", "H-index",
|
| 26 |
+
"Nationality", "Other Achievements", "Area ", "Comments"
|
|
|
|
| 27 |
]
|
| 28 |
|
| 29 |
# =============================================================
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
def extract_name_only(name_age_value):
    """Return the bare candidate name from a raw "Name (Age)" cell value.

    The cell may contain URLs, bracketed age / date-of-birth notes, e-mail
    addresses, phone numbers and other contact lines. The first line that
    survives the filters below is returned with whitespace collapsed.

    Args:
        name_age_value: Scalar cell value (string, NaN, None, ...).

    Returns:
        The cleaned name, or "" when the value is missing or no line
        looks like a name.
    """
    if pd.isna(name_age_value):
        return ""
    text = str(name_age_value).strip()

    # Remove URLs
    text = re.sub(r'https?://\S+', '', text)

    # Remove age/date in brackets e.g. (35) or (Date of birth: ...)
    text = re.sub(r'\([^)]*\)', '', text)

    skip_keywords = ('scholar', 'citation', 'http', 'www', 'email', 'phone', 'mobile')

    # splitlines() also handles lone "\r" and other line separators that
    # a plain split('\n') would leave glued into a single line.
    for raw_line in text.splitlines():
        # Balanced brackets are already gone, so any "(" still present was
        # never closed (e.g. "Ali Khan (35"); drop it and what follows.
        line = re.sub(r'\([^)]*$', '', raw_line).strip()
        if not line:
            continue
        # Skip emails and lines too long to be a name
        if '@' in line or len(line) > 60:
            continue
        # Skip pure numbers / phone-number-like lines
        if re.match(r'^[\d\s\+\-\(\)]+$', line):
            continue
        # Skip lines carrying known non-name keywords
        lowered = line.lower()
        if any(kw in lowered for kw in skip_keywords):
            continue
        # Collapse whitespace, then trim separator punctuation that was
        # left dangling next to a removed bracket (e.g. "Name, (35)").
        candidate = re.sub(r'\s+', ' ', line).strip(' ,;:|-')
        if candidate:
            return candidate

    return ""
|
| 76 |
|
| 77 |
|
| 78 |
def name_to_tokens(name):
|
|
|
|
| 438 |
threshold = ranked_df["Match Score"].median()
|
| 439 |
|
| 440 |
shortlisted = ranked_df[ranked_df["Match Score"] >= threshold].copy().reset_index(drop=True)
|
| 441 |
+
# Clean up Name (Age) — strip URLs and show name only
|
| 442 |
+
shortlisted["Name (Age)"] = shortlisted["Name (Age)"].apply(extract_name_only)
|
|
|
|
|
|
|
| 443 |
|
| 444 |
# ------ STEP 10: Build final output with exact Excel columns ------
|
| 445 |
# Ensure all output columns exist
|