Spaces:

csAhmad
/

CV_Job_Matching_AI_Model

Sleeping

App Files Community

csAhmad committed on Apr 28

Commit

6640ec0

verified ·

1 Parent(s): 6c77377

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -12

app.py CHANGED Viewed

@@ -19,13 +19,11 @@ INTERNAL_EXCEL_FILE = "Summary_of_Faculty_Rankig_16th Feb 2025.xlsx"
 # Your fine-tuned model on Hugging Face Hub
 MODEL_NAME = "csAhmad/zoraiz-model"
-# Exact output columns matching your Excel (Area has a trailing space — preserved)
 OUTPUT_COLUMNS = [
-    "Rank", "Selection Status", "Match Score",
     "Name (Age)", "Contact", "Current Job", "Qualifciation",
     "Experience", "Publications", "Citation", "H-index",
-    "Nationality", "Other Achievements", "Area ", "Comments",
-    "Source Folder", "Included Documents"
 ]
 # =============================================================
@@ -49,13 +47,32 @@ def normalize_text(text):
 def extract_name_only(name_age_value):
-    """'John Smith (35)' → 'John Smith'"""
     if pd.isna(name_age_value):
         return ""
     text = str(name_age_value).strip()
-    text = re.sub(r"\s*\([^)]*\)\s*", " ", text)
-    text = re.sub(r"\s+", " ", text).strip()
-    return text
 def name_to_tokens(name):
@@ -421,10 +438,8 @@ def run_pipeline(zip_file_path, job_description_text):
     threshold   = ranked_df["Match Score"].median()
     shortlisted = ranked_df[ranked_df["Match Score"] >= threshold].copy().reset_index(drop=True)
-    shortlisted["Rank"]             = shortlisted.index + 1
-    shortlisted["Selection Status"] = "Selected"
-    shortlisted["Source Folder"]    = shortlisted["folder_name"]
-    shortlisted["Included Documents"] = shortlisted["included_doc_types"]
     # ------ STEP 10: Build final output with exact Excel columns ------
     # Ensure all output columns exist

 # Your fine-tuned model on Hugging Face Hub
 MODEL_NAME = "csAhmad/zoraiz-model"
+# Output columns, matching the original Excel exactly (note: "Area " keeps its trailing space)
 OUTPUT_COLUMNS = [
     "Name (Age)", "Contact", "Current Job", "Qualifciation",
     "Experience", "Publications", "Citation", "H-index",
+    "Nationality", "Other Achievements", "Area ", "Comments"
 ]
 # =============================================================
 def extract_name_only(name_age_value):
+    """Strips URLs, age brackets, and returns clean name only."""
     if pd.isna(name_age_value):
         return ""
     text = str(name_age_value).strip()
+    # Remove URLs
+    text = re.sub(r'https?://\S+', '', text)
+    # Remove age/date in brackets e.g. (35) or (Date of birth: ...)
+    text = re.sub(r'\([^)]*\)', '', text)
+    # Find first line that looks like a real name
+    lines = [l.strip() for l in text.split('\n') if l.strip()]
+    name = ""
+    for line in lines:
+        # Skip emails, long lines, pure numbers, known non-name keywords
+        if '@' in line or len(line) > 60:
+            continue
+        if re.match(r'^[\d\s\+\-\(\)]+$', line):
+            continue
+        if any(kw in line.lower() for kw in ['scholar', 'citation', 'http', 'www', 'email', 'phone', 'mobile']):
+            continue
+        name = line
+        break
+    return re.sub(r'\s+', ' ', name).strip()
 def name_to_tokens(name):
     threshold   = ranked_df["Match Score"].median()
     shortlisted = ranked_df[ranked_df["Match Score"] >= threshold].copy().reset_index(drop=True)
+    # Clean up Name (Age) — strip URLs and show name only
+    shortlisted["Name (Age)"] = shortlisted["Name (Age)"].apply(extract_name_only)
     # ------ STEP 10: Build final output with exact Excel columns ------
     # Ensure all output columns exist