Spaces:

zlf18
/

test2

Sleeping

App Files Files Community

zlf18 committed on Oct 12, 2025

Commit

ac17735

verified ·

1 Parent(s): df56c68

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -50

app.py CHANGED Viewed

@@ -196,44 +196,41 @@ def initialize_data_and_model():
         ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
         original_df = ds["original"].to_pandas()
-        def extract_skills_llm(text: str) -> list[str]:
-            if not isinstance(text, str) or len(text.strip()) < 20 or not LLM_PIPELINE: return []
             prompt = f"""
-Instruct: You are an expert technical recruiter. Extract the key skills from the job description text. List technical and soft skills as a comma-separated string.
-[Example 1]
-Text: "Requires 3+ years of experience in cloud infrastructure. Must be proficient in AWS, particularly EC2 and S3. Experience with Terraform for IaC is a plus."
-Extracted Skills: cloud infrastructure, aws, ec2, s3, terraform, infrastructure as code
-[Example 2]
-Text: "Seeking a team lead with strong project management abilities. Must communicate effectively with stakeholders and manage timelines using Agile methodologies like Scrum."
-Extracted Skills: project management, leadership, stakeholder communication, agile, scrum
-[Actual Task]
-Text: "{text}"
-Extracted Skills:
-"""
             try:
                 response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
                 generated_text = response[0]['generated_text']
-                skills_part = generated_text.split("Extracted Skills:")[-1].strip()
                 skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
                 return list(dict.fromkeys(s.lower() for s in skills))
-            except Exception: return []
-        def extract_skills_nltk(text: str) -> list[str]:
-            if not isinstance(text, str): return []
-            text_lower = text.lower()
-            grammar = "NP: {<JJ.*>*<NN.*>+}"
-            chunk_parser = nltk.RegexpParser(grammar)
-            tokens = nltk.word_tokenize(text_lower)
-            tagged_tokens = nltk.pos_tag(tokens)
-            chunked_text = chunk_parser.parse(tagged_tokens)
-            potential_skills = set()
-            for subtree in chunked_text.subtrees():
-                if subtree.label() == 'NP':
-                    phrase = " ".join(word for word, tag in subtree.leaves())
-                    if _norm_skill_token(phrase) in SKILL_WHITELIST:
-                        potential_skills.add(_norm_skill_token(phrase))
-            return sorted(list(potential_skills))
         def extract_skills_direct_scan(text: str) -> list[str]:
             if not isinstance(text, str): return []
             found_skills = set()
@@ -247,12 +244,12 @@ Extracted Skills:
             skills_to_add = 6 - len(existing_skills)
             prompt = f"""
-Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
-Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
-List only the new skills, separated by commas. Do not repeat skills from the original list.
-Additional Skills:
-"""
             try:
                 response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
                 generated_text = response[0]['generated_text']
@@ -262,31 +259,38 @@ Additional Skills:
             except Exception:
                 return []
         def extract_skills_hybrid(row) -> list[str]:
-            text = row['text_for_skills']
-            job_title = row.get('Job title', '') # Use original Job title for context
-            llm_skills = extract_skills_llm(text)
-            nltk_skills = extract_skills_nltk(text)
-            direct_skills = extract_skills_direct_scan(text)
-            combined_skills = set(llm_skills) | set(nltk_skills) | set(direct_skills)
-            # If the combined list is still too short, expand it
             if len(combined_skills) < 6:
                 expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
                 combined_skills.update(expanded_skills)
             return sorted(list(combined_skills))
-        def create_text_for_skills(row):
-            return " ".join([str(s) for s in [row.get("Job title"), row.get("Duties"), row.get("qualifications"), row.get("Description")] if pd.notna(s)])
-        original_df["text_for_skills"] = original_df.apply(create_text_for_skills, axis=1)
         print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
         # Apply the hybrid function row-wise to include job title context
         original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
-        original_df = original_df.drop(columns=['text_for_skills'])
         print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
         original_df.to_parquet(PROCESSED_DATA_PATH)

         ds = datasets.load_dataset("its-zion-18/Jobs-tabular-dataset")
         original_df = ds["original"].to_pandas()
+        # --- NEW: Advanced LLM Skill Extractor ---
+        # This new function uses a much more detailed prompt to get niche, specific skills.
+        def extract_skills_llm_advanced(job_title: str, duties: str, qualifications: str) -> list[str]:
+            """Extract a skill list from one job posting using the LLM pipeline.
+
+            Builds a prompt from the job title, duties and qualifications, runs
+            LLM_PIPELINE on it, and parses the text after the last
+            "[Extracted Skills]" marker as a comma-separated list.
+
+            Returns:
+                Lower-cased skills with duplicates removed, in first-seen order.
+                An empty list when LLM_PIPELINE is falsy, or when generation or
+                parsing raises (the error is printed, not re-raised).
+            """
+            if not LLM_PIPELINE: return []
+            # We combine the most important fields to give the LLM full context.
+            full_context = f"Job Title: {job_title}\n\nDuties: {duties}\n\nQualifications: {qualifications}"
+            # This prompt is highly specific to encourage better, more niche results.
             prompt = f"""
+        Instruct: You are a highly specialized technical recruiter and hiring manager. Your task is to meticulously extract a comprehensive list of the most critical and specific skills from the provided job description, paying special attention to the 'qualifications' and 'duties' sections.
+        Identify specific programming languages, software tools (e.g., AutoCAD, Figma, SAP), cloud technologies (e.g., AWS S3, Azure DevOps), data analysis tools (e.g., Tableau, Power BI), engineering concepts, and industry standards (e.g., ISO 13485, GMP).
+        Avoid overly generic soft skills like 'teamwork' or 'communication' unless they are explicitly emphasized as a core requirement. Prioritize tangible, niche competencies that truly define the role.
+        Return a single, comma-separated string of the extracted skills. Do not add any preamble or explanation.
+        [Job Description Context]
+        {full_context}
+        [Extracted Skills]
+        """
             try:
+                # NOTE(review): temperature is presumably ignored when do_sample=False
+                # (greedy decoding) -- confirm against the pipeline's generation docs.
                 response = LLM_PIPELINE(prompt, max_new_tokens=150, do_sample=False, temperature=0.1)
                 generated_text = response[0]['generated_text']
+                # Robustly find the skills part after the final indicator
+                # NOTE(review): presumably generated_text echoes the prompt, which is
+                # why the last marker-delimited piece is taken -- verify.
+                skills_part = generated_text.split("[Extracted Skills]")[-1].strip()
                 skills = [skill.strip() for skill in skills_part.split(',') if skill.strip()]
+                # Return a de-duplicated list, preserving order as much as possible
                 return list(dict.fromkeys(s.lower() for s in skills))
+            except Exception as e:
+                print(f"LLM skill extraction failed: {e}")
+                return []
         def extract_skills_direct_scan(text: str) -> list[str]:
             if not isinstance(text, str): return []
             found_skills = set()
             skills_to_add = 6 - len(existing_skills)
             prompt = f"""
+        Instruct: A job has the title "{job_title}" and requires the skills: {', '.join(existing_skills)}.
+        Based on this, what are {skills_to_add} additional, closely related skills typically required for such a role?
+        List only the new skills, separated by commas. Do not repeat skills from the original list.
+        Additional Skills:
+        """
             try:
                 response = LLM_PIPELINE(prompt, max_new_tokens=50, do_sample=True, temperature=0.5)
                 generated_text = response[0]['generated_text']
             except Exception:
                 return []
+        # --- MODIFIED: Hybrid Skill Extraction Logic ---
+        # This function is now simpler and more powerful. It prioritizes the advanced LLM extractor.
         def extract_skills_hybrid(row) -> list[str]:
+            """Return the sorted, de-duplicated skills for one job row.
+
+            Unions the advanced LLM extractor's output with the direct scan
+            and, when fewer than 6 skills result, asks the LLM for related ones.
+            """
+            # Missing cells come through as NaN/None; str() would turn them into
+            # the literal text "nan"/"None" and leak that into the LLM prompt and
+            # the direct scan, so map them to an empty string instead.
+            def _field(name: str) -> str:
+                value = row.get(name)
+                return "" if pd.isna(value) else str(value)
+            job_title = _field('Job title')
+            duties = _field('Duties')
+            qualifications = _field('qualifications')
+            description = _field('Description')
+            # The direct scan sees all four fields, including the description
+            full_text_for_scan = " ".join([job_title, duties, qualifications, description])
+            # Primary source: the advanced LLM extractor (title, duties, qualifications)
+            advanced_llm_skills = extract_skills_llm_advanced(job_title, duties, qualifications)
+            # Secondary source: a direct scan of the full text
+            direct_skills = extract_skills_direct_scan(full_text_for_scan)
+            # Plain set union: duplicates collapse, neither source takes priority
+            combined_skills = set(advanced_llm_skills) | set(direct_skills)
+            # If the combined list is still too short, use the LLM to expand it
             if len(combined_skills) < 6:
                 expanded_skills = expand_skills_with_llm(job_title, list(combined_skills))
                 combined_skills.update(expanded_skills)
             return sorted(list(combined_skills))
         print("--- Extracting skills with HYBRID ACCURACY model. Please wait... ---")
         # Apply the hybrid function row-wise to include job title context
         original_df['Skills'] = original_df.progress_apply(extract_skills_hybrid, axis=1)
         print(f"--- Saving processed data to {PROCESSED_DATA_PATH} for faster future startups ---")
         original_df.to_parquet(PROCESSED_DATA_PATH)