import os
os.environ["WANDB_DISABLED"] = "true" # Disable online logging
import gradio as gr
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
# ------------------------
# Load CSV and pre-fine-tuned model
# ------------------------
MODEL_PATH = os.path.join(os.path.dirname(__file__), "fine_tuned_internship_model")
model = SentenceTransformer(MODEL_PATH)
df = pd.read_csv("converted (1).csv")
# ------------------------
# Normalization functions
# ------------------------
SKILL_MAP = {
"js": "javascript", "reactjs": "react", "nodejs": "node.js",
"cpp": "c++", "btech": "bachelor of technology",
"cs": "computer science", "ece": "electronics and communication",
"ml": "machine learning", "ai": "artificial intelligence",
"it": "information technology"
}
def normalize_text(text):
if not text: return ""
text = text.lower().replace(".", "").replace(",", "")
for k,v in SKILL_MAP.items(): text = text.replace(k,v)
return text.strip()
def normalize_skills(skills_str):
skills = [normalize_text(s.strip()) for s in skills_str.split(",")]
return [s for s in skills if s]
def location_similarity(candidate_loc, internship_loc):
candidate_loc = candidate_loc.lower().strip()
internship_loc = internship_loc.lower().strip()
if candidate_loc == internship_loc: return 100
if candidate_loc.split()[0] == internship_loc.split()[0]: return 70
return 0
# ------------------------
# Cache embeddings for all internships
# ------------------------
cached_internships = []
for _, row in df.iterrows():
internship_skills = normalize_skills(row['Required_Skills'])
internship_skill_embs = [model.encode(s, convert_to_tensor=True) for s in internship_skills]
internship_edu_emb = model.encode(normalize_text(row['Student_Education']), convert_to_tensor=True)
internship_interest_emb = model.encode(normalize_text(row['Student_Interest']), convert_to_tensor=True)
cached_internships.append({
"row": row,
"skill_embs": internship_skill_embs,
"edu_emb": internship_edu_emb,
"interest_emb": internship_interest_emb
})
weights = np.array([0.4,0.3,0.2,0.1]) # Skills, Education, Interest, Location
intercept = 0
# ------------------------
# Matching function
# ------------------------
def match_internship(skills, education, interest, location):
candidate_skills_input = normalize_skills(skills)
candidate_education_input = normalize_text(education)
candidate_interest_input = normalize_text(interest)
candidate_location_input = location
candidate_skill_embs_input = [model.encode(s, convert_to_tensor=True) for s in candidate_skills_input]
candidate_edu_emb_input = model.encode(candidate_education_input, convert_to_tensor=True)
candidate_interest_emb_input = model.encode(candidate_interest_input, convert_to_tensor=True)
results = []
for internship in cached_internships:
row = internship["row"]
# Skill similarity
skill_sims = []
for c_emb in candidate_skill_embs_input:
max_sim = max([util.cos_sim(c_emb, i_emb).item() for i_emb in internship["skill_embs"]], default=0)
skill_sims.append(max_sim)
skills_sim = np.mean(skill_sims)*100 if skill_sims else 0
# Education similarity
edu_sim = util.cos_sim(candidate_edu_emb_input, internship["edu_emb"]).item()*100
# Interest similarity
interest_sim = util.cos_sim(candidate_interest_emb_input, internship["interest_emb"]).item()*100
# Location similarity
loc_sim = location_similarity(candidate_location_input, row['Location'])
# Overall match
overall = np.dot(weights, [skills_sim, edu_sim, interest_sim, loc_sim]) + intercept
results.append({
"Company": row['Company'],
"Position": row['Position'],
"Skills_Match": skills_sim,
"Education_Match": edu_sim,
"Interest_Match": interest_sim,
"Location_Match": loc_sim,
"Overall_Match": overall
})
results.sort(key=lambda x: x['Overall_Match'], reverse=True)
# Return top 5 as list of dicts
return results[:5]
# ------------------------
# Gradio interface
# ------------------------
inputs = [
gr.Textbox(label="Your Skills (comma-separated)"),
gr.Textbox(label="Your Education"),
gr.Textbox(label="Your Interest / Field"),
gr.Textbox(label="Your Location")
]
outputs = gr.JSON(label="Top 5 Internship Matches")
demo = gr.Interface(
fn=match_internship,
inputs=inputs,
outputs=outputs,
title="Super-Intelligent Internship Matcher",
description="Enter your skills, education, interest, and location to get top 5 internship matches."
)
if __name__ == "__main__":
demo.launch()