Spaces:

harshlaghave
/

internship_matcher

Sleeping

App Files Files Community

internship_matcher / app.py

harshlaghave

Update app.py

e32d554 verified 5 months ago

raw

history blame contribute delete

4.86 kB

	import os
	import re
	os.environ["WANDB_DISABLED"] = "true" # Disable online logging

	import gradio as gr
	import numpy as np
	import pandas as pd
	from sentence_transformers import SentenceTransformer, util

	# ------------------------
	# Load CSV and pre-fine-tuned model
	# ------------------------
	# Both bundled assets are located relative to this file, so the app starts
	# from any working directory instead of only from its own folder.
	MODEL_PATH = os.path.join(os.path.dirname(__file__), "fine_tuned_internship_model")
	DATA_PATH = os.path.join(os.path.dirname(__file__), "converted (1).csv")

	# Sentence-embedding model shared by every encode call in this module.
	model = SentenceTransformer(MODEL_PATH)
	# Internship listings; the columns read later in this file are
	# Required_Skills, Student_Education, Student_Interest, Location,
	# Company and Position.
	df = pd.read_csv(DATA_PATH)

	# ------------------------
	# Normalization functions
	# ------------------------
	# Abbreviation -> canonical spelling. Keys are lowercase and contain no
	# punctuation because normalize_text strips "." and "," before the lookup.
	SKILL_MAP = {
	    "js": "javascript", "reactjs": "react", "nodejs": "node.js",
	    "cpp": "c++", "btech": "bachelor of technology",
	    "cs": "computer science", "ece": "electronics and communication",
	    "ml": "machine learning", "ai": "artificial intelligence",
	    "it": "information technology",
	}

	# Matches a SKILL_MAP key only when it stands alone as a whole word, so
	# "git" or "html" are not rewritten just because they contain "it" / "ml".
	_SKILL_PATTERN = re.compile(
	    r"\b(?:" + "|".join(re.escape(key) for key in SKILL_MAP) + r")\b"
	)


	def normalize_text(text):
	    """Return a lowercase, abbreviation-expanded version of ``text``.

	    Periods and commas are removed, then every whole word that is a key of
	    ``SKILL_MAP`` is replaced by its canonical form, and surrounding
	    whitespace is stripped.

	    Args:
	        text: Free-text value to normalize.

	    Returns:
	        The normalized string, or ``""`` when ``text`` is empty or is not a
	        string (for example ``None`` or a NaN float).
	    """
	    if not text or not isinstance(text, str):
	        return ""
	    cleaned = text.lower().replace(".", "").replace(",", "")
	    # A single pass means an expansion is never re-expanded by a later rule,
	    # and "reactjs" / "nodejs" are no longer mangled by the shorter "js" rule.
	    expanded = _SKILL_PATTERN.sub(lambda found: SKILL_MAP[found.group(0)], cleaned)
	    return expanded.strip()

	def normalize_skills(skills_str):
	    """Split a comma-separated skills string into normalized skill names.

	    Args:
	        skills_str: Skills separated by commas, e.g. ``"python, js"``.

	    Returns:
	        A list with one ``normalize_text`` result per entry, in input order;
	        entries that normalize to an empty string are dropped. A non-string
	        value (``None``, or the NaN pandas uses for an empty CSV cell)
	        yields an empty list.
	    """
	    # Non-strings have no .split(); treat them as "no skills listed"
	    # rather than raising AttributeError.
	    if not isinstance(skills_str, str):
	        return []
	    normalized = (normalize_text(part.strip()) for part in skills_str.split(","))
	    return [skill for skill in normalized if skill]

	def location_similarity(candidate_loc, internship_loc):
	    """Score how closely two free-text locations agree.

	    The comparison is case-insensitive and ignores surrounding whitespace.

	    Args:
	        candidate_loc: Location entered by the candidate.
	        internship_loc: Location of the internship listing.

	    Returns:
	        100 when the cleaned strings are identical, 70 when only their first
	        words match, otherwise 0. A non-string value on either side scores 0.
	    """
	    # Missing values (None, NaN) cannot match anything.
	    if not isinstance(candidate_loc, str) or not isinstance(internship_loc, str):
	        return 0
	    candidate = candidate_loc.lower().strip()
	    internship = internship_loc.lower().strip()
	    if candidate == internship:
	        return 100
	    candidate_words = candidate.split()
	    internship_words = internship.split()
	    # Past the equality check at most one side can be blank; indexing [0] on
	    # an empty word list used to raise IndexError, so require both sides.
	    if candidate_words and internship_words and candidate_words[0] == internship_words[0]:
	        return 70
	    return 0

	# ------------------------
	# Cache embeddings for all internships
	# ------------------------
	def _encode(text):
	    """Embed a single string with the shared model and return it as a tensor."""
	    return model.encode(text, convert_to_tensor=True)


	def _cache_entry(row):
	    """Bundle one internship row with the embeddings computed from it."""
	    return {
	        "row": row,
	        "skill_embs": [_encode(skill) for skill in normalize_skills(row['Required_Skills'])],
	        "edu_emb": _encode(normalize_text(row['Student_Education'])),
	        "interest_emb": _encode(normalize_text(row['Student_Interest'])),
	    }


	# Every listing is embedded once at import time, so a query only has to
	# embed the candidate's own inputs.
	cached_internships = [_cache_entry(row) for _, row in df.iterrows()]

	# Weights of the component scores, in the order
	# [skills, education, interest, location]; they sum to 1.
	weights = np.array([0.4, 0.3, 0.2, 0.1])
	# Constant added to every weighted score.
	intercept = 0

	# ------------------------
	# Matching function
	# ------------------------
	def match_internship(skills, education, interest, location):
	    """Rank the cached internships against one candidate profile.

	    Args:
	        skills: Comma-separated skills string.
	        education: Free-text education description.
	        interest: Free-text interest / field description.
	        location: Candidate location, passed to ``location_similarity``.

	    Returns:
	        At most five dicts, best overall match first. Each dict holds the
	        keys ``Company``, ``Position``, ``Skills_Match``,
	        ``Education_Match``, ``Interest_Match``, ``Location_Match`` and
	        ``Overall_Match``; the similarity-based scores are cosine
	        similarities multiplied by 100.
	    """
	    skill_names = normalize_skills(skills)
	    education_text = normalize_text(education)
	    interest_text = normalize_text(interest)

	    skill_vectors = [model.encode(name, convert_to_tensor=True) for name in skill_names]
	    education_vector = model.encode(education_text, convert_to_tensor=True)
	    interest_vector = model.encode(interest_text, convert_to_tensor=True)

	    def score(entry):
	        """Compute the component and overall scores for one cached listing."""
	        listing = entry["row"]

	        # For each candidate skill keep its best match among the required
	        # skills (0 if the listing has none), then average those maxima.
	        best_per_skill = [
	            max(
	                (util.cos_sim(vector, required).item() for required in entry["skill_embs"]),
	                default=0,
	            )
	            for vector in skill_vectors
	        ]
	        skills_pct = np.mean(best_per_skill)*100 if best_per_skill else 0

	        education_pct = util.cos_sim(education_vector, entry["edu_emb"]).item()*100
	        interest_pct = util.cos_sim(interest_vector, entry["interest_emb"]).item()*100
	        location_pct = location_similarity(location, listing['Location'])

	        # Weighted sum of the four components plus the global intercept.
	        components = [skills_pct, education_pct, interest_pct, location_pct]
	        total = np.dot(weights, components) + intercept

	        return {
	            "Company": listing['Company'],
	            "Position": listing['Position'],
	            "Skills_Match": skills_pct,
	            "Education_Match": education_pct,
	            "Interest_Match": interest_pct,
	            "Location_Match": location_pct,
	            "Overall_Match": total
	        }

	    ranked = sorted(
	        (score(entry) for entry in cached_internships),
	        key=lambda match: match['Overall_Match'],
	        reverse=True,
	    )
	    # Only the five best matches are returned.
	    return ranked[:5]

	# ------------------------
	# Gradio interface
	# ------------------------
	# One free-text box per match_internship argument, in call order.
	inputs = [
	    gr.Textbox(label=caption)
	    for caption in (
	        "Your Skills (comma-separated)",
	        "Your Education",
	        "Your Interest / Field",
	        "Your Location",
	    )
	]

	# The list of match dicts is shown as JSON.
	outputs = gr.JSON(label="Top 5 Internship Matches")

	demo = gr.Interface(
	    title="Super-Intelligent Internship Matcher",
	    description="Enter your skills, education, interest, and location to get top 5 internship matches.",
	    fn=match_internship,
	    inputs=inputs,
	    outputs=outputs,
	)

	# Launch the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()