File size: 4,861 Bytes
5b74b8c
e32d554
5b74b8c
e32d554
add527f
5b74b8c
 
 
 
add527f
5b74b8c
351c4ea
e32d554
 
5b74b8c
 
 
 
 
add527f
 
 
 
5b74b8c
 
 
 
add527f
5b74b8c
add527f
5b74b8c
 
 
 
 
 
 
 
 
add527f
 
5b74b8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
add527f
5b74b8c
 
 
e32d554
5b74b8c
e32d554
 
 
 
 
5b74b8c
 
 
 
 
 
 
 
 
 
 
 
add527f
5b74b8c
add527f
5b74b8c
 
add527f
5b74b8c
 
add527f
5b74b8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e32d554
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable online logging

import gradio as gr
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# ------------------------
# Load CSV and pre-fine-tuned model
# ------------------------
# NOTE(review): the model dir is resolved relative to this file, but the CSV
# is read from the current working directory — confirm both exist at runtime.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "fine_tuned_internship_model")
model = SentenceTransformer(MODEL_PATH)
df = pd.read_csv("converted (1).csv")

# ------------------------
# Normalization functions
# ------------------------
# Abbreviation -> canonical form. Keys are matched as whole words (after
# lowercasing and punctuation stripping) in normalize_text below.
SKILL_MAP = {
    "js": "javascript", "reactjs": "react", "nodejs": "node.js",
    "cpp": "c++", "btech": "bachelor of technology",
    "cs": "computer science", "ece": "electronics and communication",
    "ml": "machine learning", "ai": "artificial intelligence",
    "it": "information technology"
}

def normalize_text(text):
    """Lowercase *text*, strip '.' and ',', and expand known abbreviations.

    Abbreviations are expanded per whole word. The previous implementation
    used substring ``str.replace`` over SKILL_MAP, which corrupted unrelated
    words (the "it" mapping rewrote "with", and "js" fired inside "reactjs"
    before the longer key could ever match).

    Args:
        text: Raw free-text field; falsy input yields "".

    Returns:
        The normalized, whitespace-stripped string.
    """
    if not text:
        return ""
    text = text.lower().replace(".", "").replace(",", "")
    # Token-wise lookup gives word-boundary semantics without regex; split()
    # also collapses runs of whitespace, which is harmless for matching.
    return " ".join(SKILL_MAP.get(tok, tok) for tok in text.split()).strip()

def normalize_skills(skills_str):
    """Split a comma-separated skill string into normalized, non-empty skills."""
    normalized = (normalize_text(chunk.strip()) for chunk in skills_str.split(","))
    return [skill for skill in normalized if skill]

def location_similarity(candidate_loc, internship_loc):
    """Score how well two location strings match.

    Args:
        candidate_loc: Candidate's location string.
        internship_loc: Internship's location string.

    Returns:
        100 for an exact (case-insensitive) match, 70 when the first word
        matches (e.g. "mumbai central" vs "mumbai west"), else 0.
    """
    candidate_loc = candidate_loc.lower().strip()
    internship_loc = internship_loc.lower().strip()
    if candidate_loc == internship_loc:
        return 100
    # Guard the first-word comparison: "".split() is [] and the original
    # code's split()[0] raised IndexError on an empty/whitespace location.
    candidate_words = candidate_loc.split()
    internship_words = internship_loc.split()
    if candidate_words and internship_words and candidate_words[0] == internship_words[0]:
        return 70
    return 0

# ------------------------
# Cache embeddings for all internships
# ------------------------
# Pre-compute one embedding per internship skill plus education/interest
# embeddings, so each user query only needs to encode the candidate's side.
cached_internships = []
for _, row in df.iterrows():
    internship_skills = normalize_skills(row['Required_Skills'])
    internship_skill_embs = [model.encode(s, convert_to_tensor=True) for s in internship_skills]
    internship_edu_emb = model.encode(normalize_text(row['Student_Education']), convert_to_tensor=True)
    internship_interest_emb = model.encode(normalize_text(row['Student_Interest']), convert_to_tensor=True)
    cached_internships.append({
        "row": row,
        "skill_embs": internship_skill_embs,
        "edu_emb": internship_edu_emb,
        "interest_emb": internship_interest_emb
    })

# Linear scoring model used by match_internship: weighted sum of the four
# per-criterion percentages plus a bias term.
weights = np.array([0.4,0.3,0.2,0.1])  # Skills, Education, Interest, Location
intercept = 0  # bias added to the weighted sum

# ------------------------
# Matching function
# ------------------------
def match_internship(skills, education, interest, location):
    """Rank the cached internships against a candidate profile.

    Args:
        skills: Comma-separated skill string, e.g. "python, ml".
        education: Free-text education description.
        interest: Free-text field of interest.
        location: Candidate location string.

    Returns:
        The top 5 internships as a list of dicts holding per-criterion match
        percentages and a weighted Overall_Match, sorted descending.
    """
    candidate_skill_embs = [
        model.encode(s, convert_to_tensor=True) for s in normalize_skills(skills)
    ]
    candidate_edu_emb = model.encode(normalize_text(education), convert_to_tensor=True)
    candidate_interest_emb = model.encode(normalize_text(interest), convert_to_tensor=True)

    results = []
    for internship in cached_internships:
        row = internship["row"]

        # For each candidate skill, take its best match among the
        # internship's skills, then average across candidate skills.
        skill_sims = [
            max((util.cos_sim(c_emb, i_emb).item() for i_emb in internship["skill_embs"]), default=0)
            for c_emb in candidate_skill_embs
        ]
        # float() casts throughout: np.mean/np.dot return np.float64, which
        # the stdlib json encoder behind the JSON output cannot serialize.
        skills_sim = float(np.mean(skill_sims)) * 100 if skill_sims else 0.0

        edu_sim = util.cos_sim(candidate_edu_emb, internship["edu_emb"]).item() * 100
        interest_sim = util.cos_sim(candidate_interest_emb, internship["interest_emb"]).item() * 100
        loc_sim = location_similarity(location, row['Location'])

        overall = float(np.dot(weights, [skills_sim, edu_sim, interest_sim, loc_sim]) + intercept)

        results.append({
            "Company": row['Company'],
            "Position": row['Position'],
            "Skills_Match": skills_sim,
            "Education_Match": edu_sim,
            "Interest_Match": interest_sim,
            "Location_Match": loc_sim,
            "Overall_Match": overall
        })

    results.sort(key=lambda x: x['Overall_Match'], reverse=True)
    # Return top 5 as list of dicts
    return results[:5]

# ------------------------
# Gradio interface
# ------------------------
# Four free-text inputs, passed positionally to match_internship.
inputs = [
    gr.Textbox(label="Your Skills (comma-separated)"),
    gr.Textbox(label="Your Education"),
    gr.Textbox(label="Your Interest / Field"),
    gr.Textbox(label="Your Location")
]

# The list-of-dicts result is rendered verbatim as JSON.
outputs = gr.JSON(label="Top 5 Internship Matches")

demo = gr.Interface(
    fn=match_internship,
    inputs=inputs,
    outputs=outputs,
    title="Super-Intelligent Internship Matcher",
    description="Enter your skills, education, interest, and location to get top 5 internship matches."
)

if __name__ == "__main__":
    demo.launch()