import os

# Must be set before any wandb-aware library gets imported.
os.environ["WANDB_DISABLED"] = "true"  # Disable online logging

import re

import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# ------------------------
# Load CSV and pre-fine-tuned model
# ------------------------
MODEL_PATH = os.path.join(os.path.dirname(__file__), "fine_tuned_internship_model")
model = SentenceTransformer(MODEL_PATH)
df = pd.read_csv("converted (1).csv")

# ------------------------
# Normalization helpers
# ------------------------
# Common abbreviations mapped to the canonical form used for embedding.
SKILL_MAP = {
    "js": "javascript",
    "reactjs": "react",
    "nodejs": "node.js",
    "cpp": "c++",
    "btech": "bachelor of technology",
    "cs": "computer science",
    "ece": "electronics and communication",
    "ml": "machine learning",
    "ai": "artificial intelligence",
    "it": "information technology",
}


def normalize_text(text):
    """Lowercase *text*, strip '.'/',' and expand known abbreviations.

    Abbreviations are expanded only as whole words: plain substring
    replacement turned e.g. "physics" into "physi<computer science>" via
    the "cs" key, and rewrote the "js" suffix of "reactjs" before the
    "reactjs" key could ever match.
    """
    # Treat None / NaN / empty as "no text" (CSV cells may be missing).
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return ""
    text = str(text)
    if not text:
        return ""
    text = text.lower().replace(".", "").replace(",", "")
    for abbr, full in SKILL_MAP.items():
        # \b anchors keep the mapping from firing inside longer words.
        text = re.sub(r"\b" + re.escape(abbr) + r"\b", full, text)
    return text.strip()


def normalize_skills(skills_str):
    """Split a comma-separated skill string into normalized, non-empty tokens."""
    if not isinstance(skills_str, str):
        return []  # guard: NaN / missing CSV cell has no .split
    tokens = (normalize_text(tok.strip()) for tok in skills_str.split(","))
    return [tok for tok in tokens if tok]


def location_similarity(candidate_loc, internship_loc):
    """Crude location score: 100 exact match, 70 same first word, else 0."""
    cand = str(candidate_loc).lower().strip()
    intern = str(internship_loc).lower().strip()
    if cand == intern:
        return 100
    # Guard both strings: .split()[0] raised IndexError when exactly one
    # of the two locations was empty (the both-empty case hits == above).
    if cand and intern and cand.split()[0] == intern.split()[0]:
        return 70
    return 0


def _encode_many(texts):
    """Batch-encode a list of strings into one 2-D tensor; None when empty.

    One encode() call replaces the previous per-string Python loop.
    """
    return model.encode(texts, convert_to_tensor=True) if texts else None


# ------------------------
# Cache embeddings for all internships (computed once at startup)
# ------------------------
cached_internships = []
for _, row in df.iterrows():
    cached_internships.append(
        {
            "row": row,
            # 2-D tensor of per-skill embeddings, or None if no skills listed.
            "skill_embs": _encode_many(normalize_skills(row["Required_Skills"])),
            "edu_emb": model.encode(
                normalize_text(row["Student_Education"]), convert_to_tensor=True
            ),
            "interest_emb": model.encode(
                normalize_text(row["Student_Interest"]), convert_to_tensor=True
            ),
        }
    )

# Linear blend of the four component scores: skills, education, interest, location.
weights = np.array([0.4, 0.3, 0.2, 0.1])
intercept = 0


# ------------------------
# Matching function
# ------------------------
def match_internship(skills, education, interest, location):
    """Return the top-5 internship matches for a candidate profile.

    Parameters are the raw user strings from the Gradio form (skills are
    comma-separated). Returns a list of up to five dicts with per-component
    percentages and an "Overall_Match" weighted score, best first.
    """
    cand_skill_embs = _encode_many(normalize_skills(skills))
    cand_edu_emb = model.encode(normalize_text(education), convert_to_tensor=True)
    cand_interest_emb = model.encode(normalize_text(interest), convert_to_tensor=True)

    results = []
    for internship in cached_internships:
        row = internship["row"]

        # Skills: for each candidate skill take its best internship-skill
        # cosine similarity, then average — one cos_sim matrix call instead
        # of a nested Python loop. Either side empty => 0, as before.
        if cand_skill_embs is not None and internship["skill_embs"] is not None:
            sim_matrix = util.cos_sim(cand_skill_embs, internship["skill_embs"])
            skills_sim = sim_matrix.max(dim=1).values.mean().item() * 100
        else:
            skills_sim = 0

        edu_sim = util.cos_sim(cand_edu_emb, internship["edu_emb"]).item() * 100
        interest_sim = util.cos_sim(cand_interest_emb, internship["interest_emb"]).item() * 100
        loc_sim = location_similarity(location, row["Location"])

        overall = np.dot(weights, [skills_sim, edu_sim, interest_sim, loc_sim]) + intercept

        results.append(
            {
                "Company": row["Company"],
                "Position": row["Position"],
                "Skills_Match": skills_sim,
                "Education_Match": edu_sim,
                "Interest_Match": interest_sim,
                "Location_Match": loc_sim,
                "Overall_Match": overall,
            }
        )

    results.sort(key=lambda r: r["Overall_Match"], reverse=True)
    # Top 5 as a list of dicts (rendered by the gr.JSON output).
    return results[:5]


# ------------------------
# Gradio interface
# ------------------------
inputs = [
    gr.Textbox(label="Your Skills (comma-separated)"),
    gr.Textbox(label="Your Education"),
    gr.Textbox(label="Your Interest / Field"),
    gr.Textbox(label="Your Location"),
]
outputs = gr.JSON(label="Top 5 Internship Matches")

demo = gr.Interface(
    fn=match_internship,
    inputs=inputs,
    outputs=outputs,
    title="Super-Intelligent Internship Matcher",
    description="Enter your skills, education, interest, and location to get top 5 internship matches.",
)

if __name__ == "__main__":
    demo.launch()