Spaces:
Sleeping
Sleeping
"""
skill_extractor.py (Production v5.0 - BTech All Roles)
-------------------------------------------------------
Advanced Resume Skill Extractor:
- Covers ALL common BTech career roles
- Regex-based extraction (word-boundary safe)
- Role-wise organized skill database
- Smart normalization + inferred skills
- Clean & deduplicated output
"""
import re
from functools import lru_cache
# ─────────────────────────────────────────────
# 🔹 SKILLS DATABASE — BTech ALL ROLES
# ─────────────────────────────────────────────
# Flat list of lowercase skill keywords, grouped by career role.
# `extract_skills` scans the resume text for every entry, in this order, so the
# order here is also the order of its output.  Each keyword appears exactly
# once: a keyword that belongs to several roles is listed only under the first
# one, otherwise it would be reported twice.
SKILLS_LIST = [
    # ── 1. SOFTWARE DEVELOPMENT ──
    "python", "java", "javascript", "typescript", "c", "c++", "c#",
    "ruby", "php", "swift", "kotlin", "go", "rust", "scala", "r", "matlab",
    "html", "css", "react", "react.js", "next.js", "angular", "vue",
    "redux", "tailwind css", "framer motion",
    "node.js", "nodejs", "express", "express.js",
    "django", "flask", "fastapi", "spring boot",
    "rest api", "rest apis", "websockets", "graphql",
    "jwt", "oauth", "authentication", "authorization",
    "mvc", "state management", "api design",
    "prisma", "mongoose",
    "vercel", "netlify",
    "frontend development", "backend development", "full stack", "web development",
    # ── 2. DATA SCIENCE / ML / AI ──
    "machine learning", "deep learning", "data science", "data analysis",
    "tensorflow", "pytorch", "keras", "scikit-learn",
    "pandas", "numpy", "matplotlib", "seaborn", "scipy",
    "nlp", "natural language processing", "computer vision",
    "opencv", "hugging face", "llm", "generative ai", "prompt engineering",
    "feature engineering", "model deployment", "mlops",
    "regression", "classification", "clustering", "neural network",
    "random forest", "xgboost", "time series",
    "artificial intelligence", "chatgpt", "copilot",
    # ── 3. DATA ENGINEERING / ANALYTICS ──
    "sql", "mysql", "postgresql", "mongodb", "redis", "firebase",
    "oracle", "sqlite", "dynamodb", "cassandra",
    "power bi", "tableau", "excel", "google sheets",
    "etl", "data pipeline", "apache spark", "hadoop", "kafka",
    "airflow", "dbt", "snowflake", "bigquery", "data warehouse",
    "data visualization", "business intelligence",
    # ── 4. DEVOPS / CLOUD ──
    "aws", "azure", "gcp", "docker", "kubernetes", "jenkins",
    "terraform", "ansible", "ci/cd", "linux", "bash", "shell scripting",
    "git", "github", "gitlab", "bitbucket",
    "nginx", "apache", "load balancing", "microservices",
    "serverless", "lambda", "cloud computing",
    # ── 5. CYBERSECURITY ──
    "network security", "ethical hacking", "penetration testing",
    "kali linux", "metasploit", "wireshark", "nmap", "burp suite",
    "cryptography", "ssl", "tls", "firewall", "ids", "ips",
    "siem", "soc", "vulnerability assessment", "owasp",
    "digital forensics", "malware analysis", "incident response",
    "information security", "cyber security",
    # ── 6. EMBEDDED SYSTEMS / IOT ──
    "embedded c", "arduino", "raspberry pi", "stm32", "esp32",
    "rtos", "freertos", "uart", "spi", "i2c", "can bus",
    "iot", "mqtt", "zigbee", "bluetooth", "wifi module",
    "pcb design", "kicad", "altium", "proteus", "multisim",
    "microcontroller", "microprocessor", "fpga", "vhdl", "verilog",
    "signal processing", "sensor integration",
    # ── 7. VLSI / CHIP DESIGN ──
    # ("verilog" and "vhdl" are already listed under section 6.)
    "vlsi", "system verilog",
    "cadence", "synopsys", "mentor graphics",
    "rtl design", "synthesis", "sta", "place and route",
    "digital design", "analog design", "asic", "soc design",
    "dft", "timing analysis", "power analysis",
    # ── 8. MECHANICAL ENGINEERING ──
    "autocad", "solidworks", "catia", "ansys", "creo",
    "fusion 360", "3d printing", "additive manufacturing",
    "cad", "cam", "cfd", "fea", "finite element analysis",
    "manufacturing processes", "cnc machining",
    "thermodynamics", "fluid mechanics", "heat transfer",
    "robotics", "automation", "plc", "scada", "hmi",
    "lean manufacturing", "six sigma", "quality control",
    # ── 9. CIVIL ENGINEERING ──
    "autocad civil", "staad pro", "etabs", "revit", "primavera",
    "ms project", "civil 3d",
    "structural analysis", "structural design", "rcc design",
    "surveying", "gis", "remote sensing", "arcgis",
    "construction management", "project planning",
    "soil mechanics", "geotechnical", "foundation design",
    "highway design", "transportation engineering",
    "water supply", "sanitation", "irrigation",
    # ── 10. ELECTRICAL ENGINEERING ──
    "power systems", "power electronics", "circuit design",
    "matlab simulink", "pspice", "ltspice", "labview",
    "electric vehicles", "battery management system",
    "solar energy", "wind energy", "renewable energy",
    "transformer", "motor drives", "inverter", "rectifier",
    "control systems", "pid controller",
    "high voltage", "switchgear", "protection relay",
    "smart grid", "energy audit",
    # ── 11. ELECTRONICS & COMMUNICATION ──
    # ("signal processing" is already listed under section 6.)
    "dsp", "image processing",
    "communication systems", "wireless communication",
    "5g", "lte", "antenna design", "rf design",
    "hfss", "cst", "ads",
    "optical fiber", "photonics",
    "digital electronics", "analog electronics",
    "oscilloscope", "logic analyzer",
    # ── 12. ROBOTICS / AUTOMATION ──
    "ros", "ros2", "gazebo", "slam", "path planning",
    "sensor fusion", "robotic arm", "drone", "uav",
    "autonomous systems", "control theory", "motion planning",
    "industrial automation", "plc programming",
    # ── 13. PRODUCT MANAGEMENT (Tech) ──
    "product roadmap", "user stories", "agile", "scrum", "kanban",
    "jira", "confluence", "notion", "trello",
    "wireframing", "prototyping", "figma", "balsamiq",
    "market research", "competitive analysis",
    "a/b testing", "product analytics", "kpi tracking",
    "stakeholder management", "go to market",
    # ── 14. UI/UX DESIGN ──
    # ("figma" is already listed under section 13.)
    "adobe xd", "sketch", "invision",
    "photoshop", "illustrator", "after effects",
    "user research", "usability testing",
    "design thinking", "information architecture",
    "interaction design", "visual design", "typography",
    "color theory", "responsive design", "accessibility",
    # ── 15. BUSINESS ANALYST / CONSULTING ──
    "requirement gathering", "brd", "frd", "use case",
    "uml", "flowchart", "process mapping", "gap analysis",
    "business analysis", "functional testing", "uat",
    # ── 16. TESTING / QA ──
    "manual testing", "automation testing", "selenium",
    "cypress", "playwright", "jest", "pytest",
    "test cases", "test plan", "bug tracking",
    "api testing", "postman", "jmeter", "load testing",
    "performance testing", "regression testing",
    "black box testing", "white box testing",
    # ── 17. GAME DEVELOPMENT ──
    "unity", "unreal engine", "godot",
    "game design", "level design", "3d modeling",
    "blender", "maya", "3ds max",
    "ar", "vr", "mixed reality", "xr",
    "physics simulation", "shader programming",
    # ── 18. BLOCKCHAIN ──
    "blockchain", "solidity", "ethereum", "web3.js", "ethers.js",
    "smart contracts", "nft", "defi", "hyperledger",
    "cryptocurrency", "metamask", "truffle", "hardhat",
    "ipfs", "decentralized applications", "dapps",
    # ── 19. SOFT SKILLS ──
    "communication", "leadership", "teamwork", "problem solving",
    "project management", "critical thinking",
    "time management", "presentation", "collaboration",
    "analytical thinking", "attention to detail",
    # ── 20. MOBILE DEVELOPMENT ──
    "react native", "flutter", "dart", "swiftui", "jetpack compose",
    "android development", "ios development", "mobile development",
    "xcode", "android studio", "expo",
    # ── 21. DATA / MODERN TOOLS ──
    "streamlit", "gradio", "langchain", "llamaindex", "pinecone",
    "chromadb", "weaviate", "vector database", "rag",
    "data lake", "lakehouse", "delta lake", "databricks",
    "looker", "metabase", "superset",
    # ── 22. CLOUD CERTIFICATIONS & SERVICES ──
    "aws lambda", "s3", "ec2", "ecs", "eks", "cloudfront",
    "azure devops", "azure functions", "cosmos db",
    "google cloud functions", "cloud run", "vertex ai",
    "heroku", "railway", "render", "fly.io",
    # ── 23. API & ARCHITECTURE ──
    "grpc", "soap", "swagger", "openapi",
    "event driven", "message queue", "rabbitmq",
    "design patterns", "solid principles", "clean architecture",
    "domain driven design", "system design",
    # ── 24. VERSION CONTROL & CI/CD ──
    "github actions", "circleci", "travis ci", "argo cd",
    "helm", "prometheus", "grafana", "elk stack",
    "datadog", "new relic", "splunk",
]
# ─────────────────────────────────────────────
# 🔥 SKILL NORMALIZATION MAP
# ─────────────────────────────────────────────
# Inference table used by `normalize_skills`.
# Key   -> the skill that gets added to the result.
# Value -> trigger keywords; the key is added as soon as ANY one of them is
#          found in the resume text.  Triggers are either spelling variants of
#          the key ("react.js" -> "react") or more specific tools/terms that
#          imply it ("tensorflow" -> "deep learning").
SKILL_MAP = {
    # Variations
    "react": ["react.js"],
    "node.js": ["nodejs"],
    "express": ["express.js"],
    "rest api": ["rest apis"],
    "html": ["html5"],
    "css": ["css3"],
    # Software Dev
    "full stack": ["mern", "fullstack", "mean"],
    "authentication": ["jwt", "oauth", "auth"],
    "frontend development": ["react", "next.js", "angular", "vue", "html", "css"],
    "backend development": ["node.js", "express", "django", "flask", "fastapi", "spring boot"],
    "version control": ["git", "github", "gitlab"],
    "database management": ["sql", "mysql", "postgresql", "mongodb", "redis", "firebase"],
    # AI / ML
    "deep learning": ["tensorflow", "keras", "pytorch"],
    "data science": ["pandas", "numpy", "data analysis", "machine learning"],
    "nlp": ["llm", "natural language processing", "text classification", "tokenization", "hugging face"],
    "machine learning": ["ml", "scikit-learn", "model training", "regression", "classification", "xgboost"],
    "artificial intelligence": ["ai", "neural network", "deep learning", "machine learning", "generative ai"],
    "computer vision": ["image recognition", "object detection", "opencv", "cnn"],
    "mlops": ["model deployment", "mlflow", "kubeflow"],
    "seaborn": ["matplotlib"],
    "classification": ["classify", "classification", "predict", "model training"],
    # DevOps / Cloud
    "devops": ["ci/cd", "docker", "kubernetes", "jenkins", "terraform", "ansible"],
    "cloud computing": ["aws", "azure", "gcp", "serverless", "lambda"],
    # Security
    "cyber security": ["ethical hacking", "penetration testing", "network security", "owasp"],
    "ethical hacking": ["kali linux", "metasploit", "burp suite", "nmap"],
    # Embedded / IoT
    "iot": ["mqtt", "arduino", "raspberry pi", "esp32", "zigbee"],
    "embedded systems": ["embedded c", "rtos", "microcontroller", "stm32", "esp32"],
    # VLSI
    "vlsi": ["verilog", "vhdl", "system verilog", "rtl design", "asic"],
    # Mechanical
    "cad": ["autocad", "solidworks", "catia", "creo", "fusion 360"],
    "simulation": ["ansys", "fea", "cfd", "matlab simulink"],
    "automation": ["plc", "scada", "hmi", "industrial automation"],
    "lean manufacturing": ["six sigma", "quality control", "kaizen"],
    # Civil
    "structural design": ["staad pro", "etabs", "rcc design"],
    "gis": ["arcgis", "remote sensing", "civil 3d"],
    # Electrical
    "power electronics": ["inverter", "rectifier", "motor drives", "battery management system"],
    "renewable energy": ["solar energy", "wind energy", "electric vehicles"],
    "control systems": ["pid controller", "matlab simulink", "labview"],
    # Robotics
    "robotics": ["ros", "ros2", "slam", "path planning", "robotic arm", "drone"],
    # Testing
    "automation testing": ["selenium", "cypress", "playwright", "jest", "pytest"],
    "api testing": ["postman", "jmeter"],
    # Game Dev
    "game development": ["unity", "unreal engine", "godot", "game design"],
    "3d modeling": ["blender", "maya", "3ds max"],
    "vr": ["virtual reality", "oculus", "steamvr"],
    "ar": ["augmented reality", "arkit", "arcore"],
    # Blockchain
    "blockchain": ["solidity", "ethereum", "smart contracts", "web3.js", "dapps"],
    # Data / Analytics
    "business intelligence": ["power bi", "tableau", "data visualization"],
    "data pipeline": ["apache spark", "kafka", "airflow", "etl"],
    "data warehouse": ["snowflake", "bigquery", "redshift"],
    # Mobile
    "mobile development": ["react native", "flutter", "android development", "ios development"],
    "android development": ["kotlin", "jetpack compose", "android studio"],
    "ios development": ["swift", "swiftui", "xcode"],
    # Modern AI/Data
    "rag": ["langchain", "llamaindex", "vector database", "chromadb", "pinecone"],
    "generative ai": ["llm", "chatgpt", "copilot", "prompt engineering", "rag", "langchain"],
    # Product / Design
    "agile": ["scrum", "kanban", "jira", "sprint"],
    "ui/ux": ["figma", "adobe xd", "wireframing", "prototyping", "user research"],
    # Soft skills
    "collaboration": ["collaborated", "teamwork", "team", "worked with"],
    # Feature engineering
    "feature engineering": [
        "data preprocessing",
        "data cleaning",
        "feature extraction",
        "data transformation",
    ],
    # Teamwork
    "teamwork": [
        "team",
        "collaborated",
        "community",
        "worked with",
        "coordinated",
    ],
    # Analytical thinking
    "analytical thinking": [
        "data analysis",
        "problem solving",
        "analysis",
        "model training",
    ],
}
# ─────────────────────────────────────────────
# 🔹 EDUCATION KEYWORDS
# ─────────────────────────────────────────────
# Lowercase keywords whose presence in a resume indicates education-related
# content (degrees, institutions, fields of study, grading terms).
# Scanned by `extract_education`.
EDUCATION_KEYWORDS = [
    "bachelor", "master", "phd", "doctorate", "diploma",
    "b.tech", "m.tech", "b.sc", "m.sc", "b.e", "m.e",
    "bca", "mca", "bba", "mba", "b.com", "m.com",
    "university", "college", "institute", "school",
    "computer science", "information technology",
    "engineering", "mathematics", "physics", "chemistry",
    "electronics", "electrical", "mechanical", "civil",
    "degree", "graduation", "post graduation", "certification",
    "12th", "10th", "higher secondary", "secondary",
    "iit", "nit", "iiit", "bits", "vit", "lpu", "amity",
    "cgpa", "gpa", "percentage", "aggregate",
    "coursework", "specialization", "minor", "major",
    "data science", "artificial intelligence",
    "biotechnology", "biomedical", "chemical engineering",
    "aerospace", "automobile", "industrial engineering",
]
# ─────────────────────────────────────────────
# 🔹 EXPERIENCE KEYWORDS
# ─────────────────────────────────────────────
# Lowercase keywords whose presence in a resume indicates work-experience
# content (seniority terms, employment types, action verbs, job titles).
# Scanned by `extract_experience`.
EXPERIENCE_KEYWORDS = [
    "experience", "years of experience", "worked at", "working at",
    "intern", "internship", "fresher", "junior", "senior",
    "lead", "manager", "director", "team lead",
    "full time", "part time", "freelance", "contract",
    "responsibilities", "achievements", "projects",
    "developed", "implemented", "designed", "managed",
    "built", "created", "maintained", "optimized",
    "deployed", "architected", "collaborated", "researched",
    "analyzed", "tested", "automated", "integrated",
    "mentored", "supervised", "coordinated", "delivered",
    "contributed", "spearheaded", "launched", "scaled",
    "improved", "reduced", "increased", "streamlined",
    "training", "workshop", "hackathon", "open source",
    "startup", "company", "organization", "firm",
    "software engineer", "data analyst", "web developer",
    "ml engineer", "devops engineer", "full stack developer",
]
# ─────────────────────────────────────────────
# 🔹 HELPER FUNCTION
# ─────────────────────────────────────────────
| def _match(keyword: str, text: str) -> bool: | |
| """Word-boundary safe keyword match.""" | |
| pattern = r"(?<![a-z0-9])" + re.escape(keyword.lower()) + r"(?![a-z0-9])" | |
| return bool(re.search(pattern, text)) | |
# ─────────────────────────────────────────────
# 🔹 SKILL EXTRACTION
# ─────────────────────────────────────────────
def extract_skills(text: str) -> list[str]:
    """Extract the skills from SKILLS_LIST that appear in the resume text.

    Args:
        text: Cleaned resume text.  Empty or None yields an empty list.

    Returns:
        Skills found in the text, in SKILLS_LIST order, each listed once.
    """
    if not text:
        return []
    text_lower = text.lower()
    found = (skill for skill in SKILLS_LIST if _match(skill, text_lower))
    # dict.fromkeys de-duplicates while keeping first-seen order, so a keyword
    # that is listed under more than one role is still reported only once.
    return list(dict.fromkeys(found))
# ─────────────────────────────────────────────
# 🔥 NORMALIZATION
# ─────────────────────────────────────────────
def normalize_skills(skills: list[str], text: str) -> list[str]:
    """Add higher-level skills implied by related keywords found in the text.

    For every entry of SKILL_MAP, the key is added to the result when at least
    one of its trigger keywords occurs in the text.
    Example: "tensorflow" found -> "deep learning" automatically added.

    Inference is a single pass over the text: a skill added here does not, by
    itself, trigger further entries.

    Args:
        skills: Already extracted raw skills (None is treated as empty).
        text: Original resume text (None is treated as empty).

    Returns:
        Expanded, deduplicated, alphabetically sorted skill list.
    """
    normalized = set(skills or ())
    text_lower = (text or "").lower()
    for main_skill, related_keywords in SKILL_MAP.items():
        # Already present: searching for its triggers cannot change the result.
        if main_skill in normalized:
            continue
        if any(_match(keyword, text_lower) for keyword in related_keywords):
            normalized.add(main_skill)
    return sorted(normalized)
# ─────────────────────────────────────────────
# 🔹 EDUCATION
# ─────────────────────────────────────────────
def extract_education(text: str) -> list[str]:
    """Find education-related keywords in resume text.

    Args:
        text: Cleaned resume text.  Empty or None yields an empty list.

    Returns:
        Education keywords found, deduplicated and sorted alphabetically so
        the result is identical from run to run (a bare ``list(set(...))``
        has no stable order for strings).
    """
    if not text:
        return []
    text_lower = text.lower()
    return sorted({kw for kw in EDUCATION_KEYWORDS if _match(kw, text_lower)})
# ─────────────────────────────────────────────
# 🔹 EXPERIENCE
# ─────────────────────────────────────────────
def extract_experience(text: str) -> list[str]:
    """Find experience-related keywords in resume text.

    Args:
        text: Cleaned resume text.  Empty or None yields an empty list.

    Returns:
        Experience indicators found, deduplicated and sorted alphabetically so
        the result is identical from run to run (a bare ``list(set(...))``
        has no stable order for strings).
    """
    if not text:
        return []
    text_lower = text.lower()
    return sorted({kw for kw in EXPERIENCE_KEYWORDS if _match(kw, text_lower)})
# ─────────────────────────────────────────────
# 🔹 MAIN PIPELINE
# ─────────────────────────────────────────────
def extract_all(text: str) -> dict:
    """Run the full extraction pipeline on one resume.

    Steps:
        1. Extract raw skills via regex keyword matching.
        2. Expand them using SKILL_MAP inference.
        3. Extract education and experience keywords.

    Args:
        text: Cleaned resume text.  None is treated as an empty string.

    Returns:
        dict: {
            "skills": [...],
            "education": [...],
            "experience": [...]
        }
    """
    # Coerce a missing resume body once, so every stage below receives a str.
    text = text or ""
    raw_skills = extract_skills(text)
    final_skills = normalize_skills(raw_skills, text)
    return {
        "skills": final_skills,
        "education": extract_education(text),
        "experience": extract_experience(text),
    }