"""Helper utilities for the project."""

import re
from difflib import SequenceMatcher

# Compile patterns once at import time: these helpers are likely called
# per-record, and recompiling (plus re-importing `re`) inside each call
# was pure overhead.
_DIGITS_RE = re.compile(r"\d+")
_WHITESPACE_RE = re.compile(r"\s+")
# Keep word chars, whitespace, hyphen, @ and . (emails/domains survive).
_SPECIAL_CHARS_RE = re.compile(r"[^\w\s\-@.]")


def parse_skill_match_score(score_str: str, default: int = 50) -> int:
    """Extract the first integer found in a score string.

    Args:
        score_str: Free-form text expected to contain a numeric score.
        default: Value returned when no digits are present. Previously a
            hard-coded 50; now a backward-compatible parameter.

    Returns:
        The first run of digits as an int, or ``default`` if none found.
    """
    match = _DIGITS_RE.search(score_str)
    return int(match.group(0)) if match else default


def format_experience_duration(years_str: str) -> str:
    """Standardize an experience duration to ``"<n> years"``.

    The first run of digits is taken as the year count. If the input has
    no digits, it is returned unchanged (caller may hold free-form text
    like "unknown").

    Note: output is always pluralized ("1 years") to keep the format
    uniform for downstream string matching.
    """
    match = _DIGITS_RE.search(years_str)
    if match:
        years = int(match.group(0))
        return f"{years} years"
    return years_str


def clean_text(text: str) -> str:
    """Clean and normalize text.

    Collapses runs of whitespace to single spaces, strips characters
    other than word chars / whitespace / ``-@.``, and trims the ends.
    """
    # Collapse whitespace first so the special-char pass sees single spaces.
    text = _WHITESPACE_RE.sub(' ', text)
    text = _SPECIAL_CHARS_RE.sub('', text)
    return text.strip()


def skill_similarity(skill1: str, skill2: str) -> float:
    """Return a case-insensitive similarity ratio in [0.0, 1.0].

    Uses difflib's Ratcliff/Obershelp matcher; 1.0 means the lowercased
    strings are identical.
    """
    return SequenceMatcher(None, skill1.lower(), skill2.lower()).ratio()


def batch_process(items: list, batch_size: int = 32):
    """Yield successive slices of ``items`` of at most ``batch_size``.

    The final batch may be shorter; an empty list yields nothing.
    """
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]


# Model conversion utilities
def convert_to_onnx(model_path: str, output_path: str):
    """Convert a fine-tuned causal-LM checkpoint to ONNX for faster inference.

    Args:
        model_path: Directory or hub ID accepted by ``from_pretrained``.
        output_path: Destination file for the exported ONNX graph.

    Side effects:
        Writes the ONNX file and prints a confirmation message.
    """
    # Heavy deps imported lazily so the module stays importable without
    # torch/transformers installed.
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch

    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Fix: switch to inference mode before export, otherwise dropout /
    # batch-norm layers are traced in training mode.
    model.eval()

    # Minimal (1, 1) dummy input; dynamic_axes lets the exported graph
    # accept arbitrary batch and sequence lengths at runtime.
    dummy_input = torch.tensor([[tokenizer.eos_token_id]])
    torch.onnx.export(
        model,
        dummy_input,
        output_path,
        input_names=['input_ids'],
        output_names=['output'],
        dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'}},
        opset_version=12,
    )
    print(f"✅ Model exported to {output_path}")