Spaces:

Chaitanya895
/

SkillSync

Sleeping

App Files Files Community

SkillSync / ml_utils.py

Chaitanya895

Added all updated files

e29e4c8 3 months ago

raw

history blame contribute delete

30.6 kB

	"""
	Advanced NLP/ML Utilities for SkillSync
	This module contains all the intelligent ML features for enhanced resume matching,
	scoring, prediction, and recommendations.
	"""

	import os
	import numpy as np
	import pandas as pd

	# Disable TensorFlow logging to avoid Keras warnings
	os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
	os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

	# Try to import sentence_transformers with better error handling
	try:
	    from sentence_transformers import SentenceTransformer, util
	    _SENT_TRANSFORMERS_AVAILABLE = True
	except Exception as e:
	    # The import failed, so publish stand-ins under the same names; the rest
	    # of the module can then degrade to simpler heuristics.
	    print(f"Warning: sentence_transformers not available: {str(e)}")
	    SentenceTransformer = None
	    _SENT_TRANSFORMERS_AVAILABLE = False

	    class _UtilFallback:
	        """NumPy-backed stand-in exposing ``pytorch_cos_sim``."""

	        @staticmethod
	        def pytorch_cos_sim(a, b):
	            """Return an object whose ``.item()`` is the cosine similarity of a and b.

	            1-D inputs are treated as a single row. Any failure while
	            computing the similarity yields an object whose ``.item()``
	            returns 0.0.
	            """

	            class _Score:
	                # Minimal wrapper offering the ``.item()`` accessor callers use.
	                def __init__(self, value):
	                    self._value = value

	                def item(self):
	                    try:
	                        return float(self._value[0])
	                    except Exception:
	                        # Not indexable (plain scalar): convert directly.
	                        return float(self._value)

	            def _as_rows(vec):
	                arr = np.array(vec)
	                return arr.reshape(1, -1) if arr.ndim == 1 else arr

	            try:
	                lhs = _as_rows(a)
	                rhs = _as_rows(b)
	                dot = (lhs * rhs).sum(axis=1)
	                norms = np.linalg.norm(lhs, axis=1) * np.linalg.norm(rhs, axis=1)
	                # Swap zero norms for a tiny value to avoid dividing by zero.
	                norms = np.where(norms == 0, 1e-8, norms)
	                return _Score(dot / norms)
	            except Exception:
	                return _Score(0.0)

	    util = _UtilFallback()
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.preprocessing import StandardScaler
	from sklearn.metrics.pairwise import cosine_similarity
	try:
	import xgboost as xgb
	_XGB_AVAILABLE = True
	except Exception as e:
	print(f"Warning: xgboost not available: {str(e)}")
	xgb = None
	_XGB_AVAILABLE = False
	try:
	import joblib
	_JOBLIB_AVAILABLE = True
	except Exception:
	_JOBLIB_AVAILABLE = False
	import logging
	from collections import Counter
	from typing import List, Dict, Tuple
	import re
	try:
	    import textstat
	    _TEXTSTAT_AVAILABLE = True
	except Exception:
	    _TEXTSTAT_AVAILABLE = False

	    class _TextstatFallback:
	        """Stand-in for the ``textstat`` module when it cannot be imported."""

	        @staticmethod
	        def flesch_reading_ease(text):
	            """
	            Approximate the Flesch reading-ease score of ``text``.

	            Sentences are the non-blank pieces between runs of ``.``, ``!`` or ``?``;
	            words are ``\\w+`` tokens; syllables are estimated as the number of
	            vowel groups (``[aeiouy]+``) per word. Each count is floored at 1,
	            so empty text does not divide by zero.
	            """
	            sentence_chunks = [chunk for chunk in re.split(r'[.!?]+', text) if chunk.strip()]
	            tokens = re.findall(r'\w+', text)

	            n_words = len(tokens) if tokens else 1
	            n_sentences = max(1, len(sentence_chunks))

	            vowel_groups = 0
	            for token in tokens:
	                vowel_groups += len(re.findall(r'[aeiouy]+', token.lower()))
	            n_syllables = vowel_groups or 1

	            words_per_sentence = n_words / n_sentences
	            syllables_per_word = n_syllables / n_words
	            # Flesch reading ease formula approximation
	            return 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)

	    textstat = _TextstatFallback()

	try:
	    from fuzzywuzzy import fuzz
	    _FUZZYWUZZY_AVAILABLE = True
	except Exception as e:
	    print(f"Warning: fuzzywuzzy not available: {str(e)}")
	    _FUZZYWUZZY_AVAILABLE = False

	    from difflib import SequenceMatcher

	    class _FuzzFallback:
	        """Standard-library stand-in for ``fuzzywuzzy.fuzz``."""

	        @staticmethod
	        def ratio(s1, s2):
	            """Return a 0-100 integer similarity score for two strings.

	            Equal inputs score 100; a ``None`` on either side scores 0.
	            Anything else is scored with ``difflib.SequenceMatcher``, so
	            near-identical strings can clear thresholds such as ``> 85``
	            instead of always receiving a constant.
	            """
	            if s1 == s2:
	                return 100
	            if s1 is None or s2 is None:
	                return 0
	            return int(round(100 * SequenceMatcher(None, s1, s2).ratio()))

	    fuzz = _FuzzFallback()

	# Directory handed to SentenceTransformer as its cache_folder. Read from the
	# TRANSFORMERS_CACHE environment variable, defaulting to /tmp/hf_cache.
	MODELS_CACHE = os.getenv('TRANSFORMERS_CACHE', '/tmp/hf_cache')

	# Global model instances (lazy loading): each stays None until the matching
	# get_*() accessor loads it successfully.
	_semantic_model = None
	_sentiment_analyzer = None
	_ner_model = None

	def get_semantic_model():
	    """Load or return the cached sentence-transformer model.

	    Returns:
	        The cached ``SentenceTransformer`` instance, or ``None`` when the
	        library could not be imported or the model failed to load.
	    """
	    global _semantic_model
	    if _semantic_model is not None:
	        return _semantic_model

	    if SentenceTransformer is None:
	        # The import failed at module load. Calling None would raise and log
	        # an error on every similarity call, so report unavailability quietly.
	        return None

	    try:
	        _semantic_model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder=MODELS_CACHE)
	        logging.info("Semantic model loaded successfully")
	    except Exception as e:
	        logging.error(f"Error loading semantic model: {str(e)}")
	        _semantic_model = None
	    return _semantic_model

	def get_sentiment_analyzer():
	    """Return the shared sentiment-analysis pipeline, creating it on first use.

	    Returns ``None`` when the pipeline cannot be created; the failure is
	    logged as a warning.
	    """
	    global _sentiment_analyzer
	    if _sentiment_analyzer is not None:
	        return _sentiment_analyzer

	    try:
	        # Set TensorFlow to not be required for transformers
	        os.environ['TRANSFORMERS_NO_TF'] = '1'
	        from transformers import pipeline

	        pipeline_options = {
	            'model': "distilbert-base-uncased-finetuned-sst-2-english",
	            'device': -1,  # CPU
	            'framework': 'pt',  # Force PyTorch backend
	        }
	        _sentiment_analyzer = pipeline("sentiment-analysis", **pipeline_options)
	        logging.info("Sentiment analyzer loaded successfully")
	    except Exception as e:
	        logging.warning(f"Sentiment analyzer not available: {str(e)}")
	        _sentiment_analyzer = None
	    return _sentiment_analyzer

	def get_ner_model():
	    """Return the shared NER pipeline, creating it on first use.

	    Returns ``None`` when the pipeline cannot be created; the failure is
	    logged as a warning.
	    """
	    global _ner_model
	    if _ner_model is not None:
	        return _ner_model

	    try:
	        # Set TensorFlow to not be required for transformers
	        os.environ['TRANSFORMERS_NO_TF'] = '1'
	        from transformers import pipeline

	        pipeline_options = {
	            'model': "dslim/bert-base-NER",
	            'aggregation_strategy': "simple",
	            'device': -1,  # CPU
	            'framework': 'pt',  # Force PyTorch backend
	        }
	        _ner_model = pipeline("ner", **pipeline_options)
	        logging.info("NER model loaded successfully")
	    except Exception as e:
	        logging.warning(f"NER model not available: {str(e)}")
	        _ner_model = None
	    return _ner_model


	# ============================================================================
	# 1. SEMANTIC MATCHING ENGINE
	# ============================================================================

	def semantic_similarity(text1: str, text2: str) -> float:
	    """
	    Calculate semantic similarity between two texts.

	    Uses the sentence-transformer model when one is available; otherwise
	    falls back to word-set overlap (intersection size over union size).

	    Args:
	        text1: First text. ``None`` is treated as an empty string.
	        text2: Second text. ``None`` is treated as an empty string.

	    Returns:
	        Similarity clamped to the range 0..1. Returns 0.0 if encoding fails,
	        or if either text has no words in the fallback path.
	    """
	    text1 = text1 or ""
	    text2 = text2 or ""

	    model = get_semantic_model()
	    if model is None:
	        # Fallback to simple word overlap
	        words1 = set(text1.lower().split())
	        words2 = set(text2.lower().split())
	        if not words1 or not words2:
	            return 0.0
	        return len(words1 & words2) / len(words1 | words2)

	    try:
	        embedding1 = model.encode(text1, convert_to_tensor=True)
	        embedding2 = model.encode(text2, convert_to_tensor=True)
	        similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
	        # Cosine similarity can be negative; callers expect 0..1.
	        return max(0.0, min(1.0, similarity))
	    except Exception as e:
	        logging.error(f"Error in semantic similarity: {str(e)}")
	        return 0.0

	def enhanced_skill_matching(user_skills: List[str], required_skills: List[str]) -> Dict:
	    """
	    Match a user's skills against required skills using semantic similarity.

	    Args:
	        user_skills: Skills the user has.
	        required_skills: Skills the role requires.

	    Returns:
	        Dict with keys:
	        - ``overall_score``: similarity of the two space-joined skill lists.
	        - ``matched_skills``: required skills whose best user match scores
	          above 0.7.
	        - ``missing_skills``: the remaining required skills.
	        - ``semantic_matches``: matches where the user's skill text differs
	          from the required one, with the rounded score.
	        - ``match_percentage``: share of required skills matched, 0-100.

	        Every key is present even when either input is empty, so callers
	        can index the result unconditionally.
	    """
	    if not user_skills or not required_skills:
	        return {
	            'overall_score': 0.0,
	            'matched_skills': [],
	            # Copy so the caller's list is not aliased by the result.
	            'missing_skills': list(required_skills or []),
	            'semantic_matches': [],
	            'match_percentage': 0.0,
	        }

	    # Overall semantic similarity
	    overall_score = semantic_similarity(' '.join(user_skills), ' '.join(required_skills))

	    # Individual skill matching
	    matched = []
	    missing = []
	    semantic_matches = []

	    for req_skill in required_skills:
	        best_match_score = 0.0
	        best_match_skill = None

	        for user_skill in user_skills:
	            score = semantic_similarity(user_skill, req_skill)
	            if score > best_match_score:
	                best_match_score = score
	                best_match_skill = user_skill

	        if best_match_score > 0.7:  # Strong match threshold
	            matched.append(req_skill)
	            if best_match_skill != req_skill:
	                semantic_matches.append({
	                    'required': req_skill,
	                    'user_has': best_match_skill,
	                    'score': round(best_match_score, 3),
	                })
	        else:
	            missing.append(req_skill)

	    return {
	        'overall_score': round(overall_score, 3),
	        'matched_skills': matched,
	        'missing_skills': missing,
	        'semantic_matches': semantic_matches,
	        'match_percentage': round((len(matched) / len(required_skills)) * 100, 1),
	    }


	# ============================================================================
	# 2. INTELLIGENT SKILL EXTRACTION WITH NER
	# ============================================================================

	# Comprehensive skill keywords database
	# Technical skill keywords grouped by category; all entries are lower-case.
	TECHNICAL_SKILLS = {
	    'programming': [
	        'python', 'java', 'javascript', 'c++', 'c#', 'ruby', 'php', 'swift',
	        'kotlin', 'go', 'rust', 'typescript', 'scala', 'r', 'matlab', 'perl',
	    ],
	    'web': [
	        'html', 'css', 'react', 'angular', 'vue', 'node.js', 'django',
	        'flask', 'spring', 'express', 'fastapi', 'next.js', 'nuxt.js',
	        'svelte',
	    ],
	    'database': [
	        'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch',
	        'cassandra', 'oracle', 'dynamodb', 'firebase',
	    ],
	    'ml_ai': [
	        'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'keras',
	        'scikit-learn', 'nlp', 'computer vision', 'neural networks',
	        'transformers', 'bert', 'gpt',
	    ],
	    'cloud': [
	        'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'terraform', 'jenkins',
	        'ci/cd', 'microservices', 'serverless',
	    ],
	    'tools': [
	        'git', 'github', 'gitlab', 'jira', 'confluence', 'slack', 'vscode',
	        'intellij',
	    ],
	}

	# Soft skill keywords; all entries are lower-case.
	SOFT_SKILLS = [
	    'leadership', 'communication', 'teamwork', 'problem solving',
	    'critical thinking', 'time management', 'adaptability', 'creativity',
	    'collaboration', 'negotiation', 'public speaking', 'presentation',
	    'analytical', 'detail-oriented', 'self-motivated',
	]

	def extract_skills_intelligent(text: str) -> Dict[str, List[str]]:
	    """
	    Extract skills from free text using keyword, fuzzy and NER matching.

	    Technical skills come from ``TECHNICAL_SKILLS`` (whole-term match, else a
	    fuzzy match above 85 against any whitespace-separated token) plus any
	    ORG/MISC entities reported by the NER model when one is available.
	    Soft skills come from whole-term matches against ``SOFT_SKILLS``.

	    Args:
	        text: Text to scan. Empty or ``None`` yields empty lists.

	    Returns:
	        Dict with ``technical``, ``soft`` and ``all`` lists. Each list is
	        de-duplicated and keeps first-seen order.
	    """
	    if not text:
	        return {'technical': [], 'soft': [], 'all': []}

	    text_lower = text.lower()
	    # Tokenise once rather than once per skill.
	    tokens = text_lower.split()

	    def _mentions(term: str) -> bool:
	        # ``\b`` cannot match after a trailing non-word character, so terms
	        # such as 'c++' or 'c#' would never be found. Lookarounds instead
	        # require only that no word character touches either side.
	        return re.search(r'(?<!\w)' + re.escape(term) + r'(?!\w)', text_lower) is not None

	    technical_skills = []
	    for category in TECHNICAL_SKILLS.values():
	        for skill in category:
	            # Exact mention first; fuzzy match catches spelling variations.
	            if _mentions(skill) or any(fuzz.ratio(skill, token) > 85 for token in tokens):
	                technical_skills.append(skill)

	    soft_skills = [skill for skill in SOFT_SKILLS if _mentions(skill)]

	    # Try NER extraction for additional entities
	    ner_model = get_ner_model()
	    if ner_model:
	        try:
	            for entity in ner_model(text[:512]):  # Limit text length
	                if entity['entity_group'] in ('ORG', 'MISC'):
	                    word = entity['word'].lower().strip()
	                    if len(word) > 2 and word not in technical_skills:
	                        technical_skills.append(word)
	        except Exception as e:
	            logging.warning(f"NER extraction warning: {str(e)}")

	    # dict.fromkeys de-duplicates while keeping a stable order.
	    technical = list(dict.fromkeys(technical_skills))
	    soft = list(dict.fromkeys(soft_skills))
	    return {
	        'technical': technical,
	        'soft': soft,
	        'all': list(dict.fromkeys(technical + soft)),
	    }


	# ============================================================================
	# 3. AI-POWERED RESUME SCORING
	# ============================================================================

	def calculate_resume_score(resume_data: Dict, job_description: str = None) -> Dict:
	    """
	    Score a resume on four dimensions, each capped at 25 points.

	    The dimensions are completeness, skills depth, experience quality and
	    job match. When no job description is given, the job-match dimension
	    is derived from the technical skill count. The result holds the total,
	    a letter grade, the per-dimension breakdown, recommendations and the
	    technical/soft skill counts.
	    """
	    # 1. Completeness (0-25 points)
	    required_fields = ['skills', 'experience', 'education', 'phone_number', 'email']
	    absent_fields = [name for name in required_fields if not resume_data.get(name)]
	    present_count = len(required_fields) - len(absent_fields)
	    completeness_score = (present_count / len(required_fields)) * 25

	    # 2. Skills depth (0-25 points)
	    skill_groups = extract_skills_intelligent(resume_data.get('skills', ''))
	    technical_count = len(skill_groups['technical'])
	    soft_count = len(skill_groups['soft'])
	    skills_depth = min(25, (technical_count * 2 + soft_count) * 1.5)

	    # 3. Experience quality (0-25 points)
	    experience = resume_data.get('experience', '')
	    experience_score = 0
	    if experience:
	        # Quantifiable achievements: each run of digits counts once.
	        digit_runs = re.findall(r'\d+', experience)
	        experience_lower = experience.lower()
	        action_verbs = ['developed', 'created', 'managed', 'led', 'implemented', 'designed',
	                        'built', 'optimized', 'increased', 'improved']
	        verb_hits = len([verb for verb in action_verbs if verb in experience_lower])
	        length_bonus = 5 if len(experience) > 100 else 0
	        experience_score = (
	            min(10, len(digit_runs) * 2) + min(10, verb_hits * 2) + length_bonus
	        )

	    # 4. Job match (0-25 points)
	    if job_description:
	        section_names = ['skills', 'experience', 'education', 'certifications']
	        resume_text = ' '.join(str(resume_data.get(name, '')) for name in section_names)
	        job_match_score = semantic_similarity(resume_text, job_description) * 25
	    else:
	        # Default to skills assessment
	        job_match_score = min(25, technical_count * 2)

	    score_breakdown = {
	        'completeness': round(completeness_score, 1),
	        'skills_depth': round(skills_depth, 1),
	        'experience_quality': round(experience_score, 1),
	        'job_match': round(job_match_score, 1),
	    }
	    total_score = sum(score_breakdown.values())

	    # Recommendations are driven by the unrounded dimension scores.
	    recommendations = []
	    if completeness_score < 20:
	        recommendations.append(f"Complete missing sections: {', '.join(absent_fields)}")
	    if skills_depth < 15:
	        recommendations.append("Add more technical skills and certifications")
	    if experience_score < 15:
	        recommendations.append("Use action verbs and quantify achievements (e.g., 'Increased efficiency by 30%')")
	    if technical_count < 5:
	        recommendations.append("List at least 5-7 technical skills relevant to your field")

	    return {
	        'total_score': round(total_score, 1),
	        'grade': get_grade(total_score),
	        'breakdown': score_breakdown,
	        'recommendations': recommendations,
	        'technical_skills_count': technical_count,
	        'soft_skills_count': soft_count
	    }

	def get_grade(score: float) -> str:
	    """Map a numeric score to its letter-grade label."""
	    # Checked from the highest floor down; the first floor reached wins.
	    grade_floors = (
	        (90, 'A+ (Excellent)'),
	        (80, 'A (Very Good)'),
	        (70, 'B (Good)'),
	        (60, 'C (Fair)'),
	    )
	    for floor, label in grade_floors:
	        if score >= floor:
	            return label
	    return 'D (Needs Improvement)'


	# ============================================================================
	# 4. INTERVIEW RESPONSE ANALYSIS
	# ============================================================================

	def analyze_interview_response(question: str, response: str) -> Dict:
	    """
	    Analyze an interview response using simple NLP metrics.

	    The score sums five components: length (up to 20), readability (10 or
	    15), sentiment (10 or 15), STAR structure (5 per element found) and
	    technical content (up to 20).

	    Args:
	        question: The interview question. Not used in the scoring.
	        response: The candidate's answer.

	    Returns:
	        For responses with fewer than 10 non-blank characters: a dict with
	        ``score`` 0, a ``feedback`` message and empty ``metrics``.
	        Otherwise a dict with ``score`` (capped at 100), ``grade``,
	        ``feedback`` (items joined by ' | ') and ``metrics``.
	    """
	    if not response or len(response.strip()) < 10:
	        return {
	            'score': 0,
	            'feedback': 'Response too short. Please provide more detail.',
	            'metrics': {}
	        }

	    metrics = {}

	    # 1. Length analysis
	    word_count = len(response.split())
	    metrics['word_count'] = word_count
	    length_score = min(20, (word_count / 10))  # 200+ words earns the full 20

	    # 2. Readability
	    try:
	        flesch_score = textstat.flesch_reading_ease(response)
	        metrics['readability'] = round(flesch_score, 1)
	        readability_score = 15 if 60 <= flesch_score <= 80 else 10
	    except Exception:
	        # Best effort: a readability failure must not sink the whole analysis.
	        readability_score = 10

	    # 3. Sentiment analysis
	    sentiment_score = 10  # neutral default when no analyzer is available or it fails
	    sentiment_analyzer = get_sentiment_analyzer()
	    if sentiment_analyzer:
	        try:
	            sentiment = sentiment_analyzer(response[:512])[0]
	            metrics['sentiment'] = sentiment['label']
	            metrics['confidence'] = round(sentiment['score'], 2)
	            # Positive sentiment indicates confidence
	            sentiment_score = 15 if sentiment['label'] == 'POSITIVE' else 10
	        except Exception:
	            sentiment_score = 10

	    # 4. Structure analysis (STAR method for behavioral questions)
	    star_keywords = {
	        'situation': ['situation', 'context', 'background', 'scenario'],
	        'task': ['task', 'challenge', 'problem', 'goal', 'objective'],
	        'action': ['action', 'did', 'implemented', 'developed', 'created', 'solved'],
	        'result': ['result', 'outcome', 'achieved', 'improved', 'increased', 'success']
	    }

	    response_lower = response.lower()
	    star_found = {key: any(kw in response_lower for kw in keywords)
	                  for key, keywords in star_keywords.items()}
	    star_elements = sum(star_found.values())
	    structure_score = star_elements * 5
	    metrics['star_method'] = star_found

	    # 5. Technical content (check for technical terms)
	    technical_terms = extract_skills_intelligent(response)
	    technical_found = len(technical_terms['technical'])
	    technical_score = min(20, technical_found * 3)
	    metrics['technical_terms_found'] = technical_found

	    # Total score
	    total_score = length_score + readability_score + sentiment_score + structure_score + technical_score

	    # Generate feedback
	    feedback = []
	    if word_count < 50:
	        feedback.append("Provide more detailed responses (aim for 100-150 words)")
	    if star_elements < 3:
	        feedback.append("Use STAR method: Describe Situation, Task, Action, and Result")
	    if technical_score < 10:
	        feedback.append("Include relevant technical details and specific examples")
	    if not feedback:
	        feedback.append("Great response! Clear, detailed, and well-structured.")

	    return {
	        'score': round(min(100, total_score), 1),
	        'grade': get_grade(total_score),
	        'feedback': ' | '.join(feedback),
	        'metrics': metrics
	    }


	# ============================================================================
	# 5. PREDICTIVE ANALYTICS FOR INTERNSHIP SUCCESS
	# ============================================================================

	class InternshipSuccessPredictor:
	    """
	    Predicts internship application success.

	    Uses a trained classifier once ``train`` has succeeded; until then (or
	    if the classifier fails at prediction time) it falls back to a weighted
	    heuristic over the extracted features.
	    """

	    # Profile fields counted towards the completeness feature.
	    _REQUIRED_FIELDS = ('skills', 'experience', 'education', 'phone_number', 'email')

	    def __init__(self):
	        self.model = None
	        self.scaler = StandardScaler()
	        self.is_trained = False

	    @staticmethod
	    def _split_csv(value) -> List[str]:
	        """Split a comma-separated string into lower-cased, non-empty items."""
	        return [part.strip() for part in (value or '').lower().split(',') if part.strip()]

	    @staticmethod
	    def _heuristic_probability(features: np.ndarray) -> float:
	        """Weighted blend of feature columns 0, 1 and 4 of the single feature row."""
	        row = features[0]
	        # NOTE(review): column 4 is the certifications feature; confirm that
	        # this, not completeness (column 5), is the intended third term.
	        return row[0] * 0.4 + row[1] * 0.3 + row[4] * 0.3

	    def extract_features(self, user_data: Dict, internship_data: Dict) -> np.ndarray:
	        """Build the feature row for one (user, internship) pair.

	        Columns, in order: overall skill similarity, matched-skill fraction,
	        experience match, education score, certifications score,
	        profile completeness, location match.

	        Returns:
	            Array of shape (1, 7).
	        """
	        features = []

	        # 1. Skills match score
	        user_skills = self._split_csv(user_data.get('skills', ''))
	        required_skills = self._split_csv(internship_data.get('skills_required', ''))
	        match_result = enhanced_skill_matching(user_skills, required_skills)
	        features.append(match_result.get('overall_score', 0.0))
	        # .get(): the matcher may omit this key when either skill list is empty.
	        features.append(match_result.get('match_percentage', 0.0) / 100)

	        # 2. Experience match: full credit only when no experience is required.
	        years_required = internship_data.get('years_of_experience', 0)
	        features.append(1.0 if years_required == 0 else 0.5)

	        # 3. Education level
	        education = (user_data.get('education', '') or '').lower()
	        edu_score = 0.7
	        if 'master' in education or 'phd' in education:
	            edu_score = 1.0
	        elif 'bachelor' in education or 'b.s' in education or 'b.e' in education:
	            edu_score = 0.8
	        features.append(edu_score)

	        # 4. Certifications count (three or more earns full credit)
	        cert_count = len(self._split_csv(user_data.get('certifications', '')))
	        features.append(min(1.0, cert_count / 3))

	        # 5. Resume completeness
	        filled = sum(1 for field in self._REQUIRED_FIELDS if user_data.get(field))
	        features.append(filled / len(self._REQUIRED_FIELDS))

	        # 6. Location match (same state/city)
	        user_location = (user_data.get('location', '') or '').lower().strip()
	        job_location = (internship_data.get('location', '') or '').lower().strip()
	        # An empty string is a substring of every string, so both sides must
	        # be non-empty before a substring hit counts as a match.
	        locations_known = bool(user_location) and bool(job_location)
	        overlaps = user_location in job_location or job_location in user_location
	        features.append(1.0 if locations_known and overlaps else 0.5)

	        return np.array(features).reshape(1, -1)

	    def train(self, training_data: pd.DataFrame):
	        """Train the classifier on historical data.

	        Args:
	            training_data: Frame with a ``success`` label column; every other
	                column is used as a feature.

	        Returns:
	            True on success. False when there are fewer than 10 rows or
	            training raises; the error is logged.
	        """
	        if len(training_data) < 10:
	            logging.warning("Insufficient training data for internship predictor")
	            return False

	        try:
	            X = training_data.drop(['success'], axis=1)
	            y = training_data['success']

	            # Use XGBoost if available, otherwise RandomForest
	            if _XGB_AVAILABLE and xgb:
	                self.model = xgb.XGBClassifier(
	                    n_estimators=100,
	                    max_depth=5,
	                    learning_rate=0.1,
	                    random_state=42
	                )
	            else:
	                self.model = RandomForestClassifier(
	                    n_estimators=100,
	                    max_depth=5,
	                    random_state=42
	                )

	            X_scaled = self.scaler.fit_transform(X)
	            self.model.fit(X_scaled, y)
	            self.is_trained = True
	            logging.info("Internship success predictor trained successfully")
	            return True
	        except Exception as e:
	            logging.error(f"Error training predictor: {str(e)}")
	            return False

	    def predict_success_probability(self, user_data: Dict, internship_data: Dict) -> Dict:
	        """Predict the probability of internship application success.

	        Returns:
	            Dict with ``success_probability`` (percentage), ``prediction``
	            ('Likely' or 'Unlikely'), ``confidence`` ('High' or 'Medium')
	            and a ``recommendation`` string.
	        """
	        features = self.extract_features(user_data, internship_data)

	        probability = None
	        prediction = None
	        if self.is_trained and self.model:
	            try:
	                features_scaled = self.scaler.transform(features)
	                probability = self.model.predict_proba(features_scaled)[0][1]
	                prediction = self.model.predict(features_scaled)[0]
	            except Exception as e:
	                logging.warning(f"Model prediction failed, using heuristic: {str(e)}")
	                probability = None

	        if probability is None:
	            # Heuristic-based prediction
	            probability = self._heuristic_probability(features)
	            prediction = 1 if probability > 0.5 else 0

	        confidence = "High" if probability > 0.7 or probability < 0.3 else "Medium"

	        return {
	            'success_probability': round(probability * 100, 1),
	            'prediction': 'Likely' if prediction == 1 else 'Unlikely',
	            'confidence': confidence,
	            'recommendation': self._generate_recommendation(probability, features[0])
	        }

	    def _generate_recommendation(self, probability: float, features: np.ndarray) -> str:
	        """Return advice text for the probability band; ``features`` is not used."""
	        if probability > 0.7:
	            return "Strong match! Apply with confidence."
	        elif probability > 0.5:
	            return "Good match. Consider highlighting relevant projects in your application."
	        elif probability > 0.3:
	            return "Moderate match. Improve skills alignment or consider skill development."
	        else:
	            return "Skills gap detected. Focus on building required skills before applying."


	# ============================================================================
	# 6. PERSONALIZED LEARNING RECOMMENDATIONS
	# ============================================================================

	def generate_learning_path(user_skills: List[str], target_skills: List[str],
	                           career_goal: str = None) -> Dict:
	    """
	    Generate personalized learning recommendations.

	    Args:
	        user_skills: Skills the user already has. ``None`` is treated as empty.
	        target_skills: Skills to acquire, in priority order. ``None`` is
	            treated as empty.
	        career_goal: Not used by the current implementation.

	    Returns:
	        When nothing is missing: a dict with ``status`` 'complete', a
	        message and an empty ``recommendations`` list. Otherwise a dict
	        with ``status``, ``missing_skills_count``, ``categorized_skills``
	        (beginner/intermediate/advanced), ``recommendations`` for the first
	        five missing skills and ``estimated_total_time``.
	    """
	    from urllib.parse import quote_plus

	    user_skills = user_skills or []
	    target_skills = target_skills or []

	    # Keep target order (first occurrence wins) so "top 5" and the priority
	    # labels are deterministic rather than set-iteration dependent.
	    known = set(user_skills)
	    missing_skills = [skill for skill in dict.fromkeys(target_skills) if skill not in known]

	    if not missing_skills:
	        return {
	            'status': 'complete',
	            'message': 'You have all required skills!',
	            'recommendations': []
	        }

	    # Categorize missing skills
	    categorized = {
	        'beginner': [],
	        'intermediate': [],
	        'advanced': []
	    }

	    for skill in missing_skills:
	        skill_lower = skill.lower()
	        # Simple heuristic for difficulty
	        if any(x in skill_lower for x in ['basic', 'intro', 'fundamental']):
	            categorized['beginner'].append(skill)
	        elif any(x in skill_lower for x in ['advanced', 'expert', 'architect']):
	            categorized['advanced'].append(skill)
	        else:
	            categorized['intermediate'].append(skill)

	    # Generate course recommendations
	    recommendations = []
	    top_targets = target_skills[:3]

	    for skill in missing_skills[:5]:  # Top 5 priorities
	        # quote_plus turns spaces into '+' and escapes characters such as
	        # '+', '#' and '&' that would otherwise corrupt the query string.
	        query = quote_plus(skill)
	        recommendations.append({
	            'skill': skill,
	            'resources': [
	                {
	                    'platform': 'Coursera',
	                    'url': f'https://www.coursera.org/search?query={query}',
	                    'type': 'Online Course'
	                },
	                {
	                    'platform': 'Udemy',
	                    'url': f'https://www.udemy.com/courses/search/?q={query}',
	                    'type': 'Video Tutorial'
	                },
	                {
	                    'platform': 'YouTube',
	                    'url': f'https://www.youtube.com/results?search_query={query}+tutorial',
	                    'type': 'Free Tutorial'
	                },
	                {
	                    'platform': 'Documentation',
	                    'url': f'https://www.google.com/search?q={query}+official+documentation',
	                    'type': 'Official Docs'
	                }
	            ],
	            'estimated_time': '2-4 weeks',
	            'priority': 'High' if skill in top_targets else 'Medium'
	        })

	    return {
	        'status': 'learning_path_generated',
	        'missing_skills_count': len(missing_skills),
	        'categorized_skills': categorized,
	        'recommendations': recommendations,
	        'estimated_total_time': f'{len(missing_skills) * 3} weeks'
	    }


	# ============================================================================
	# 7. CONTENT QUALITY ANALYSIS
	# ============================================================================

	def analyze_text_quality(text: str) -> Dict:
	    """
	    Analyze text quality for resumes, cover letters, etc.

	    Starts from 100 and deducts points for surrounding whitespace,
	    readability outside the 30-90 Flesch range, fewer than 50 words,
	    informal words, and missing action verbs in text that mentions
	    'experience'.

	    Args:
	        text: Text to evaluate.

	    Returns:
	        For text with fewer than 10 non-blank characters: a dict with
	        ``score`` 0 and a single issue. Otherwise a dict with ``score``,
	        ``grade``, ``issues`` and ``word_count``.
	    """
	    if not text or len(text.strip()) < 10:
	        return {'score': 0, 'issues': ['Text too short']}

	    issues = []
	    score = 100

	    # 1. Leading or trailing whitespace
	    if text != text.strip():
	        issues.append("Remove extra whitespace")
	        score -= 5

	    # 2. Readability
	    try:
	        flesch = textstat.flesch_reading_ease(text)
	    except Exception as e:
	        # Best effort: skip the readability check but leave a trace.
	        logging.debug(f"Readability check skipped: {str(e)}")
	        flesch = None
	    if flesch is not None:
	        if flesch < 30:
	            issues.append("Text is too complex. Use simpler language.")
	            score -= 10
	        elif flesch > 90:
	            issues.append("Text may be too simple. Add more detail.")
	            score -= 5

	    # 3. Length appropriateness
	    word_count = len(text.split())
	    if word_count < 50:
	        issues.append("Add more content (aim for 100+ words)")
	        score -= 15

	    text_lower = text.lower()
	    # Compare whole words: substring tests would flag 'cooling' for 'cool'
	    # and count 'skilled' as the action verb 'led'.
	    words_present = set(re.findall(r'[a-z]+', text_lower))

	    # 4. Professional tone
	    informal_words = ['gonna', 'wanna', 'yeah', 'cool', 'awesome', 'stuff', 'things']
	    found_informal = [w for w in informal_words if w in words_present]
	    if found_informal:
	        issues.append(f"Use professional language (avoid: {', '.join(found_informal)})")
	        score -= 10

	    # 5. Action verbs presence (for experience sections)
	    action_verbs = ['developed', 'created', 'managed', 'led', 'implemented', 'designed']
	    has_action_verbs = any(verb in words_present for verb in action_verbs)
	    if not has_action_verbs and 'experience' in text_lower:
	        issues.append("Use strong action verbs (developed, created, managed, etc.)")
	        score -= 10

	    return {
	        'score': max(0, score),
	        'grade': get_grade(score),
	        'issues': issues if issues else ['Excellent quality!'],
	        'word_count': word_count
	    }


	# Global predictor instance, created at import time. It starts untrained
	# (is_trained is False), so predict_success_probability uses the heuristic
	# path until train() succeeds.
	predictor = InternshipSuccessPredictor()

	# Emitted once when the module is imported.
	logging.info("ML utilities module loaded successfully")