File size: 16,657 Bytes
08080e4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 |
import os
import re
import spacy
import numpy as np
import language_tool_python
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textstat import textstat
# ==========================================
# 1. ENVIRONMENT & MODEL SETUP
# ==========================================
# LanguageTool needs Java (backend fallback). If the JDK below is installed,
# expose it through JAVA_HOME; adjust the path if your server differs.
_JAVA_HOME_DIR = "/usr/lib/jvm/java-17-openjdk-amd64"
if os.path.exists(_JAVA_HOME_DIR):
    os.environ["JAVA_HOME"] = _JAVA_HOME_DIR

print("Loading models... this may take a moment.")

# spaCy pipeline: spacy.load raises OSError when the model package is missing,
# in which case it is downloaded once and the load is retried.
_SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(_SPACY_MODEL)
except OSError:
    print("Spacy model not found. Downloading...")
    spacy.cli.download(_SPACY_MODEL)
    nlp = spacy.load(_SPACY_MODEL)

# Shared model instances used by the scoring code below.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')   # sentence embeddings
grammar_tool = language_tool_python.LanguageTool('en-US')  # grammar checker
sentiment_analyzer = SentimentIntensityAnalyzer()        # VADER sentiment

# CrossEncoder for potentially deeper NLI tasks (loaded for
# future-proofing/robustness).
nli_model = CrossEncoder('cross-encoder/stsb-distilroberta-base')

print("Models loaded successfully.")
# ==========================================
# 2. SCORING RUBRIC CONFIGURATION
# ==========================================
# Scoring rubric: plain data describing what the scorer looks for.
RUBRIC = {
    # Greeting phrases grouped by quality tier (all lower-case).
    "salutation": {
        "normal": ["hi", "hello"],
        "good": [
            "good morning",
            "good afternoon",
            "good evening",
            "good day",
            "hello everyone",
        ],
        "excellent": [
            "excited to introduce",
            "feeling great",
            "pleasure to introduce",
            "greetings",
        ],
    },
    # Content topics and the points awarded for each topic that is covered.
    "content": {
        "must_have": {
            "points": 4,
            "topics": [
                "Name",
                "Age",
                "School/Class",
                "Family",
                "Hobbies/Interests",
            ],
        },
        "good_to_have": {
            "points": 2,
            "topics": [
                "Origin/Location",
                "Ambition/Goal",
                "Fun Fact/Unique",
                "Strengths",
                "Achievements",
            ],
        },
    },
    # Speech-rate thresholds, in words per minute.
    "speech_rate": {
        "fast_threshold": 160,
        "ideal_min": 111,
        "ideal_max": 140,
        "slow_threshold": 80,
    },
    # Filler words and phrases counted against clarity.
    "fillers": [
        "um",
        "uh",
        "like",
        "you know",
        "actually",
        "basically",
        "right",
        "i mean",
        "well",
        "kinda",
        "sort of",
        "hmm",
    ],
}
# ==========================================
# 3. MAIN LOGIC CLASS
# ==========================================
class IntroductionScorer:
    """Score a self-introduction transcript on eight rubric criteria.

    Each ``score_*`` method returns a ``(score, feedback)`` tuple;
    ``calculate_overall_score`` runs all of them and sums the scores.

    The class relies on module-level objects defined elsewhere in this file:
    ``nlp`` (spaCy pipeline), ``sbert_model`` and ``util`` (sentence
    embeddings / cosine similarity), ``grammar_tool`` (LanguageTool),
    ``sentiment_analyzer`` (VADER) and the ``RUBRIC`` dictionary.
    """

    # Inclusive words-per-minute bands used by score_speech_rate.
    _IDEAL_WPM = (111, 140)
    _ACCEPTABLE_WPM = (81, 160)

    # A grammar match is not counted as an error when its message or rule id
    # contains one of these substrings (spelling, punctuation, style, ...).
    _GRAMMAR_IGNORE_KEYWORDS = (
        "hyphen", "compound", "joined", "whitespace", "comma", "punctuation",
        "spelling", "typo", "morfologik", "uppercase", "capitalization",
        "repetition", "consecutive", "successive", "same word",
        "style", "wordiness", "sentence start", "rewording", "thesaurus",
    )

    # Vocabulary that must be present for the top engagement band.
    _HIGH_ENERGY_KEYWORDS = (
        "excited", "thrilled", "passionate", "delighted", "honor",
        "love", "amazing", "wonderful", "fantastic", "energetic",
        "grateful", "confident", "pleasure",
    )

    def __init__(self, transcript_text, audio_duration_sec=None):
        """Parse the transcript and pre-compute sentence and word lists.

        Args:
            transcript_text: The spoken introduction as text. ``None`` is
                treated as an empty transcript.
            audio_duration_sec: Length of the recording in seconds. A falsy
                value (``None``, ``0``, ``""``) means "not provided".
        """
        self.text = transcript_text or ""
        self.doc = nlp(self.text)
        self.provided_duration = float(audio_duration_sec) if audio_duration_sec else 0
        self.duration_min = (self.provided_duration / 60) if self.provided_duration else 0
        self.sentences = [sent.text.strip() for sent in self.doc.sents]
        # Whitespace tokens (newlines, repeated spaces) are not words; counting
        # them would inflate every per-word metric.
        self.words = [
            token.text.lower()
            for token in self.doc
            if not (token.is_punct or token.is_space)
        ]
        self.total_words = len(self.words)
        # Filled lazily by _sentence_embeddings().
        self._sentence_emb = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _count_phrases(text, phrases):
        """Count non-overlapping whole-word occurrences of ``phrases`` in ``text``.

        Matching is case-insensitive, any run of whitespace may separate the
        words of a multi-word phrase, and a phrase never matches inside a
        longer word (``"hi"`` does not match ``"this"``).
        """
        alternatives = [
            r"\s+".join(re.escape(part) for part in phrase.split())
            # Longest first so a long phrase wins over a shorter one that
            # starts at the same position.
            for phrase in sorted(phrases, key=len, reverse=True)
            if phrase.strip()
        ]
        if not alternatives:
            return 0
        pattern = re.compile(
            r"(?<!\w)(?:" + "|".join(alternatives) + r")(?!\w)",
            re.IGNORECASE,
        )
        return len(pattern.findall(text))

    @staticmethod
    def _match_field(match, names, default):
        """Return the first non-None attribute of ``match`` among ``names``."""
        for name in names:
            value = getattr(match, name, None)
            if value is not None:
                return value
        return default

    def _sentence_embeddings(self):
        """Return embeddings of ``self.sentences``, encoding them only once."""
        cached = getattr(self, "_sentence_emb", None)
        if cached is None:
            cached = sbert_model.encode(self.sentences, convert_to_tensor=True)
            self._sentence_emb = cached
        return cached

    def _topic_present(self, regex, anchors, use_ai=True):
        """Tell whether a topic is mentioned in the transcript.

        A case-insensitive regex search runs first. If it fails and ``use_ai``
        is set, the topic counts as present when any sentence has a cosine
        similarity above 0.35 with any of the ``anchors`` phrases.
        """
        if re.search(regex, self.text, re.IGNORECASE):
            return True
        if use_ai and anchors and self.sentences:
            topic_emb = sbert_model.encode(anchors, convert_to_tensor=True)
            best_score = float(
                util.cos_sim(self._sentence_embeddings(), topic_emb).max()
            )
            return best_score > 0.35
        return False

    def _match_span(self, match):
        """Return ``(offset, length)`` of the text flagged by a grammar match."""
        offset = self._match_field(match, ("offset",), 0)
        # Accept both attribute spellings; fall back to 5 characters.
        length = self._match_field(
            match, ("errorLength", "error_length", "length"), 5
        )
        return offset, length

    def _is_ignorable_grammar_match(self, match):
        """Tell whether a grammar match is a non-scoring (stylistic) issue."""
        offset, length = self._match_span(match)
        flagged = self.text[offset:offset + length]

        # Pure hyphenation suggestion: the top replacement equals the flagged
        # text once hyphens and spaces are removed.
        replacements = self._match_field(match, ("replacements",), [])
        if replacements:
            top_rep = replacements[0]
            if "-" in top_rep and top_rep.replace("-", "") == flagged.replace(" ", ""):
                return True

        rule_id = str(self._match_field(match, ("ruleId", "rule_id"), "")).lower()
        message = str(self._match_field(match, ("message",), "")).lower()
        return any(
            keyword in message or keyword in rule_id
            for keyword in self._GRAMMAR_IGNORE_KEYWORDS
        )

    def _describe_grammar_match(self, match):
        """Format one grammar match as a feedback bullet with text context."""
        offset, length = self._match_span(match)
        # Show the flagged text plus 10 trailing characters.
        ctx = self.text[offset:offset + length + 10].replace('\n', ' ')
        msg = getattr(match, 'message', 'Issue')
        return f" - {msg} (Context: '...{ctx}...')"

    # ------------------------------------------------------------------
    # Individual criteria
    # ------------------------------------------------------------------
    def score_salutation(self):
        """Score the greeting: 5 (excellent), 4 (good), 2 (basic) or 0 (none).

        Tiers are checked from best to worst and the first phrase found wins.
        Phrases are matched as whole words, so "hi" inside "this" or
        "children" does not count as a greeting.
        """
        greetings = RUBRIC["salutation"]
        for phrase in greetings["excellent"]:
            if self._count_phrases(self.text, [phrase]):
                return 5, f"Excellent salutation used: '{phrase}'"
        for phrase in greetings["good"]:
            if self._count_phrases(self.text, [phrase]):
                return 4, f"Good salutation used: '{phrase}'"
        for word in greetings["normal"]:
            if self._count_phrases(self.text, [word]):
                return 2, "Basic salutation used (Hi/Hello). Try to be more formal."
        return 0, "No salutation found."

    def score_content(self):
        """Score topic coverage, capped at 30.

        Five must-have topics are worth 4 points each and five bonus topics
        2 points each. Feedback lists ``[+] Topic`` / ``[-] Topic`` for the
        must-haves and ``[+] Topic`` for each bonus topic that was found.
        """
        scores = 0
        feedback = []

        # NOTE: re.IGNORECASE also applies to the trailing [A-Z], so any
        # letter after the cue satisfies it; kept as-is so lower-cased
        # transcripts still match.
        regex_name = r"\b(name\s+is|i\s+am|i[\s'’]*m|myself|this\s+is)\s+([A-Z])"
        regex_age = r"\b(\d+|thirteen|fourteen|fifteen|sixteen)\s*(-)?\s*(years|yrs)\b"
        regex_school = r"\b(class|grade|standard|school|college|university|study|student)\b"

        # (label, regex, semantic anchors, use embeddings?)
        must_have = (
            ("Name", regex_name, [], False),
            ("Age", regex_age, [], False),
            ("School", regex_school, [], False),
            ("Family", r"\b(family|parents|mother|father|siblings)\b",
             ["My family", "I live with"], True),
            ("Hobbies", r"\b(hobby|hobbies|enjoy|like\s+(to|playing|reading)|pastime)\b",
             ["My hobby is", "I enjoy"], True),
        )
        for label, pattern, anchors, use_ai_flag in must_have:
            if self._topic_present(pattern, anchors, use_ai=use_ai_flag):
                scores += 4
                feedback.append(f"[+] {label}")
            else:
                feedback.append(f"[-] {label}")

        bonuses = (
            ("Ambition", r"\b(goal|ambition|dream|want\s+to\s+be)\b", ["I want to become"], True),
            ("Strength", r"\b(strength|good\s+at|confident)\b", ["My strength is"], True),
            ("Unique", r"\b(unique|special|fun\s+fact)\b", ["fun fact"], True),
            ("Origin", r"\b(i\s+am\s+from|i['’]m\s+from|originally\s+from|live\s+in|living\s+in|born\s+in|hometown|native)\b", [], False),
            ("Achievements", r"\b(won|achievement|award)\b", ["I won"], True),
        )
        for label, pattern, anchors, use_ai_flag in bonuses:
            if self._topic_present(pattern, anchors, use_ai=use_ai_flag):
                scores += 2
                feedback.append(f"[+] {label}")

        return min(30, scores), ", ".join(feedback)

    def score_flow(self):
        """Score ordering of salutation, introduction and closing (5 or 0).

        For each part, the sentence most similar to that part's anchor phrases
        is located; the part counts as present when the similarity exceeds
        its threshold. Full marks require a salutation and a closing, with
        the introduction (when detected) at or after the salutation and
        before the closing.
        """
        anchors = {
            "salutation": ["Hello everyone", "Good morning", "Hi", "Greetings"],
            "intro": ["My name is", "I am", "I'm", "I’m", "Myself", "This is"],
            "closing": ["Thank you", "Thanks", "That is all", "The end"],
            "body": ["family", "mother", "school", "class", "hobby", "playing", "dream", "goal"],
        }
        if not self.sentences:
            return 0, "No text"

        text_emb = self._sentence_embeddings()

        def get_idx(key, thresh=0.25):
            """Return (index of best-matching sentence, similarity > thresh)."""
            anc = sbert_model.encode(anchors[key], convert_to_tensor=True)
            sims = util.cos_sim(text_emb, anc).max(dim=1).values
            return int(sims.argmax()), float(sims.max()) > thresh

        idx_s, has_s = get_idx("salutation", 0.25)
        idx_i, has_i = get_idx("intro", 0.25)
        idx_c, has_c = get_idx("closing", 0.30)

        # Is there topical content strictly between intro and closing?
        has_body = False
        if has_i and has_c and idx_c > idx_i:
            mid_sents = self.sentences[idx_i + 1:idx_c]
            if mid_sents:
                mid_emb = sbert_model.encode(mid_sents, convert_to_tensor=True)
                bod_emb = sbert_model.encode(anchors["body"], convert_to_tensor=True)
                has_body = float(util.cos_sim(mid_emb, bod_emb).max()) > 0.25

        debug_info = f"(Indices: Sal={idx_s if has_s else 'X'}, Intro={idx_i if has_i else 'X'}, End={idx_c if has_c else 'X'})"

        if has_s and has_c:
            if has_i:
                if idx_s <= idx_i < idx_c:
                    return (5, "Perfect Flow") if has_body else (5, "Good Flow (Short body)")
                if idx_i == idx_c:
                    return 0, f"Disordered: Introduction and Closing are detected in same sentence. {debug_info}"
            # NOTE(review): nesting reconstructed from a copy of this file
            # that had lost its indentation; this branch is assumed to pair
            # with `if has_i` (no introduction detected) -- confirm.
            elif idx_s < idx_c:
                return (5, "Good Flow") if has_body else (5, "Acceptable Flow")
        return 0, f"Flow disordered. {debug_info}"

    def score_speech_rate(self):
        """Score pacing in words per minute: 10 (ideal), 6 (acceptable) or 2.

        When no usable (positive) duration is available the rate cannot be
        computed and the ideal score is assumed.
        """
        if not self.provided_duration or self.duration_min <= 0:
            return 10, "Duration not provided (Assumed Ideal)"

        wpm = self.total_words / self.duration_min
        ideal_low, ideal_high = self._IDEAL_WPM
        ok_low, ok_high = self._ACCEPTABLE_WPM
        if ideal_low <= wpm <= ideal_high:
            return 10, f"Ideal ({int(wpm)} WPM)"
        if ok_low <= wpm <= ok_high:
            return 6, f"Acceptable ({int(wpm)} WPM)"
        if wpm > ok_high:
            return 2, f"Too Fast ({int(wpm)} WPM)"
        return 2, f"Too Slow ({int(wpm)} WPM)"

    def score_grammar(self):
        """Score grammar from LanguageTool matches (2-10).

        Matches about spelling, hyphenation, punctuation and style are
        listed but not counted. The score is derived from counted errors per
        100 words, where 5 or more errors per 100 words gives the lowest
        band. Any exception raised while checking yields a neutral
        ``(5, "Error during grammar check: ...")``.
        """
        # Deliberately broad: a grammar-backend failure must not abort the
        # whole scoring run.
        try:
            matches = grammar_tool.check(self.text)

            scoring_errors = []
            ignored_issues = []
            for match in matches:
                if self._is_ignorable_grammar_match(match):
                    ignored_issues.append(match)
                else:
                    scoring_errors.append(match)

            err_count = len(scoring_errors)
            errors_per_100 = (err_count / self.total_words) * 100 if self.total_words > 0 else 0
            # Conservative penalty: 1.0 = no errors, 0.0 = 5+ errors / 100 words.
            grammar_metric = 1 - min(errors_per_100 / 5, 1)

            if grammar_metric > 0.9:
                s, g = 10, "Flawless"
            elif grammar_metric >= 0.7:
                s, g = 8, "Good"
            elif grammar_metric >= 0.5:
                s, g = 6, "Average"
            elif grammar_metric >= 0.3:
                s, g = 4, "Needs Improvement"
            else:
                s, g = 2, "Poor"

            fb_lines = [
                f"{g} (Score: {s}/10)",
                "NOTE: Spelling, hyphens, punctuation, and style ignored.",
            ]
            if scoring_errors:
                fb_lines.append(f"\n[CRITICAL GRAMMAR ERRORS] ({len(scoring_errors)} found):")
                # Only the first three are shown.
                fb_lines.extend(
                    self._describe_grammar_match(m) for m in scoring_errors[:3]
                )
            else:
                fb_lines.append("\n[CRITICAL GRAMMAR ERRORS]: None.")

            if ignored_issues:
                fb_lines.append(f"\n[IGNORED ISSUES] ({len(ignored_issues)} found):")
                fb_lines.extend(
                    self._describe_grammar_match(m) for m in ignored_issues[:3]
                )

            return s, "\n".join(fb_lines)
        except Exception as e:
            return 5, f"Error during grammar check: {str(e)}"

    def score_vocabulary(self):
        """Score lexical variety by type-token ratio (distinct / total words)."""
        distinct_words = len(set(self.words))
        ttr = distinct_words / self.total_words if self.total_words > 0 else 0

        # (minimum TTR, score, label), best band first.
        bands = (
            (0.9, 10, "Excellent variety"),
            (0.7, 8, "Good variety"),
            (0.5, 6, "Average variety"),
            (0.3, 4, "Repetitive"),
        )
        for floor, score, label in bands:
            if ttr >= floor:
                return score, f"{label} (TTR: {ttr:.2f})"
        return 2, f"Very repetitive (TTR: {ttr:.2f})"

    def score_clarity(self):
        """Score clarity from the share of filler words (3-15).

        Fillers come from ``RUBRIC["fillers"]`` and are counted as whole
        words/phrases in the transcript, so multi-word fillers such as
        "you know" are detected. The rate is fillers per 100 words.
        """
        filler_count = self._count_phrases(self.text, RUBRIC["fillers"])
        filler_rate = (filler_count / self.total_words) * 100 if self.total_words > 0 else 0

        # (maximum filler rate, score, label), best band first.
        bands = (
            (3, 15, "Clear speech"),
            (6, 12, "Mostly clear"),
            (9, 9, "Some hesitation"),
            (12, 6, "Hesitant"),
        )
        for ceiling, score, label in bands:
            if filler_rate <= ceiling:
                return score, f"{label} ({filler_count} fillers)"
        return 3, f"Distracted by fillers ({filler_count} fillers)"

    def score_engagement(self):
        """Score engagement from VADER sentiment (3-15).

        The compound score in [-1, 1] is rescaled to [0, 1]. The top band
        additionally requires at least one high-energy keyword in the text;
        without one the value is capped at 0.88.
        """
        vs = sentiment_analyzer.polarity_scores(self.text)
        prob = (vs['compound'] + 1) / 2

        text_lower = self.text.lower()
        has_enthusiasm = any(w in text_lower for w in self._HIGH_ENERGY_KEYWORDS)
        if prob >= 0.9 and not has_enthusiasm:
            prob = 0.88

        if prob >= 0.9:
            return 15, f"Very Engaging (Sentiment: {prob:.2f})"
        elif prob >= 0.7:
            return 12, f"Positive (Sentiment: {prob:.2f})"
        elif prob >= 0.5:
            return 9, f"Neutral (Sentiment: {prob:.2f})"
        elif prob >= 0.3:
            return 6, f"Slightly Negative (Sentiment: {prob:.2f})"
        else:
            return 3, f"Negative (Sentiment: {prob:.2f})"

    # ------------------------------------------------------------------
    # Aggregate
    # ------------------------------------------------------------------
    def calculate_overall_score(self):
        """Run every criterion and return the total plus a per-criterion breakdown.

        Returns:
            dict with ``"Total Score"`` (sum of all criterion scores) and
            ``"Breakdown"``, mapping each criterion name to its ``score``,
            ``max`` and ``feedback``.
        """
        s_salutation, f_salutation = self.score_salutation()
        s_content, f_content = self.score_content()
        s_flow, f_flow = self.score_flow()
        s_rate, f_rate = self.score_speech_rate()
        s_grammar, f_grammar = self.score_grammar()
        s_vocab, f_vocab = self.score_vocabulary()
        s_clarity, f_clarity = self.score_clarity()
        s_engage, f_engage = self.score_engagement()

        total_score = (
            s_salutation + s_content + s_flow + s_rate +
            s_grammar + s_vocab + s_clarity + s_engage
        )

        return {
            "Total Score": total_score,
            "Breakdown": {
                "Salutation": {"score": s_salutation, "max": 5, "feedback": f_salutation},
                "Content & Structure": {"score": s_content, "max": 30, "feedback": f_content},
                "Flow": {"score": s_flow, "max": 5, "feedback": f_flow},
                "Speech Rate": {"score": s_rate, "max": 10, "feedback": f_rate},
                "Grammar": {"score": s_grammar, "max": 10, "feedback": f_grammar},
                "Vocabulary": {"score": s_vocab, "max": 10, "feedback": f_vocab},
                "Clarity (Fillers)": {"score": s_clarity, "max": 15, "feedback": f_clarity},
                "Engagement": {"score": s_engage, "max": 15, "feedback": f_engage},
            },
        }