Chaitanya-aitf committed
Commit 546345d · verified · 1 Parent(s): 7727077

Create prompts.py

Files changed (1)
  1. prompts.py +320 -0
prompts.py ADDED
@@ -0,0 +1,320 @@
+ """
+ Prompt templates for Gemini 2.5 Flash API interactions
+ Temperature settings and structured prompts for different analysis stages
+ """
+
+ CLAIM_EXTRACTION_PROMPT = """
+ You are an expert technical recruiter analyzing a CV for factual claims and credibility.
+ Temperature: 0.1 for precision
+
+ Section Type: {section_type}
+ Section Text: {section_text}
+ Seniority Level: {seniority_level}
+
+ Task: Extract ALL factual claims with extreme precision. Focus on:
+
+ 1. Work Experience Claims:
+    - Job titles, companies, dates, team sizes
+    - Specific responsibilities and technologies used
+    - Quantifiable achievements (metrics, percentages, timelines)
+    - Leadership/architectural claims
+
+ 2. Project Claims:
+    - Project names, descriptions, outcomes
+    - Technical stack used, deployment status
+    - Team role and contribution level
+    - Measurable results (users, performance gains, cost savings)
+
+ 3. Skills Claims (EXCLUDING EDUCATION):
+    - Programming languages with proficiency levels
+    - Frameworks, tools, platforms
+    - Certifications with dates
+    - Domain expertise claims
+
+ 4. Research/Publication Claims:
+    - Paper titles, conference venues, citations
+    - SOTA claims with specific metrics
+    - Patents, open-source contributions
+
+ Output JSON format:
+ {{
+   "claims": [
+     {{
+       "claim_id": "unique_id",
+       "claim_text": "exact text from CV",
+       "category": "work_experience|project|skill|research",
+       "subcategory": "specific_type",
+       "quantifiable_metrics": ["list of numbers/percentages/dates"],
+       "technologies_mentioned": ["tech1", "tech2"],
+       "time_period": {{
+         "start_date": "YYYY-MM or null",
+         "end_date": "YYYY-MM or null",
+         "duration_months": "number or null"
+       }},
+       "seniority_claim": "junior|mid|senior|lead|architect|none",
+       "verifiability_level": "high|medium|low",
+       "evidence_present": "direct|contextual|none",
+       "links_artifacts": ["URLs if any"],
+       "needs_clarification": ["specific points to verify"]
+     }}
+   ],
+   "metadata": {{
+     "total_claims": "number",
+     "buzzword_density": "0.0-1.0",
+     "specificity_score": "0.0-1.0"
+   }}
+ }}
+
+ IMPORTANT:
+ - Extract ONLY explicit claims, not inferences
+ - Mark vague claims ("worked on cutting-edge AI") with low verifiability
+ - Flag role-achievement mismatches for interview
+ - Note if metrics seem unrealistic for timeframe
+ - Skip education verification completely
+ """
+
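+ # Usage sketch (not part of the original pipeline): one way to run this
+ # template at the temperature noted above, assuming the google-generativeai
+ # client. The model name here is illustrative.
+ def extract_claims(section_type: str, section_text: str, seniority_level: str) -> str:
+     """Format CLAIM_EXTRACTION_PROMPT and return the model's raw JSON text."""
+     import google.generativeai as genai  # deferred so importing this module has no side effects
+     prompt = CLAIM_EXTRACTION_PROMPT.format(
+         section_type=section_type,
+         section_text=section_text,
+         seniority_level=seniority_level,
+     )
+     model = genai.GenerativeModel("gemini-2.5-flash")
+     response = model.generate_content(prompt, generation_config={"temperature": 0.1})
+     return response.text
+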
+ EVIDENCE_VALIDATION_PROMPT = """
+ You are validating evidence for CV claims.
+ Temperature: 0.2 for balanced analysis
+
+ Claims to validate:
+ {claims_json}
+
+ Full CV text for cross-reference:
+ {full_cv_text}
+
+ For EACH claim, assess:
+
+ 1. Direct Evidence:
+    - Links to repositories, portfolios, demos (check if provided)
+    - Certificates, publications (with identifiers)
+    - Company/project websites mentioned
+
+ 2. Contextual Evidence:
+    - Technical depth in descriptions
+    - Specific tool versions, configurations
+    - Problem-solution narratives with details
+
+ 3. Cross-Section Validation:
+    - Skills mentioned MUST appear in at least one project/work entry
+    - Dates must be consistent across sections
+    - Technologies should align with timeframes (no React before 2013)
+
+ 4. Metric Sanity:
+    - Is "500% growth in 1 month" realistic?
+    - Do team sizes match achievement scope?
+    - Are research metrics within known SOTA bounds?
+
+ Output JSON:
+ {{
+   "validations": [
+     {{
+       "claim_id": "from_input",
+       "evidence_score": 0.0-1.0,
+       "evidence_type": "direct|contextual|cross_referenced|missing",
+       "supporting_sections": ["list of CV sections with evidence"],
+       "artifacts_found": [
+         {{
+           "type": "github|publication|certificate|website",
+           "url": "if_present",
+           "needs_verification": true/false
+         }}
+       ],
+       "cross_validation": {{
+         "skill_used_in_project": true/false,
+         "dates_consistent": true/false,
+         "tech_timeline_valid": true/false
+       }},
+       "metric_analysis": {{
+         "realistic": true/false,
+         "explanation": "why realistic or not"
+       }},
+       "triangulation_result": "verified|partial|unverified|red_flag"
+     }}
+   ],
+   "consistency_score": 0.0-1.0
+ }}
+ """
+
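+ # Illustrative helper (an assumption, not in the original file) for the
+ # cross-section rule above: every claimed skill should surface in at least
+ # one work/project description.
+ def skills_missing_from_projects(skills, project_texts):
+     """Return claimed skills that never appear in any project/work section."""
+     corpus = " ".join(project_texts).lower()
+     return [skill for skill in skills if skill.lower() not in corpus]
+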
+ RED_FLAG_DETECTION_PROMPT = """
+ You are detecting credibility red flags in CV claims.
+ Temperature: 0.2 for pattern detection
+
+ Analyzed claims with validation:
+ {validated_claims_json}
+
+ Seniority Level: {seniority_level}
+ Role Type: {role_type}
+
+ Detect these RED FLAGS:
+
+ 1. Role-Achievement Mismatch:
+    - "Led/Architected" in junior roles or <6 month tenure
+    - Senior achievements with entry-level titles
+    - Sole credit for large team projects
+
+ 2. Timeline Issues:
+    - Overlapping full-time positions
+    - Technologies used before public release
+    - Impossibly short project durations for scope
+
+ 3. Metric Implausibility:
+    - Extreme percentages without context (500%+ improvements)
+    - SOTA claims exceeding published benchmarks
+    - Unrealistic user numbers or scale claims
+
+ 4. Vagueness Patterns:
+    - High buzzword density without specifics
+    - Missing metrics on all achievements
+    - No technical depth for "expert" claims
+
+ 5. Over-claiming:
+    - Too many "expert" level skills (>15)
+    - All projects "successful" with no challenges
+    - Pattern of superlatives without evidence
+
+ Output JSON:
+ {{
+   "red_flags": [
+     {{
+       "flag_id": "unique_id",
+       "severity": "critical|high|medium|low",
+       "category": "timeline|implausible|vague|overclaim|mismatch",
+       "affected_claims": ["claim_ids"],
+       "description": "specific issue",
+       "interview_probe": "suggested question to clarify",
+       "auto_reject": false,
+       "requires_proof": true/false
+     }}
+   ],
+   "credibility_score": 0-100,
+   "seniority_adjustment": "applied adjustment based on level",
+   "risk_assessment": "low|medium|high|critical"
+ }}
+ """
+
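+ # Parsing sketch (an assumption about response shape: Gemini often wraps JSON
+ # output in markdown fences, so strip them before json.loads).
+ def parse_model_json(raw: str) -> dict:
+     """Extract the JSON object from a model response, tolerating ``` fences."""
+     import json
+     import re
+     cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip())
+     return json.loads(cleaned)
+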
+ SOTA_VERIFICATION_PROMPT = """
+ You are verifying research and technical achievement claims against known benchmarks.
+ Temperature: 0.1 for factual accuracy
+
+ Research/Technical claims:
+ {research_claims_json}
+
+ Verify against known SOTA (State-of-the-Art) as of {current_date}:
+
+ For each claim:
+ 1. Identify the benchmark/dataset/metric
+ 2. Check if numbers exceed published SOTA
+ 3. Look for required context (dataset, conditions, hardware)
+ 4. Assess if improvement magnitude is plausible
+
+ Known SOTA baselines to reference:
+ - ImageNet top-1 accuracy: ~92% (2024)
+ - BERT-large F1 on SQuAD 1.1: ~93%
+ - GPT-3 perplexity: varies by dataset
+ - Object detection mAP: ~60-65% on COCO
+
+ Output JSON:
+ {{
+   "sota_validations": [
+     {{
+       "claim_id": "from_input",
+       "benchmark": "identified benchmark/dataset",
+       "claimed_metric": "number",
+       "known_sota": "published baseline",
+       "exceeds_sota": true/false,
+       "has_context": true/false,
+       "missing_details": ["dataset", "evaluation protocol", "hardware"],
+       "plausibility": "plausible|unlikely|impossible",
+       "verification_status": "needs_clarification|likely_valid|red_flag",
+       "interview_questions": ["specific technical questions"]
+     }}
+   ]
+ }}
+ """
+
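+ # Usage note (values illustrative): {current_date} anchors the SOTA comparison
+ # in time, e.g.
+ #   from datetime import date
+ #   prompt = SOTA_VERIFICATION_PROMPT.format(
+ #       research_claims_json=claims_json,  # hypothetical pre-serialized claims
+ #       current_date=date.today().isoformat(),
+ #   )
+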
+ REPOSITORY_ANALYSIS_PROMPT = """
+ Analyze repository evidence for verification.
+ Temperature: 0.1
+
+ Repository URL: {repo_url}
+ Repository metrics: {repo_metrics}
+ Claimed contributions: {claimed_contributions}
+
+ Assess:
+ 1. Commit density and authorship
+ 2. First commit vs claim date alignment
+ 3. README quality and documentation depth
+ 4. Issues/PRs linked to claimed features
+ 5. Code complexity matching claimed scope
+ 6. Dependencies matching claimed tech stack
+
+ Output credibility score and specific findings.
+ """
+
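+ # Sketch for gathering the {repo_metrics} input (an assumption about sourcing;
+ # the original pipeline may collect these differently). Uses the public
+ # GitHub REST API, unauthenticated, via the third-party requests package.
+ def fetch_repo_metrics(owner: str, repo: str) -> dict:
+     """Pull basic repository stats from the GitHub REST API."""
+     import requests
+     resp = requests.get(f"https://api.github.com/repos/{owner}/{repo}", timeout=10)
+     resp.raise_for_status()
+     data = resp.json()
+     return {
+         "stars": data["stargazers_count"],
+         "forks": data["forks_count"],
+         "open_issues": data["open_issues_count"],
+         "created_at": data["created_at"],
+         "pushed_at": data["pushed_at"],
+     }
+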
+ # Scoring calibration parameters
+ SCORING_CONFIG = {
+     "weights": {
+         "credibility": 0.6,
+         "consistency": 0.4
+     },
+     "seniority_thresholds": {
+         "senior": {
+             "min_evidence_score": 0.7,
+             "max_buzzword_density": 0.2,
+             "min_specificity": 0.8
+         },
+         "mid": {
+             "min_evidence_score": 0.5,
+             "max_buzzword_density": 0.3,
+             "min_specificity": 0.6
+         },
+         "junior": {
+             "min_evidence_score": 0.3,
+             "max_buzzword_density": 0.4,
+             "min_specificity": 0.4
+         },
+         "intern": {
+             "min_evidence_score": 0.2,
+             "max_buzzword_density": 0.5,
+             "min_specificity": 0.3
+         }
+     },
+     "evidence_tier_weights": {
+         "doi_arxiv": 1.0,
+         "github_active": 0.9,
+         "company_blog": 0.8,
+         "personal_blog": 0.6,
+         "no_artifact": 0.3
+     },
+     "red_flag_severity_scores": {
+         "critical": -30,
+         "high": -20,
+         "medium": -10,
+         "low": -5
+     }
+ }
+
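+ # Scoring sketch (a hypothetical reading of the config above, not a confirmed
+ # formula): blend the component scores by the configured weights, then apply
+ # the negative red-flag adjustments.
+ def final_score(credibility: float, consistency: float, red_flags: list) -> float:
+     """Combine 0-100 component scores and apply red-flag penalties."""
+     w = SCORING_CONFIG["weights"]
+     score = w["credibility"] * credibility + w["consistency"] * consistency
+     for flag in red_flags:
+         score += SCORING_CONFIG["red_flag_severity_scores"].get(flag["severity"], 0)
+     return max(0.0, min(100.0, score))
+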
+ # Bias mitigation configuration
+ FAIRNESS_CONFIG = {
+     "protected_attributes": [
+         "school", "university", "college", "age", "gender",
+         "nationality", "ethnicity", "religion", "marital status"
+     ],
+     "pii_patterns": {
+         # anchored on digits at both ends so whitespace alone never matches
+         "phone": r"\+?\d[\d\s\-()]{6,}\d",
+         "email": r"[\w\.-]+@[\w\.-]+\.\w+",
+         "address": r"\d+\s+[\w\s,]+\d{5}",
+         "ssn": r"\d{3}-\d{2}-\d{4}"
+     }
+ }
+
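+ # Redaction sketch (assumed usage of the patterns above): mask PII before a
+ # CV leaves the system. Note these patterns are broad and may over-match;
+ # tighten them for production use.
+ def redact_pii(text: str) -> str:
+     """Replace matches of each PII pattern with a [REDACTED_<TYPE>] token."""
+     import re
+     for name, pattern in FAIRNESS_CONFIG["pii_patterns"].items():
+         text = re.sub(pattern, f"[REDACTED_{name.upper()}]", text)
+     return text
+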
+ # Interview question templates
+ INTERVIEW_TEMPLATES = {
+     "unverified_claim": "You mentioned {claim}. Can you provide more details about {specific_aspect}?",
+     "metric_clarification": "You achieved {metric}. What was the baseline and methodology?",
+     "timeline_gap": "Can you walk me through your activities between {start} and {end}?",
+     "tech_depth": "You listed {technology} expertise. Can you describe a specific challenge you solved with it?",
+     "sole_credit": "You mentioned {achievement}. Who else was involved and what was your specific contribution?",
+     "sota_claim": "Your research shows {metric} performance. How does this compare to published baselines?"
+ }
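+
+ # Usage example (values illustrative):
+ #   INTERVIEW_TEMPLATES["metric_clarification"].format(metric="40% latency reduction")
+ #   -> "You achieved 40% latency reduction. What was the baseline and methodology?"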