Parthnuwal7 commited on
Commit
3d015cd
·
0 Parent(s):

Adding analytical content

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment
2
+ .env
3
+ .env.example
4
+ .env.local
5
+
6
+ # Virtual environments
7
+ .venv/
8
+ venv/
9
+ env/
10
+
11
+ # Python cache
12
+ __pycache__/
13
+ *.py[cod]
14
+ *$py.class
15
+ *.pyo
16
+
17
+ # IDE
18
+ .idea/
19
+ .vscode/
20
+ *.swp
21
+
22
+ # OS
23
+ .DS_Store
24
+ Thumbs.db
25
+
26
+ # Cached centroids
27
+ *.npz
28
+ aspect_centroids.npz
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# Build tools are required to compile native wheels; apt lists are purged
# afterwards to keep the image slim.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies before copying the source tree so the pip layer is
# cached across code-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application code
COPY . .

# Writable directory the app uses for cached artifacts at runtime.
RUN mkdir -p /app/cache

# HuggingFace Spaces routes traffic to port 7860; unbuffered output makes
# container logs appear immediately.
ENV PORT=7860
ENV PYTHONUNBUFFERED=1

EXPOSE 7860

CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: FCT
3
+ emoji: 🦀
4
+ colorFrom: yellow
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ license: other
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Main Flask application for Analytics Module"""
2
+ from flask import Flask
3
+ from flask_cors import CORS
4
+ from config import Config
5
+
6
+ # Initialize Flask app
7
+ app = Flask(__name__)
8
+ app.config.from_object(Config)
9
+ CORS(app)
10
+
11
+ # Register blueprints
12
+ from routes.students import students_bp
13
+ from routes.scoring import scoring_bp
14
+ from routes.domain import domain_bp
15
+
16
+ app.register_blueprint(students_bp, url_prefix='/api/analytics')
17
+ app.register_blueprint(scoring_bp, url_prefix='/api/analytics')
18
+ app.register_blueprint(domain_bp, url_prefix='/api/analytics')
19
+
20
+ # Health check
21
+ @app.route('/health', methods=['GET'])
22
+ def health_check():
23
+ return {'status': 'healthy', 'service': 'analytics-api'}, 200
24
+
25
+ @app.route('/', methods=['GET'])
26
+ def home():
27
+ return {
28
+ 'service': 'Student Profiling & Employability Scoring API',
29
+ 'version': '1.0.0',
30
+ 'endpoints': {
31
+ 'students': '/api/analytics/students',
32
+ 'personality': '/api/analytics/personality/<student_id>',
33
+ 'text': '/api/analytics/text/<student_id>',
34
+ 'score': '/api/analytics/score/<student_id>',
35
+ 'leaderboard': '/api/analytics/leaderboard',
36
+ 'domain': {
37
+ 'available': '/api/analytics/domain/available',
38
+ 'submit': '/api/analytics/students/<student_id>/domain-evidence',
39
+ 'get': '/api/analytics/students/<student_id>/domain-evidence',
40
+ 'delete': '/api/analytics/students/<student_id>/domain-evidence/<domain_type>'
41
+ }
42
+ }
43
+ }
44
+
45
+ if __name__ == '__main__':
46
+ import os
47
+ port = int(os.getenv('PORT', 7860)) # HuggingFace uses 7860
48
+ debug = os.getenv('DEBUG', 'False').lower() == 'true'
49
+
50
+ print(f"🚀 Analytics API starting on port {port}")
51
+ print(f"📊 Scoring modules: Universal, Personality, Text, Domain (Tech/Business/Creative/Research)")
52
+ print(f"🔗 Base URL: http://0.0.0.0:{port}")
53
+ app.run(host='0.0.0.0', port=port, debug=debug)
aspect_seeds.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "leadership": [
3
+ "led a team",
4
+ "was team lead",
5
+ "managed a project",
6
+ "supervised interns",
7
+ "coordinated a cross-functional team",
8
+ "organized the club",
9
+ "president of the society",
10
+ "captain of the team",
11
+ "ran weekly standups",
12
+ "delegated tasks",
13
+ "mentored junior members",
14
+ "headed the project",
15
+ "oversaw project timelines",
16
+ "chaired the committee",
17
+ "led end-to-end delivery",
18
+ "directed project milestones",
19
+ "led a 5-person team",
20
+ "managed stakeholders",
21
+ "took ownership of the initiative",
22
+ "led code reviews",
23
+ "organized campus events",
24
+ "led product demo sessions",
25
+ "led recruitment for volunteers",
26
+ "managed vendor relationships",
27
+ "spearheaded the outreach program"
28
+ ],
29
+ "technical_skills": [
30
+ "developed a web API",
31
+ "implemented RESTful services",
32
+ "coded in python",
33
+ "built machine learning models",
34
+ "trained neural networks",
35
+ "implemented data pipelines",
36
+ "used pandas for ETL",
37
+ "designed database schemas",
38
+ "built microservices",
39
+ "deployed models using docker",
40
+ "worked with FastAPI",
41
+ "implemented CI/CD",
42
+ "wrote unit tests",
43
+ "optimized SQL queries",
44
+ "used scikit-learn",
45
+ "developed recommendation systems",
46
+ "built feature engineering pipelines",
47
+ "deployed to cloud",
48
+ "developed ETL jobs",
49
+ "worked with Kafka",
50
+ "implemented caching layers",
51
+ "used TensorFlow or PyTorch",
52
+ "built backend services",
53
+ "wrote production-grade code",
54
+ "integrated third-party APIs"
55
+ ],
56
+ "problem_solving": [
57
+ "solved complex problem",
58
+ "debugged production issues",
59
+ "optimized an algorithm",
60
+ "reduced latency of service",
61
+ "designed a scalable solution",
62
+ "investigated root cause",
63
+ "improved system reliability",
64
+ "created a novel solution",
65
+ "troubleshot integration issues",
66
+ "automated manual tasks",
67
+ "reduced memory usage",
68
+ "resolved data pipeline failures",
69
+ "refactored critical code",
70
+ "handled edge cases",
71
+ "iterated on prototypes",
72
+ "performed A/B testing to decide",
73
+ "diagnosed performance bottlenecks",
74
+ "designed fallback strategies",
75
+ "resolved deployment failures",
76
+ "created monitoring & alerts"
77
+ ],
78
+ "internships_experience": [
79
+ "summer internship",
80
+ "industrial training",
81
+ "interned at",
82
+ "worked as an intern",
83
+ "internship project",
84
+ "internship in data science",
85
+ "interned at a startup",
86
+ "completed internship at",
87
+ "interned with the engineering team",
88
+ "intern experience",
89
+ "interned at an e-commerce company",
90
+ "industrial internship",
91
+ "co-op placement",
92
+ "paid internship",
93
+ "research internship",
94
+ "interned as a software engineer",
95
+ "on-the-job training",
96
+ "worked under mentor",
97
+ "internship-driven project",
98
+ "corporate internship"
99
+ ],
100
+ "communication": [
101
+ "presented to stakeholders",
102
+ "gave a presentation",
103
+ "wrote documentation",
104
+ "authored reports",
105
+ "explained results to non-technical",
106
+ "public speaking",
107
+ "delivered demo",
108
+ "prepared slides",
109
+ "wrote user guides",
110
+ "communicated with clients",
111
+ "collaborated across teams",
112
+ "conducted knowledge transfer",
113
+ "wrote clear emails",
114
+ "explained technical concepts",
115
+ "presented project outcomes",
116
+ "led demo sessions",
117
+ "created onboarding docs",
118
+ "contributed to team discussions",
119
+ "led workshops",
120
+ "hosted training sessions"
121
+ ],
122
+ "teamwork": [
123
+ "collaborated with team",
124
+ "worked in a cross-functional team",
125
+ "paired programming",
126
+ "contributed to group project",
127
+ "supported teammates",
128
+ "collaborated on design",
129
+ "worked with designers and PMs",
130
+ "helped teammates debug",
131
+ "co-authored project",
132
+ "mentored peers",
133
+ "shared responsibilities",
134
+ "worked effectively in group",
135
+ "contributed in agile team",
136
+ "participated in sprints",
137
+ "assisted in integration"
138
+ ],
139
+ "project_execution": [
140
+ "delivered project on time",
141
+ "met project deadlines",
142
+ "managed milestones",
143
+ "handled project planning",
144
+ "released production features",
145
+ "coordinated deployment",
146
+ "delivered MVP",
147
+ "tracked KPIs",
148
+ "managed scope",
149
+ "created project timeline",
150
+ "ran retrospectives",
151
+ "managed feature rollout",
152
+ "ensured on-time delivery",
153
+ "performed release validations",
154
+ "deployed analytics dashboard",
155
+ "iterated based on feedback"
156
+ ],
157
+ "initiative": [
158
+ "initiated a project",
159
+ "proposed a new idea",
160
+ "took initiative",
161
+ "started a side project",
162
+ "built a proof of concept",
163
+ "started a campus chapter",
164
+ "created an automation",
165
+ "improved an existing process",
166
+ "volunteered to lead",
167
+ "identified improvement areas",
168
+ "launched a mini-product",
169
+ "ran a pilot program",
170
+ "created onboarding scripts",
171
+ "led process improvements",
172
+ "started a mentoring circle"
173
+ ],
174
+ "learning_agility": [
175
+ "quick learner",
176
+ "self-taught",
177
+ "learned new framework",
178
+ "picked up new language",
179
+ "adapted to new tech",
180
+ "completed online courses",
181
+ "upskilled via projects",
182
+ "transitioned domains",
183
+ "learned on the job",
184
+ "rapidly onboarded",
185
+ "attended workshops",
186
+ "completed bootcamp",
187
+ "took certification courses",
188
+ "learned through documentation",
189
+ "scaled knowledge quickly",
190
+ "adapted to changing scope"
191
+ ],
192
+ "career_alignment": [
193
+ "career goal is",
194
+ "aspire to become",
195
+ "interested in data science",
196
+ "pursue a role in product",
197
+ "long-term goal",
198
+ "want to specialize in",
199
+ "career objective",
200
+ "planning to pursue masters",
201
+ "aim to work in industry",
202
+ "seek product management roles",
203
+ "interested in research",
204
+ "want to join a startup",
205
+ "targeting roles in ML engineering",
206
+ "aiming for consulting roles",
207
+ "career path is focused on"
208
+ ]
209
+ }
config.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration for Analytics Module"""
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
class Config:
    """Central application configuration, resolved once at import time.

    Values come from the environment (loaded via dotenv at module import);
    each attribute falls back to a development default.
    """

    # Supabase connection.
    # NOTE(review): a real project URL is hard-coded as the fallback — prefer
    # requiring SUPABASE_URL from the environment rather than shipping it in
    # source; verify this URL is not sensitive.
    SUPABASE_URL = os.getenv('SUPABASE_URL', 'https://hbesjuifeodgqrptpkch.supabase.co')
    SUPABASE_KEY = os.getenv('SUPABASE_KEY', '')

    # Sentence-transformers model used for text embeddings.
    SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2'

    # Score-fusion weights; the three components are expected to sum to 1.0.
    UNIVERSAL_WEIGHT = 0.50
    PERSONALITY_WEIGHT = 0.25
    TEXT_WEIGHT = 0.25

    # Flask settings.
    # Fix: DEBUG was compared case-sensitively (== 'True'), so DEBUG=true in
    # the environment silently disabled debug mode. Parsed case-insensitively
    # now, consistent with app.py's `os.getenv('DEBUG', ...).lower() == 'true'`.
    DEBUG = os.getenv('DEBUG', 'True').lower() == 'true'
    PORT = int(os.getenv('PORT', 5001))
database/add_semester_columns.sql ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ -- Add missing semester columns to analytics_students table
2
+ -- Run this in Supabase SQL Editor
3
+
4
+ ALTER TABLE analytics_students
5
+ ADD COLUMN IF NOT EXISTS sgpa_sem1 REAL CHECK (sgpa_sem1 >= 0 AND sgpa_sem1 <= 10),
6
+ ADD COLUMN IF NOT EXISTS sgpa_sem2 REAL CHECK (sgpa_sem2 >= 0 AND sgpa_sem2 <= 10),
7
+ ADD COLUMN IF NOT EXISTS sgpa_sem3 REAL CHECK (sgpa_sem3 >= 0 AND sgpa_sem3 <= 10),
8
+ ADD COLUMN IF NOT EXISTS sgpa_sem7 REAL CHECK (sgpa_sem7 >= 0 AND sgpa_sem7 <= 10),
9
+ ADD COLUMN IF NOT EXISTS sgpa_sem8 REAL CHECK (sgpa_sem8 >= 0 AND sgpa_sem8 <= 10);
database/db.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """Database connection and utilities"""
2
+ from supabase import create_client, Client
3
+ from config import Config
4
+
5
+ # Initialize Supabase client
6
+ supabase: Client = create_client(Config.SUPABASE_URL, Config.SUPABASE_KEY)
7
+
8
+ def get_db():
9
+ """Get Supabase client instance"""
10
+ return supabase
database/migrate_domain_module.sql ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -- Domain Module Migration Script
2
+ -- Run this in Supabase SQL Editor to add domain-specific scoring support
3
+ -- Date: December 9, 2025
4
+
5
-- ============================================================================
-- STEP 1: Add domain fields to existing analytics_students table
-- ============================================================================

-- Fix: the original CHECK used `IN (..., NULL)`. With a NULL literal in the
-- list, the IN predicate evaluates to NULL (not FALSE) for any non-listed
-- value, and SQL treats a NULL CHECK result as satisfied — so the constraint
-- rejected nothing. NULL column values already pass a CHECK constraint, so
-- the NULL literal is simply dropped.
ALTER TABLE analytics_students
ADD COLUMN IF NOT EXISTS active_domain TEXT CHECK (active_domain IN ('tech', 'business', 'creative', 'research')),
ADD COLUMN IF NOT EXISTS domain_score REAL CHECK (domain_score >= 0 AND domain_score <= 1),
ADD COLUMN IF NOT EXISTS domain_confidence REAL CHECK (domain_confidence >= 0 AND domain_confidence <= 1);
13
+
14
+ -- ============================================================================
15
+ -- STEP 2: Create domain evidence table
16
+ -- ============================================================================
17
+
18
+ CREATE TABLE IF NOT EXISTS analytics_domain_evidence (
19
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
20
+ student_id TEXT REFERENCES analytics_students(student_id) ON DELETE CASCADE,
21
+ domain_type TEXT NOT NULL CHECK (domain_type IN ('tech', 'business', 'creative', 'research')),
22
+ evidence_data JSONB NOT NULL,
23
+ domain_score REAL CHECK (domain_score >= 0 AND domain_score <= 1),
24
+ domain_confidence REAL CHECK (domain_confidence >= 0 AND domain_confidence <= 1),
25
+ raw_features JSONB,
26
+ processing_status TEXT DEFAULT 'pending' CHECK (processing_status IN ('pending', 'processing', 'completed', 'failed')),
27
+ error_message TEXT,
28
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
29
+ updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
30
+ UNIQUE(student_id, domain_type)
31
+ );
32
+
33
+ -- ============================================================================
34
+ -- STEP 3: Create indexes for performance
35
+ -- ============================================================================
36
+
37
+ CREATE INDEX IF NOT EXISTS idx_domain_evidence_student ON analytics_domain_evidence(student_id);
38
+ CREATE INDEX IF NOT EXISTS idx_domain_evidence_type ON analytics_domain_evidence(domain_type);
39
+ CREATE INDEX IF NOT EXISTS idx_domain_evidence_status ON analytics_domain_evidence(processing_status);
40
+
41
+ -- ============================================================================
42
+ -- STEP 4: Enable Row Level Security
43
+ -- ============================================================================
44
+
45
+ ALTER TABLE analytics_domain_evidence ENABLE ROW LEVEL SECURITY;
46
+
47
+ -- ============================================================================
48
+ -- STEP 5: Create RLS Policies for domain evidence
49
+ -- ============================================================================
50
+
51
+ -- Users can view their own domain evidence
52
+ CREATE POLICY "Users can view own domain evidence"
53
+ ON analytics_domain_evidence FOR SELECT
54
+ TO authenticated
55
+ USING (student_id IN (SELECT student_id FROM analytics_students WHERE user_id = auth.uid()));
56
+
57
+ -- Users can insert their own domain evidence
58
+ CREATE POLICY "Users can insert own domain evidence"
59
+ ON analytics_domain_evidence FOR INSERT
60
+ TO authenticated
61
+ WITH CHECK (student_id IN (SELECT student_id FROM analytics_students WHERE user_id = auth.uid()));
62
+
63
+ -- Users can update their own domain evidence
64
+ CREATE POLICY "Users can update own domain evidence"
65
+ ON analytics_domain_evidence FOR UPDATE
66
+ TO authenticated
67
+ USING (student_id IN (SELECT student_id FROM analytics_students WHERE user_id = auth.uid()));
68
+
69
+ -- Users can delete their own domain evidence
70
+ CREATE POLICY "Users can delete own domain evidence"
71
+ ON analytics_domain_evidence FOR DELETE
72
+ TO authenticated
73
+ USING (student_id IN (SELECT student_id FROM analytics_students WHERE user_id = auth.uid()));
74
+
75
+ -- ============================================================================
76
+ -- STEP 6: Verification queries (run these to verify successful migration)
77
+ -- ============================================================================
78
+
79
+ -- Check if columns were added
80
+ SELECT column_name, data_type
81
+ FROM information_schema.columns
82
+ WHERE table_name = 'analytics_students'
83
+ AND column_name IN ('active_domain', 'domain_score', 'domain_confidence');
84
+
85
+ -- Check if table was created
86
+ SELECT table_name
87
+ FROM information_schema.tables
88
+ WHERE table_name = 'analytics_domain_evidence';
89
+
90
+ -- Check if indexes were created
91
+ SELECT indexname
92
+ FROM pg_indexes
93
+ WHERE tablename = 'analytics_domain_evidence';
94
+
95
+ -- Check if RLS policies were created
96
+ SELECT policyname
97
+ FROM pg_policies
98
+ WHERE tablename = 'analytics_domain_evidence';
99
+
100
+ -- ============================================================================
101
+ -- Migration Complete!
102
+ -- ============================================================================
103
+
104
+ -- Expected results:
105
+ -- ✓ 3 new columns in analytics_students table
106
+ -- ✓ 1 new table: analytics_domain_evidence
107
+ -- ✓ 3 new indexes
108
+ -- ✓ 4 new RLS policies
109
+
110
+ -- Next steps:
111
+ -- 1. Restart your Flask backend: python app.py
112
+ -- 2. Test domain submission via API or frontend form
113
+ -- 3. Verify score fusion includes domain component
database/migrate_to_text_fields.sql ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -- Migration: Replace numeric fields with text fields for extracurricular, certifications, and internships
2
+ -- Run this in Supabase SQL Editor
3
+
4
+ -- Add new text columns
5
+ ALTER TABLE analytics_students
6
+ ADD COLUMN IF NOT EXISTS extracurricular_text TEXT,
7
+ ADD COLUMN IF NOT EXISTS certifications_text TEXT,
8
+ ADD COLUMN IF NOT EXISTS internship_text TEXT;
9
+
10
+ -- Optional: Drop old numeric columns if you want to clean up
11
+ -- Uncomment these lines after verifying the new text fields work
12
+ -- ALTER TABLE analytics_students DROP COLUMN IF EXISTS extracurricular_count;
13
+ -- ALTER TABLE analytics_students DROP COLUMN IF EXISTS certifications_count;
14
+ -- ALTER TABLE analytics_students DROP COLUMN IF EXISTS internship_total_months;
15
+
16
+ -- Note: If you want to keep both old and new columns during transition,
17
+ -- you can skip dropping the old columns and they will coexist.
database/schema.sql ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -- Analytics Module Schema for Supabase
2
+ -- Run this in Supabase SQL Editor
3
+
4
-- 1. Students Table
CREATE TABLE IF NOT EXISTS analytics_students (
    student_id TEXT PRIMARY KEY,
    user_id UUID REFERENCES auth.users(id) ON DELETE CASCADE,
    cgpa REAL NOT NULL CHECK (cgpa >= 0 AND cgpa <= 10),
    sgpa_sem1 REAL CHECK (sgpa_sem1 >= 0 AND sgpa_sem1 <= 10),
    sgpa_sem2 REAL CHECK (sgpa_sem2 >= 0 AND sgpa_sem2 <= 10),
    sgpa_sem3 REAL CHECK (sgpa_sem3 >= 0 AND sgpa_sem3 <= 10),
    sgpa_sem4 REAL CHECK (sgpa_sem4 >= 0 AND sgpa_sem4 <= 10),
    sgpa_sem5 REAL CHECK (sgpa_sem5 >= 0 AND sgpa_sem5 <= 10),
    sgpa_sem6 REAL CHECK (sgpa_sem6 >= 0 AND sgpa_sem6 <= 10),
    sgpa_sem7 REAL CHECK (sgpa_sem7 >= 0 AND sgpa_sem7 <= 10),
    sgpa_sem8 REAL CHECK (sgpa_sem8 >= 0 AND sgpa_sem8 <= 10),
    tenth_pct REAL CHECK (tenth_pct >= 0 AND tenth_pct <= 100),
    twelfth_pct REAL CHECK (twelfth_pct >= 0 AND twelfth_pct <= 100),
    extracurricular_text TEXT,
    certifications_text TEXT,
    internship_text TEXT,
    -- Fix: the original constraint was `IN (..., NULL)`; the NULL literal
    -- makes the predicate evaluate to NULL for any invalid value, and a NULL
    -- CHECK result is treated as satisfied — so the constraint rejected
    -- nothing. NULL values already pass a CHECK, so the literal is removed.
    active_domain TEXT CHECK (active_domain IN ('tech', 'business', 'creative', 'research')),
    domain_score REAL CHECK (domain_score >= 0 AND domain_score <= 1),
    domain_confidence REAL CHECK (domain_confidence >= 0 AND domain_confidence <= 1),
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);
28
+
29
+ -- 2. Personality Responses Table
30
+ CREATE TABLE IF NOT EXISTS analytics_personality_responses (
31
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
32
+ student_id TEXT REFERENCES analytics_students(student_id) ON DELETE CASCADE,
33
+ p_q1 INTEGER CHECK (p_q1 >= 1 AND p_q1 <= 5),
34
+ p_q2 INTEGER CHECK (p_q2 >= 1 AND p_q2 <= 5),
35
+ p_q3 INTEGER CHECK (p_q3 >= 1 AND p_q3 <= 5),
36
+ p_q4 INTEGER CHECK (p_q4 >= 1 AND p_q4 <= 5),
37
+ p_q5 INTEGER CHECK (p_q5 >= 1 AND p_q5 <= 5),
38
+ p_q6 INTEGER CHECK (p_q6 >= 1 AND p_q6 <= 5),
39
+ p_q7 INTEGER CHECK (p_q7 >= 1 AND p_q7 <= 5),
40
+ p_q8 INTEGER CHECK (p_q8 >= 1 AND p_q8 <= 5),
41
+ p_q9 INTEGER CHECK (p_q9 >= 1 AND p_q9 <= 5),
42
+ p_q10 INTEGER CHECK (p_q10 >= 1 AND p_q10 <= 5),
43
+ p_q11 INTEGER CHECK (p_q11 >= 1 AND p_q11 <= 5),
44
+ p_q12 INTEGER CHECK (p_q12 >= 1 AND p_q12 <= 5),
45
+ p_q13 INTEGER CHECK (p_q13 >= 1 AND p_q13 <= 5),
46
+ p_q14 INTEGER CHECK (p_q14 >= 1 AND p_q14 <= 5),
47
+ p_q15 INTEGER CHECK (p_q15 >= 1 AND p_q15 <= 5),
48
+ p_q16 INTEGER CHECK (p_q16 >= 1 AND p_q16 <= 5),
49
+ p_q17 INTEGER CHECK (p_q17 >= 1 AND p_q17 <= 5),
50
+ p_q18 INTEGER CHECK (p_q18 >= 1 AND p_q18 <= 5),
51
+ p_q19 INTEGER CHECK (p_q19 >= 1 AND p_q19 <= 5),
52
+ p_q20 INTEGER CHECK (p_q20 >= 1 AND p_q20 <= 5),
53
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
54
+ UNIQUE(student_id)
55
+ );
56
+
57
+ -- 3. Text Responses Table
58
+ CREATE TABLE IF NOT EXISTS analytics_text_responses (
59
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
60
+ student_id TEXT REFERENCES analytics_students(student_id) ON DELETE CASCADE,
61
+ text_q1 TEXT NOT NULL, -- Strengths
62
+ text_q2 TEXT NOT NULL, -- Career interests
63
+ text_q3 TEXT NOT NULL, -- Extracurriculars + leadership
64
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
65
+ UNIQUE(student_id)
66
+ );
67
+
68
+ -- 4. Domain Evidence Table
69
+ CREATE TABLE IF NOT EXISTS analytics_domain_evidence (
70
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
71
+ student_id TEXT REFERENCES analytics_students(student_id) ON DELETE CASCADE,
72
+ domain_type TEXT NOT NULL CHECK (domain_type IN ('tech', 'business', 'creative', 'research')),
73
+ evidence_data JSONB NOT NULL, -- Flexible storage for domain-specific inputs
74
+ domain_score REAL CHECK (domain_score >= 0 AND domain_score <= 1),
75
+ domain_confidence REAL CHECK (domain_confidence >= 0 AND domain_confidence <= 1),
76
+ raw_features JSONB, -- Raw feature values for explainability
77
+ processing_status TEXT DEFAULT 'pending' CHECK (processing_status IN ('pending', 'processing', 'completed', 'failed')),
78
+ error_message TEXT,
79
+ created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
80
+ updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
81
+ UNIQUE(student_id, domain_type)
82
+ );
83
+
84
+ -- Indexes
85
+ CREATE INDEX IF NOT EXISTS idx_analytics_students_user_id ON analytics_students(user_id);
86
+ CREATE INDEX IF NOT EXISTS idx_personality_student ON analytics_personality_responses(student_id);
87
+ CREATE INDEX IF NOT EXISTS idx_text_student ON analytics_text_responses(student_id);
88
+ CREATE INDEX IF NOT EXISTS idx_domain_evidence_student ON analytics_domain_evidence(student_id);
89
+ CREATE INDEX IF NOT EXISTS idx_domain_evidence_type ON analytics_domain_evidence(domain_type);
90
+ CREATE INDEX IF NOT EXISTS idx_domain_evidence_status ON analytics_domain_evidence(processing_status);
91
+
92
+ -- RLS Policies
93
+ ALTER TABLE analytics_students ENABLE ROW LEVEL SECURITY;
94
+ ALTER TABLE analytics_personality_responses ENABLE ROW LEVEL SECURITY;
95
+ ALTER TABLE analytics_text_responses ENABLE ROW LEVEL SECURITY;
96
+
97
+ -- Students can view/update their own data
98
+ CREATE POLICY "Users can view own analytics data"
99
+ ON analytics_students FOR SELECT
100
+ TO authenticated
101
+ USING (user_id = auth.uid());
102
+
103
+ CREATE POLICY "Users can insert own analytics data"
104
+ ON analytics_students FOR INSERT
105
+ TO authenticated
106
+ WITH CHECK (user_id = auth.uid());
107
+
108
+ CREATE POLICY "Users can update own analytics data"
109
+ ON analytics_students FOR UPDATE
110
+ TO authenticated
111
+ USING (user_id = auth.uid());
112
+
113
+ -- Personality responses
114
+ CREATE POLICY "Users can view own personality responses"
115
+ ON analytics_personality_responses FOR SELECT
116
+ TO authenticated
117
+ USING (student_id IN (SELECT student_id FROM analytics_students WHERE user_id = auth.uid()));
118
+
119
+ CREATE POLICY "Users can insert own personality responses"
120
+ ON analytics_personality_responses FOR INSERT
121
+ -- Text responses
122
+ CREATE POLICY "Users can view own text responses"
123
+ ON analytics_text_responses FOR SELECT
124
+ TO authenticated
125
+ USING (student_id IN (SELECT student_id FROM analytics_students WHERE user_id = auth.uid()));
126
+
127
+ CREATE POLICY "Users can insert own text responses"
128
+ ON analytics_text_responses FOR INSERT
129
+ TO authenticated
130
+ WITH CHECK (student_id IN (SELECT student_id FROM analytics_students WHERE user_id = auth.uid()));
131
+
132
+ -- Domain evidence
133
+ ALTER TABLE analytics_domain_evidence ENABLE ROW LEVEL SECURITY;
134
+
135
+ CREATE POLICY "Users can view own domain evidence"
136
+ ON analytics_domain_evidence FOR SELECT
137
+ TO authenticated
138
+ USING (student_id IN (SELECT student_id FROM analytics_students WHERE user_id = auth.uid()));
139
+
140
+ CREATE POLICY "Users can insert own domain evidence"
141
+ ON analytics_domain_evidence FOR INSERT
142
+ TO authenticated
143
+ WITH CHECK (student_id IN (SELECT student_id FROM analytics_students WHERE user_id = auth.uid()));
144
+
145
+ CREATE POLICY "Users can update own domain evidence"
146
+ ON analytics_domain_evidence FOR UPDATE
147
+ TO authenticated
148
+ USING (student_id IN (SELECT student_id FROM analytics_students WHERE user_id = auth.uid()));
149
+ ON analytics_text_responses FOR INSERT
150
+ TO authenticated
151
+ WITH CHECK (student_id IN (SELECT student_id FROM analytics_students WHERE user_id = auth.uid()));
domains/data_science.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "domain_id": "data_science",
3
+ "display_name": "Data Science & Analytics",
4
+ "description": "Machine Learning, Data Analysis, AI Research, and Business Intelligence",
5
+ "core_skills": [
6
+ "python",
7
+ "r",
8
+ "sql",
9
+ "pandas",
10
+ "numpy",
11
+ "scikit_learn",
12
+ "tensorflow",
13
+ "pytorch",
14
+ "keras",
15
+ "xgboost",
16
+ "tableau",
17
+ "power_bi",
18
+ "matplotlib",
19
+ "seaborn",
20
+ "statistics",
21
+ "ab_testing",
22
+ "feature_engineering",
23
+ "spark",
24
+ "hadoop",
25
+ "airflow",
26
+ "dbt"
27
+ ],
28
+ "aspect_prototypes": {
29
+ "technical_skills": [
30
+ "built machine learning models using scikit-learn and XGBoost",
31
+ "developed deep learning pipelines with PyTorch",
32
+ "created ETL jobs using PySpark for big data processing",
33
+ "trained neural networks for image classification",
34
+ "implemented NLP models using transformers and BERT",
35
+ "designed feature engineering pipelines for ML",
36
+ "built recommendation systems using collaborative filtering",
37
+ "deployed ML models to production with MLflow",
38
+ "created interactive dashboards in Tableau",
39
+ "performed A/B testing with statistical significance analysis"
40
+ ],
41
+ "problem_solving": [
42
+ "improved model accuracy from 78% to 92% through feature engineering",
43
+ "reduced model training time by 60% using distributed computing",
44
+ "diagnosed and fixed data leakage in ML pipeline",
45
+ "optimized hyperparameters using Bayesian optimization",
46
+ "handled class imbalance with SMOTE and weighted sampling",
47
+ "debugged data quality issues affecting model performance",
48
+ "designed experiment to measure causal impact of recommendation",
49
+ "created automated anomaly detection system",
50
+ "resolved data drift issues in production models",
51
+ "built interpretable models for regulatory compliance"
52
+ ],
53
+ "leadership": [
54
+ "led data science team of 4 on personalization project",
55
+ "presented ML insights to C-level stakeholders",
56
+ "coordinated with engineering for model deployment",
57
+ "organized data science reading group in company",
58
+ "mentored junior analysts on SQL and Python",
59
+ "drove adoption of MLOps best practices",
60
+ "led cross-functional project with marketing team",
61
+ "managed data labeling team for annotation project",
62
+ "conducted training sessions on Pandas for analysts",
63
+ "championed experiment-driven decision making culture"
64
+ ],
65
+ "internship_experience": [
66
+ "data science intern at Flipkart building recommendation models",
67
+ "ML research intern at Google Brain working on NLP",
68
+ "analytics intern at McKinsey for retail optimization",
69
+ "AI intern at NVIDIA on computer vision projects",
70
+ "research intern at IISc on deep learning",
71
+ "data analyst intern at Zomato for demand forecasting",
72
+ "business intelligence intern at Amazon building dashboards",
73
+ "ML platform intern at Meta for model serving",
74
+ "quantitative research intern at Goldman Sachs",
75
+ "applied scientist intern at AWS on personalization"
76
+ ]
77
+ },
78
+ "industry_benchmarks": {
79
+ "min_employability_score": 0.65,
80
+ "expected_cgpa": 8.0,
81
+ "expected_internship_months": 4,
82
+ "critical_skills": [
83
+ "python",
84
+ "sql",
85
+ "statistics",
86
+ "ml_fundamentals"
87
+ ],
88
+ "nice_to_have_skills": [
89
+ "deep_learning",
90
+ "spark",
91
+ "mlops",
92
+ "cloud"
93
+ ]
94
+ },
95
+ "skill_gaps_mapping": {
96
+ "deep_learning": {
97
+ "demand_score": 0.80,
98
+ "courses": [
99
+ "Deep Learning Specialization",
100
+ "Fast.ai",
101
+ "Stanford CS231n"
102
+ ],
103
+ "certifications": [
104
+ "TensorFlow Developer",
105
+ "PyTorch Certified"
106
+ ]
107
+ },
108
+ "mlops": {
109
+ "demand_score": 0.75,
110
+ "courses": [
111
+ "MLOps Specialization",
112
+ "ML Engineering for Production"
113
+ ],
114
+ "certifications": [
115
+ "AWS ML Specialty",
116
+ "GCP ML Engineer"
117
+ ]
118
+ },
119
+ "statistics": {
120
+ "demand_score": 0.70,
121
+ "courses": [
122
+ "Statistics for Data Science",
123
+ "A/B Testing Masterclass"
124
+ ],
125
+ "certifications": []
126
+ },
127
+ "big_data": {
128
+ "demand_score": 0.65,
129
+ "courses": [
130
+ "Spark for Data Engineering",
131
+ "Databricks Academy"
132
+ ],
133
+ "certifications": [
134
+ "Databricks Certified",
135
+ "Cloudera CCA"
136
+ ]
137
+ }
138
+ },
139
+ "detection_keywords": [
140
+ "data science",
141
+ "machine learning",
142
+ "deep learning",
143
+ "ai",
144
+ "analytics",
145
+ "data analyst",
146
+ "ml engineer",
147
+ "research scientist",
148
+ "business intelligence",
149
+ "statistical modeling",
150
+ "predictive analytics",
151
+ "data mining"
152
+ ]
153
+ }
domains/mechanical_engineering.json ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "domain_id": "mechanical_engineering",
3
+ "display_name": "Mechanical Engineering",
4
+ "description": "Design, Manufacturing, Automotive, and Core Engineering",
5
+ "core_skills": [
6
+ "autocad",
7
+ "solidworks",
8
+ "catia",
9
+ "ansys",
10
+ "matlab",
11
+ "thermodynamics",
12
+ "fluid_mechanics",
13
+ "heat_transfer",
14
+ "manufacturing",
15
+ "cnc",
16
+ "3d_printing",
17
+ "gd_t",
18
+ "fea",
19
+ "cfd",
20
+ "product_design",
21
+ "quality_control"
22
+ ],
23
+ "aspect_prototypes": {
24
+ "technical_skills": [
25
+ "designed complex assemblies in SolidWorks and CATIA",
26
+ "performed FEA analysis using ANSYS for structural optimization",
27
+ "created CFD simulations for fluid flow optimization",
28
+ "developed CNC programs for precision machining",
29
+ "implemented GD&T for manufacturing tolerances",
30
+ "designed heat exchangers using thermal analysis",
31
+ "prototyped parts using 3D printing and rapid prototyping",
32
+ "conducted DFMEA for product reliability",
33
+ "created engineering drawings following ASME standards",
34
+ "optimized product design reducing weight by 20%"
35
+ ],
36
+ "problem_solving": [
37
+ "resolved vibration issue in rotating machinery",
38
+ "optimized manufacturing process reducing cycle time by 30%",
39
+ "diagnosed failure mode using root cause analysis",
40
+ "redesigned component eliminating stress concentration",
41
+ "improved product yield from 85% to 98% through quality control",
42
+ "solved thermal management problem in electronic enclosure",
43
+ "reduced material waste by 25% through lean manufacturing",
44
+ "fixed tolerance stack-up issue causing assembly problems",
45
+ "automated inspection process using machine vision",
46
+ "designed jig and fixture reducing setup time"
47
+ ],
48
+ "leadership": [
49
+ "led BAJA SAE team of 20 members as captain",
50
+ "managed product development project from concept to production",
51
+ "coordinated with suppliers for component sourcing",
52
+ "organized SAE chapter events with 200+ participants",
53
+ "mentored junior designers on CAD and simulation tools",
54
+ "led quality improvement initiative on production floor",
55
+ "managed cross-functional team for product launch",
56
+ "conducted design reviews with stakeholders",
57
+ "led vendor qualification and development program",
58
+ "organized technical workshops on new manufacturing methods"
59
+ ],
60
+ "internship_experience": [
61
+ "6 months design intern at Tata Motors in R&D division",
62
+ "summer internship at Mahindra on EV powertrain",
63
+ "manufacturing intern at L&T in heavy engineering",
64
+ "R&D intern at Bosch on automotive components",
65
+ "product design intern at Godrej appliances division",
66
+ "quality engineering intern at Maruti Suzuki",
67
+ "CAE analyst intern at TAFE for tractor design",
68
+ "tool design intern at Hero MotoCorp",
69
+ "thermal analysis intern at Thermax",
70
+ "research intern at IIT Madras on composite materials"
71
+ ]
72
+ },
73
+ "industry_benchmarks": {
74
+ "min_employability_score": 0.55,
75
+ "expected_cgpa": 7.0,
76
+ "expected_internship_months": 3,
77
+ "critical_skills": [
78
+ "cad",
79
+ "manufacturing_basics",
80
+ "engineering_drawing"
81
+ ],
82
+ "nice_to_have_skills": [
83
+ "fea",
84
+ "cfd",
85
+ "python",
86
+ "automation"
87
+ ]
88
+ },
89
+ "skill_gaps_mapping": {
90
+ "cae_simulation": {
91
+ "demand_score": 0.70,
92
+ "courses": [
93
+ "ANSYS Certification",
94
+ "CATIA V5 Mastery"
95
+ ],
96
+ "certifications": [
97
+ "CSWA",
98
+ "CSWP",
99
+ "ANSYS Certified"
100
+ ]
101
+ },
102
+ "ev_powertrain": {
103
+ "demand_score": 0.75,
104
+ "courses": [
105
+ "Electric Vehicle Technology",
106
+ "Battery Management Systems"
107
+ ],
108
+ "certifications": [
109
+ "EV Design Certification"
110
+ ]
111
+ },
112
+ "automation": {
113
+ "demand_score": 0.65,
114
+ "courses": [
115
+ "Industrial Automation",
116
+ "PLC Programming"
117
+ ],
118
+ "certifications": [
119
+ "Siemens TIA Portal",
120
+ "Allen Bradley"
121
+ ]
122
+ },
123
+ "industry_4_0": {
124
+ "demand_score": 0.60,
125
+ "courses": [
126
+ "IoT for Manufacturing",
127
+ "Digital Twin Technology"
128
+ ],
129
+ "certifications": [
130
+ "Industry 4.0 Certification"
131
+ ]
132
+ }
133
+ },
134
+ "detection_keywords": [
135
+ "mechanical",
136
+ "design engineer",
137
+ "manufacturing",
138
+ "automotive",
139
+ "product design",
140
+ "cad",
141
+ "solidworks",
142
+ "catia",
143
+ "ansys",
144
+ "thermodynamics",
145
+ "heat transfer",
146
+ "fluid mechanics"
147
+ ]
148
+ }
domains/software_engineering.json ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "domain_id": "software_engineering",
3
+ "display_name": "Software Engineering",
4
+ "description": "Backend, Frontend, Full-stack, DevOps, and general software development",
5
+ "core_skills": [
6
+ "python",
7
+ "java",
8
+ "javascript",
9
+ "typescript",
10
+ "golang",
11
+ "c++",
12
+ "react",
13
+ "nodejs",
14
+ "django",
15
+ "spring",
16
+ "fastapi",
17
+ "flask",
18
+ "sql",
19
+ "postgresql",
20
+ "mongodb",
21
+ "redis",
22
+ "docker",
23
+ "kubernetes",
24
+ "aws",
25
+ "gcp",
26
+ "azure",
27
+ "git",
28
+ "ci_cd",
29
+ "testing",
30
+ "system_design"
31
+ ],
32
+ "aspect_prototypes": {
33
+ "technical_skills": [
34
+ "developed RESTful APIs using FastAPI",
35
+ "built microservices architecture with Docker",
36
+ "implemented CI/CD pipelines for automated deployment",
37
+ "designed database schemas for high-traffic applications",
38
+ "wrote production-grade Python code with unit tests",
39
+ "deployed applications to AWS using ECS and Lambda",
40
+ "built React frontend with Redux state management",
41
+ "optimized SQL queries reducing latency by 50%",
42
+ "implemented caching layer with Redis",
43
+ "created data pipelines using Apache Airflow"
44
+ ],
45
+ "problem_solving": [
46
+ "debugged production outage affecting 10K users",
47
+ "optimized algorithm complexity from O(n²) to O(n log n)",
48
+ "resolved memory leak in long-running service",
49
+ "diagnosed and fixed race condition in concurrent code",
50
+ "designed fallback strategy for external API failures",
51
+ "reduced P95 latency from 500ms to 100ms",
52
+ "automated manual deployment reducing errors by 80%",
53
+ "created monitoring dashboards to detect issues early",
54
+ "refactored legacy codebase improving maintainability",
55
+ "implemented retry logic with exponential backoff"
56
+ ],
57
+ "leadership": [
58
+ "led a team of 5 engineers on product launch",
59
+ "managed sprint planning and backlog prioritization",
60
+ "conducted code reviews for junior developers",
61
+ "organized weekly tech talks for knowledge sharing",
62
+ "coordinated cross-team integration project",
63
+ "mentored 3 interns during summer program",
64
+ "drove architectural decisions for new microservice",
65
+ "led incident response during production outage",
66
+ "facilitated retrospectives improving team velocity",
67
+ "championed adoption of testing best practices"
68
+ ],
69
+ "internship_experience": [
70
+ "6 months SWE intern at Google building recommendation systems",
71
+ "summer internship at Microsoft on Azure DevOps team",
72
+ "3 months ML intern at startup developing NLP models",
73
+ "backend engineering intern at Stripe working on payments",
74
+ "full-stack intern at Flipkart building seller dashboard",
75
+ "DevOps intern at Infosys implementing CI/CD",
76
+ "research intern at IIT Bombay on distributed systems",
77
+ "mobile development intern at Zomato for Android app",
78
+ "data engineering intern at Razorpay building pipelines",
79
+ "platform intern at Amazon working on internal tools"
80
+ ]
81
+ },
82
+ "industry_benchmarks": {
83
+ "min_employability_score": 0.60,
84
+ "expected_cgpa": 7.5,
85
+ "expected_internship_months": 4,
86
+ "critical_skills": [
87
+ "python",
88
+ "sql",
89
+ "git",
90
+ "system_design"
91
+ ],
92
+ "nice_to_have_skills": [
93
+ "kubernetes",
94
+ "aws",
95
+ "redis",
96
+ "graphql"
97
+ ]
98
+ },
99
+ "skill_gaps_mapping": {
100
+ "cloud": {
101
+ "demand_score": 0.85,
102
+ "courses": [
103
+ "AWS Solutions Architect",
104
+ "GCP Professional",
105
+ "Azure Fundamentals"
106
+ ],
107
+ "certifications": [
108
+ "AWS SAA",
109
+ "GCP ACE",
110
+ "Azure AZ-900"
111
+ ]
112
+ },
113
+ "devops": {
114
+ "demand_score": 0.80,
115
+ "courses": [
116
+ "Docker Mastery",
117
+ "Kubernetes for Developers",
118
+ "CI/CD with GitHub Actions"
119
+ ],
120
+ "certifications": [
121
+ "CKA",
122
+ "Docker DCA",
123
+ "Jenkins Certified"
124
+ ]
125
+ },
126
+ "system_design": {
127
+ "demand_score": 0.75,
128
+ "courses": [
129
+ "Grokking System Design",
130
+ "Designing Data-Intensive Applications"
131
+ ],
132
+ "certifications": []
133
+ },
134
+ "dsa": {
135
+ "demand_score": 0.70,
136
+ "courses": [
137
+ "LeetCode Patterns",
138
+ "AlgoExpert",
139
+ "NeetCode 150"
140
+ ],
141
+ "certifications": []
142
+ }
143
+ },
144
+ "detection_keywords": [
145
+ "software",
146
+ "developer",
147
+ "engineer",
148
+ "backend",
149
+ "frontend",
150
+ "fullstack",
151
+ "web development",
152
+ "api",
153
+ "microservice",
154
+ "devops",
155
+ "sre",
156
+ "platform",
157
+ "coding",
158
+ "programming",
159
+ "python developer",
160
+ "java developer"
161
+ ]
162
+ }
models/personality_responses.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Personality responses model"""
2
+ from typing import Dict, List
3
+ from dataclasses import dataclass
4
+
5
@dataclass
class PersonalityResponses:
    """Likert-scale (1-5) answers to the 20 Big Five personality questions."""

    student_id: str
    # Maps question id ("p_q1" .. "p_q20") to a 1-5 Likert rating.
    responses: Dict[str, int]

    def to_dict(self):
        """Flatten into a single dict: student_id plus one key per answer."""
        return {'student_id': self.student_id, **self.responses}

    @staticmethod
    def get_questions() -> List[Dict[str, str]]:
        """Return 20 curated personality questions mapped to Big Five traits.

        A trait name suffixed with "_r" marks a reverse-scored item.
        """
        items = [
            # Openness (4 questions)
            ("p_q1", "I enjoy exploring new ideas and concepts", "openness"),
            ("p_q2", "I prefer routine over spontaneity", "openness_r"),
            ("p_q3", "I am curious about many different things", "openness"),
            ("p_q4", "I appreciate art and creative expression", "openness"),
            # Conscientiousness (4 questions)
            ("p_q5", "I am highly organized and plan ahead", "conscientiousness"),
            ("p_q6", "I often procrastinate on tasks", "conscientiousness_r"),
            ("p_q7", "I pay attention to details", "conscientiousness"),
            ("p_q8", "I complete tasks on time", "conscientiousness"),
            # Extraversion (4 questions)
            ("p_q9", "I enjoy being the center of attention", "extraversion"),
            ("p_q10", "I prefer working alone", "extraversion_r"),
            ("p_q11", "I make friends easily", "extraversion"),
            ("p_q12", "I am energized by social interactions", "extraversion"),
            # Agreeableness (4 questions)
            ("p_q13", "I am considerate of others' feelings", "agreeableness"),
            ("p_q14", "I prefer competition over collaboration", "agreeableness_r"),
            ("p_q15", "I trust people easily", "agreeableness"),
            ("p_q16", "I help others when they need it", "agreeableness"),
            # Emotional Stability (4 questions)
            ("p_q17", "I handle stress well", "stability"),
            ("p_q18", "I often feel anxious", "stability_r"),
            ("p_q19", "I remain calm under pressure", "stability"),
            ("p_q20", "I recover quickly from setbacks", "stability"),
        ]
        return [{"id": qid, "text": text, "trait": trait} for qid, text, trait in items]
models/student.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Student data model"""
2
+ from typing import Optional
3
+ from dataclasses import dataclass
4
+
5
@dataclass
class Student:
    """Academic profile for a single student.

    CGPA/SGPA fields appear to use a 10-point scale; ``tenth_pct`` and
    ``twelfth_pct`` are board-exam percentages (presumably 0-100 — confirm
    against callers).  The trailing free-text fields are optional
    self-reported descriptions consumed by downstream text scoring.
    """

    student_id: str
    user_id: Optional[str]
    cgpa: float
    sgpa_sem1: Optional[float]
    sgpa_sem2: Optional[float]
    sgpa_sem3: Optional[float]
    sgpa_sem4: Optional[float]
    sgpa_sem5: Optional[float]
    sgpa_sem6: Optional[float]
    sgpa_sem7: Optional[float]
    sgpa_sem8: Optional[float]
    tenth_pct: Optional[float]
    twelfth_pct: Optional[float]
    extracurricular_text: Optional[str] = None
    certifications_text: Optional[str] = None
    internship_text: Optional[str] = None

    def to_dict(self):
        """Return a plain dict of every field, keyed by field name.

        Uses ``dataclasses.asdict`` instead of listing each of the 16 fields
        by hand, so the mapping cannot drift out of sync when fields are
        added or renamed.
        """
        from dataclasses import asdict  # local: module top only imports `dataclass`
        return asdict(self)
models/text_responses.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text responses model"""
2
+ from typing import List, Dict
3
+ from dataclasses import dataclass
4
+
5
@dataclass
class TextResponses:
    """Free-text answers to the three open-ended profile questions."""

    student_id: str
    text_q1: str  # Strengths
    text_q2: str  # Career interests
    text_q3: str  # Extracurriculars + leadership

    def to_dict(self):
        """Serialize to a flat dict keyed by column name."""
        field_names = ('student_id', 'text_q1', 'text_q2', 'text_q3')
        return {name: getattr(self, name) for name in field_names}

    @staticmethod
    def get_questions() -> List[Dict[str, str]]:
        """Return the 3 textual questions"""
        prompts = [
            ("text_q1",
             "What are your key strengths and technical skills? (150-300 words)",
             "Describe your technical skills, soft skills, and what makes you stand out..."),
            ("text_q2",
             "What are your career interests and goals? (150-300 words)",
             "Describe your ideal career path, industries of interest, and long-term goals..."),
            ("text_q3",
             "Describe your extracurricular activities and leadership experiences. (150-300 words)",
             "Share your involvement in clubs, projects, leadership roles, and impact..."),
        ]
        return [
            {"id": qid, "text": text, "placeholder": hint}
            for qid, text, hint in prompts
        ]
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Flask==3.0.0
2
+ Flask-CORS==4.0.0
3
+ supabase==2.9.0
4
+ websockets>=15.0.1
5
+ sentence-transformers>=2.2.0
6
+ numpy>=1.24.0
7
+ pandas>=2.0.0
8
+ scikit-learn>=1.3.0
9
+ python-dotenv==1.0.0
10
+ gunicorn==21.2.0
11
+ torch>=2.0.0
12
+ transformers>=4.30.0
routes/domain.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Domain Evidence Routes
2
+
3
+ API endpoints for submitting and managing domain-specific evidence
4
+ """
5
+ from flask import Blueprint, request, jsonify
6
+ import logging
7
+ from database.db import get_db
8
+ from services.domain_plugins import DomainPluginFactory
9
+ from services.domain_plugins.tech_plugin import TechPlugin
10
+ from services.domain_plugins.business_plugin import BusinessPlugin
11
+ from services.domain_plugins.creative_plugin import CreativePlugin
12
+ from services.domain_plugins.research_plugin import ResearchPlugin
13
+
14
+ domain_bp = Blueprint('domain', __name__)
15
+ db = get_db()
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@domain_bp.route('/domain/available', methods=['GET'])
def list_available_domains():
    """List all available domain plugins with their detailed info."""
    try:
        available = DomainPluginFactory.list_available_domains()

        # Expand each domain id into its full info record, skipping any
        # domain the factory cannot describe (falsy info).
        domain_info = [
            info
            for info in (DomainPluginFactory.get_domain_info(d) for d in available)
            if info
        ]

        return jsonify({
            'success': True,
            'domains': domain_info
        }), 200

    except Exception as e:
        logger.error(f"Error listing domains: {e}")
        return jsonify({'error': str(e)}), 500
40
+
41
+
42
@domain_bp.route('/students/<student_id>/domain-evidence', methods=['POST'])
def submit_domain_evidence(student_id):
    """Submit domain-specific evidence for scoring.

    Validates the payload against the matching domain plugin, scores it,
    persists the evidence record, and caches the resulting score on the
    student row.  Returns 400 for invalid input, 404 for an unknown
    student, 201 on success.
    """
    # Initialized before the try so the error handler below can always
    # reference them — previously they could be unbound (NameError) if
    # parsing the request body itself raised.
    domain_type = None
    evidence_data = {}
    try:
        # `request.json` is None when no JSON body was sent; fall back to {}
        # so we reach the "invalid domain type" 400 instead of a 500.
        data = request.json or {}
        domain_type = data.get('domain_type')
        evidence_data = data.get('evidence_data', {})

        # Validate domain type
        if not DomainPluginFactory.is_domain_available(domain_type):
            return jsonify({
                'error': f'Invalid domain type: {domain_type}',
                'available_domains': DomainPluginFactory.list_available_domains()
            }), 400

        # Get plugin and let it validate the evidence payload it will score
        plugin = DomainPluginFactory.get_plugin(domain_type)
        is_valid, error_msg = plugin.validate_inputs(evidence_data)
        if not is_valid:
            return jsonify({'error': error_msg}), 400

        # Check if student exists
        student_check = db.table('analytics_students').select('student_id').eq('student_id', student_id).execute()
        if not student_check.data:
            return jsonify({'error': 'Student not found'}), 404

        # Score the evidence
        logger.info(f"Scoring {domain_type} evidence for student {student_id}")
        domain_score = plugin.score(evidence_data)

        # Store evidence and score
        evidence_record = {
            'student_id': student_id,
            'domain_type': domain_type,
            'evidence_data': evidence_data,
            'domain_score': domain_score.score,
            'domain_confidence': domain_score.confidence,
            'raw_features': domain_score.raw_features,
            'processing_status': 'completed'
        }
        db.table('analytics_domain_evidence').upsert(evidence_record).execute()

        # Update student's active domain and cached scores
        student_update = {
            'student_id': student_id,
            'active_domain': domain_type,
            'domain_score': domain_score.score,
            'domain_confidence': domain_score.confidence
        }
        db.table('analytics_students').upsert(student_update).execute()

        logger.info(f"Domain evidence submitted successfully: {domain_type} score = {domain_score.score:.3f}")

        return jsonify({
            'success': True,
            'domain_score': domain_score.to_dict(),
            'message': f'{domain_type.capitalize()} domain evidence processed successfully'
        }), 201

    except Exception as e:
        # logger.exception records the stack trace in the app log
        # (replaces traceback.print_exc() to stdout).
        logger.exception(f"Error submitting domain evidence: {e}")

        # Best-effort: record the failure in the evidence table.  Narrowed
        # from a bare `except:` so this secondary write can never swallow
        # KeyboardInterrupt/SystemExit, and the failure is now logged.
        try:
            db.table('analytics_domain_evidence').upsert({
                'student_id': student_id,
                'domain_type': domain_type,
                'evidence_data': evidence_data,
                'processing_status': 'failed',
                'error_message': str(e)
            }).execute()
        except Exception as store_err:
            logger.error(f"Could not store failed evidence record: {store_err}")

        return jsonify({'error': str(e)}), 500
124
+
125
+
126
@domain_bp.route('/students/<student_id>/domain-evidence', methods=['GET'])
def get_domain_evidence(student_id):
    """Get all domain evidence records for a student (200 even when empty)."""
    try:
        rows = db.table('analytics_domain_evidence').select('*').eq('student_id', student_id).execute()

        # An empty result is not an error here: respond 200 with an empty
        # list plus an explanatory message.
        payload = {'success': True, 'evidence': rows.data or []}
        if not rows.data:
            payload['message'] = 'No domain evidence found'
        return jsonify(payload), 200

    except Exception as e:
        logger.error(f"Error retrieving domain evidence: {e}")
        return jsonify({'error': str(e)}), 500
147
+
148
+
149
@domain_bp.route('/students/<student_id>/domain-evidence/<domain_type>', methods=['GET'])
def get_specific_domain_evidence(student_id, domain_type):
    """Get one domain's evidence record for a student; 404 when absent."""
    try:
        query = (
            db.table('analytics_domain_evidence')
            .select('*')
            .eq('student_id', student_id)
            .eq('domain_type', domain_type)
        )
        result = query.execute()

        if not result.data:
            return jsonify({
                'error': f'No {domain_type} evidence found for student {student_id}'
            }), 404

        # First row only — presumably one record per (student, domain);
        # confirm against the upsert in submit_domain_evidence.
        return jsonify({'success': True, 'evidence': result.data[0]}), 200

    except Exception as e:
        logger.error(f"Error retrieving domain evidence: {e}")
        return jsonify({'error': str(e)}), 500
172
+
173
+
174
@domain_bp.route('/students/<student_id>/domain-evidence/<domain_type>', methods=['DELETE'])
def delete_domain_evidence(student_id, domain_type):
    """Delete one domain's evidence and clear cached scores if it was active."""
    try:
        # Remove the evidence row itself.
        (db.table('analytics_domain_evidence')
            .delete()
            .eq('student_id', student_id)
            .eq('domain_type', domain_type)
            .execute())

        # If this domain was the student's active one, also clear the
        # cached domain score fields on the student row.
        profile = db.table('analytics_students').select('active_domain').eq('student_id', student_id).execute()
        was_active = profile.data and profile.data[0].get('active_domain') == domain_type
        if was_active:
            db.table('analytics_students').update({
                'active_domain': None,
                'domain_score': None,
                'domain_confidence': None
            }).eq('student_id', student_id).execute()

        return jsonify({
            'success': True,
            'message': f'{domain_type.capitalize()} evidence deleted'
        }), 200

    except Exception as e:
        logger.error(f"Error deleting domain evidence: {e}")
        return jsonify({'error': str(e)}), 500
routes/scoring.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Scoring routes"""
2
+ from flask import Blueprint, jsonify
3
+ from database.db import get_db
4
+ from services.universal_module import UniversalModule
5
+ from services.personality_module import PersonalityModule
6
+ from services.text_module import TextModule
7
+ from services.fusion import FusionEngine
8
+
9
+ scoring_bp = Blueprint('scoring', __name__)
10
+ db = get_db()
11
+
12
+ # Initialize modules
13
+ universal_module = UniversalModule()
14
+ personality_module = PersonalityModule()
15
+ text_module = TextModule()
16
+ fusion_engine = FusionEngine()
17
+
18
+
19
+ @scoring_bp.route('/score/<student_id>', methods=['GET'])
20
+ def get_student_score(student_id):
21
+ """
22
+ Compute and return full scoring packet for a student
23
+ """
24
+ try:
25
+ # 1. Fetch student data
26
+ student_result = db.table('analytics_students').select('*').eq('student_id', student_id).single().execute()
27
+ if not student_result.data:
28
+ return jsonify({'error': 'Student not found'}), 404
29
+
30
+ student_data = student_result.data
31
+
32
+ # 2. Fetch personality responses
33
+ personality_result = db.table('analytics_personality_responses').select('*').eq('student_id', student_id).maybe_single().execute()
34
+ personality_responses = {}
35
+ if personality_result.data:
36
+ personality_responses = {k: v for k, v in personality_result.data.items() if k.startswith('p_q')}
37
+
38
+ # 3. Fetch text responses
39
+ text_result = db.table('analytics_text_responses').select('*').eq('student_id', student_id).maybe_single().execute()
40
+ text_responses = {}
41
+ if text_result.data:
42
+ text_responses = {
43
+ 'text_q1': text_result.data.get('text_q1', ''),
44
+ 'text_q2': text_result.data.get('text_q2', ''),
45
+ 'text_q3': text_result.data.get('text_q3', '')
46
+ }
47
+
48
+ # 3.5. Fetch domain evidence (if exists)
49
+ domain_score = None
50
+ domain_confidence = None
51
+ domain_type = None
52
+ domain_features = {}
53
+
54
+ if student_data.get('active_domain'):
55
+ domain_type = student_data.get('active_domain')
56
+ domain_score = student_data.get('domain_score')
57
+ domain_confidence = student_data.get('domain_confidence')
58
+
59
+ # Fetch detailed domain evidence
60
+ domain_result = db.table('analytics_domain_evidence')\
61
+ .select('*')\
62
+ .eq('student_id', student_id)\
63
+ .eq('domain_type', domain_type)\
64
+ .maybe_single()\
65
+ .execute()
66
+
67
+ if domain_result.data:
68
+ domain_features = domain_result.data.get('raw_features', {})
69
+
70
+ # 4. Calculate universal score
71
+ universal_score, universal_confidence, universal_features = universal_module.score(student_data)
72
+ universal_explanations = universal_module.explain(universal_features)
73
+
74
+ # 5. Calculate personality score
75
+ personality_score, personality_confidence, personality_traits = personality_module.score(personality_responses)
76
+ personality_explanations = personality_module.explain(personality_traits)
77
+
78
+ # 6. Calculate text score
79
+ text_score, text_confidence, text_features = text_module.score(text_responses)
80
+ text_explanations = text_module.explain(text_features)
81
+
82
+ # 7. Fuse scores (with optional domain score)
83
+ final_score, breakdown = fusion_engine.fuse_scores(
84
+ universal_score, universal_confidence,
85
+ personality_score, personality_confidence,
86
+ text_score, text_confidence,
87
+ domain_score, domain_confidence
88
+ )
89
+
90
+ # 8. Get grade and percentile
91
+ grade = fusion_engine.get_grade(final_score)
92
+ percentile = fusion_engine.get_percentile(final_score)
93
+
94
+ # 9. Prepare response
95
+ response = {
96
+ 'student_id': student_id,
97
+ 'final_score': round(final_score, 4),
98
+ 'grade': grade,
99
+ 'percentile': percentile,
100
+ 'scores': breakdown,
101
+ 'explanations': {
102
+ 'universal': universal_explanations,
103
+ 'personality': personality_explanations,
104
+ 'text': text_explanations
105
+ },
106
+ 'detailed_features': {
107
+ 'universal': {k: round(v, 3) for k, v in universal_features.items()},
108
+ 'personality': {k: round(v, 3) for k, v in personality_traits.items()},
109
+ 'text': {k: round(v, 3) for k, v in text_features.items()}
110
+ },
111
+ 'data_completeness': {
112
+ 'universal': universal_confidence,
113
+ 'personality': personality_confidence,
114
+ 'text': text_confidence
115
+ }
116
+ }
117
+
118
+ # Add domain information if present
119
+ if domain_type:
120
+ response['domain_type'] = domain_type
121
+ response['detailed_features']['domain'] = {k: round(v, 3) for k, v in domain_features.items()}
122
+ response['data_completeness']['domain'] = domain_confidence
123
+ response['explanations']['domain'] = {
124
+ 'message': f'{domain_type.capitalize()} domain evidence provided',
125
+ 'features': domain_features
126
+ }
127
+ else:
128
+ response['domain_type'] = None
129
+ response['explanations']['domain'] = {
130
+ 'message': 'No domain-specific evidence submitted. Submit GitHub/portfolio/resume for enhanced scoring.'
131
+ }
132
+
133
+ return jsonify({
134
+ 'success': True,
135
+ 'data': response
136
+ }), 200
137
+
138
+ except Exception as e:
139
+ import traceback
140
+ traceback.print_exc()
141
+ return jsonify({'error': str(e)}), 500
142
+
143
+
144
@scoring_bp.route('/leaderboard', methods=['GET'])
def get_leaderboard():
    """
    Get top students by score (mock for MVP).

    Recomputes scores on the fly for up to 10 students; in production this
    would compute and cache scores.  Students whose scoring raises are
    skipped rather than failing the whole request.
    """
    try:
        # Fetch all students
        students = db.table('analytics_students').select('*').execute()

        leaderboard = []
        for student in students.data[:10]:  # Limit to 10 for MVP
            try:
                student_id = student['student_id']

                # Personality answers (p_q* Likert columns only)
                personality_result = db.table('analytics_personality_responses').select('*').eq('student_id', student_id).maybe_single().execute()
                personality_responses = {}
                if personality_result.data:
                    personality_responses = {k: v for k, v in personality_result.data.items() if k.startswith('p_q')}

                # Free-text answers
                text_result = db.table('analytics_text_responses').select('*').eq('student_id', student_id).maybe_single().execute()
                text_responses = {}
                if text_result.data:
                    text_responses = {
                        'text_q1': text_result.data.get('text_q1', ''),
                        'text_q2': text_result.data.get('text_q2', ''),
                        'text_q3': text_result.data.get('text_q3', '')
                    }

                # Calculate module scores and fuse (no domain inputs here)
                universal_score, universal_conf, _ = universal_module.score(student)
                personality_score, personality_conf, _ = personality_module.score(personality_responses)
                text_score, text_conf, _ = text_module.score(text_responses)

                final_score, _ = fusion_engine.fuse_scores(
                    universal_score, universal_conf,
                    personality_score, personality_conf,
                    text_score, text_conf
                )

                leaderboard.append({
                    'student_id': student_id,
                    'final_score': round(final_score, 3),
                    'grade': fusion_engine.get_grade(final_score)
                })
            except Exception:
                # Was a bare `except:` — narrowed so KeyboardInterrupt and
                # SystemExit are no longer swallowed; a bad row is skipped.
                continue

        # Sort by score, best first
        leaderboard.sort(key=lambda x: x['final_score'], reverse=True)

        return jsonify({
            'success': True,
            'data': leaderboard
        }), 200

    except Exception as e:
        return jsonify({'error': str(e)}), 500
+ return jsonify({'error': str(e)}), 500
routes/students.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Student management routes"""
2
+ from flask import Blueprint, request, jsonify
3
+ from database.db import get_db
4
+ from models.student import Student
5
+
6
+ students_bp = Blueprint('students', __name__)
7
+ db = get_db()
8
+
9
@students_bp.route('/students', methods=['POST'])
def create_student():
    """Create or update a student profile.

    Expects a JSON body with at least 'student_id' and 'cgpa'; all other
    Student fields are optional. The record is upserted into
    analytics_students, so resubmitting the same student_id updates the row.

    Returns:
        201 with the upserted row on success,
        400 if the body is missing/not JSON or required fields are absent,
        500 on unexpected errors.
    """
    try:
        # silent=True returns None instead of raising when the body is
        # absent or not valid JSON, so we can answer with a clean 400
        # rather than an opaque 500. (request.json raised a TypeError
        # downstream when the body was missing.)
        data = request.get_json(silent=True)
        if not data:
            return jsonify({'error': 'Request body must be a JSON object'}), 400
        print(f"Received data: {data}")  # Debug log

        # Validate required fields, reporting exactly which ones are missing
        required = ['student_id', 'cgpa']
        missing = [k for k in required if k not in data]
        if missing:
            return jsonify({'error': f"Missing required fields: {', '.join(missing)}"}), 400

        # Create student record
        student = Student(
            student_id=data['student_id'],
            user_id=data.get('user_id'),
            cgpa=data['cgpa'],
            sgpa_sem1=data.get('sgpa_sem1'),
            sgpa_sem2=data.get('sgpa_sem2'),
            sgpa_sem3=data.get('sgpa_sem3'),
            sgpa_sem4=data.get('sgpa_sem4'),
            sgpa_sem5=data.get('sgpa_sem5'),
            sgpa_sem6=data.get('sgpa_sem6'),
            sgpa_sem7=data.get('sgpa_sem7'),
            sgpa_sem8=data.get('sgpa_sem8'),
            tenth_pct=data.get('tenth_pct'),
            twelfth_pct=data.get('twelfth_pct'),
            extracurricular_text=data.get('extracurricular_text'),
            certifications_text=data.get('certifications_text'),
            internship_text=data.get('internship_text')
        )

        print(f"Student object created: {student.to_dict()}")  # Debug log

        # Insert or update into database (upsert)
        result = db.table('analytics_students').upsert(student.to_dict()).execute()

        print(f"Database result: {result}")  # Debug log

        return jsonify({
            'success': True,
            'data': result.data[0] if result.data else None
        }), 201

    except Exception as e:
        print(f"Error in create_student: {str(e)}")  # Debug log
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
58
+
59
+
60
@students_bp.route('/students', methods=['GET'])
def list_students():
    """Return every row of analytics_students.

    200 with {'success': True, 'data': [...]} normally; 500 with the
    error message if the database call fails.
    """
    try:
        rows = db.table('analytics_students').select('*').execute()
        return jsonify({'success': True, 'data': rows.data}), 200
    except Exception as exc:
        return jsonify({'error': str(exc)}), 500
71
+
72
+
73
@students_bp.route('/students/<student_id>', methods=['GET'])
def get_student(student_id):
    """Get a specific student by student_id.

    Uses maybe_single() — matching the existing leaderboard code — so a
    missing row yields an empty result and a proper 404. The previous
    .single() call raised inside the client when no row matched, which
    surfaced as a 500 and made the 404 branch unreachable.

    Returns:
        200 with the student row, 404 if not found, 500 on errors.
    """
    try:
        result = db.table('analytics_students').select('*').eq('student_id', student_id).maybe_single().execute()

        # Some client versions return None (rather than an empty result)
        # for zero rows, so guard both shapes.
        if not result or not result.data:
            return jsonify({'error': 'Student not found'}), 404

        return jsonify({
            'success': True,
            'data': result.data
        }), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
88
+
89
+
90
@students_bp.route('/personality/<student_id>', methods=['POST'])
def submit_personality(student_id):
    """Submit personality responses (p_q1..p_q20) for a student.

    Only the 20 known p_q* keys are read from the body; anything else
    is ignored. The row is upserted, so resubmission overwrites.

    Returns:
        201 with the upserted row, 400 for a missing/invalid JSON body,
        404 if the student does not exist, 500 on unexpected errors.
    """
    try:
        # silent=True -> None instead of raising on a missing/invalid body
        data = request.get_json(silent=True)
        if not data:
            return jsonify({'error': 'Request body must be a JSON object'}), 400

        # Validate student exists. maybe_single() yields an empty result
        # for a missing row instead of raising (which surfaced as 500),
        # so the 404 branch below is actually reachable.
        student = db.table('analytics_students').select('student_id').eq('student_id', student_id).maybe_single().execute()
        if not student or not student.data:
            return jsonify({'error': 'Student not found'}), 404

        # Prepare personality data: keep only known p_q1..p_q20 keys
        personality_data = {'student_id': student_id}
        for i in range(1, 21):
            key = f'p_q{i}'
            if key in data:
                personality_data[key] = data[key]

        # Insert or update
        result = db.table('analytics_personality_responses').upsert(personality_data).execute()

        return jsonify({
            'success': True,
            'data': result.data[0] if result.data else None
        }), 201

    except Exception as e:
        return jsonify({'error': str(e)}), 500
118
+
119
+
120
@students_bp.route('/text/<student_id>', methods=['POST'])
def submit_text(student_id):
    """Submit the three free-text responses for a student.

    Requires text_q1, text_q2 and text_q3 in the JSON body. The row is
    upserted, so resubmission overwrites previous answers.

    Returns:
        201 with the upserted row, 400 for a missing/invalid body or
        missing text fields, 404 if the student does not exist,
        500 on unexpected errors.
    """
    try:
        # silent=True -> None instead of raising on a missing/invalid body
        data = request.get_json(silent=True)
        if not data:
            return jsonify({'error': 'Request body must be a JSON object'}), 400

        # Validate student exists. maybe_single() yields an empty result
        # for a missing row instead of raising (which surfaced as 500),
        # so the 404 branch below is actually reachable.
        student = db.table('analytics_students').select('student_id').eq('student_id', student_id).maybe_single().execute()
        if not student or not student.data:
            return jsonify({'error': 'Student not found'}), 404

        # Validate required text fields, naming exactly which are missing
        required = ['text_q1', 'text_q2', 'text_q3']
        missing = [k for k in required if k not in data]
        if missing:
            return jsonify({'error': f"Missing required text fields: {', '.join(missing)}"}), 400

        # Prepare text data
        text_data = {
            'student_id': student_id,
            'text_q1': data['text_q1'],
            'text_q2': data['text_q2'],
            'text_q3': data['text_q3']
        }

        # Insert or update
        result = db.table('analytics_text_responses').upsert(text_data).execute()

        return jsonify({
            'success': True,
            'data': result.data[0] if result.data else None
        }), 201

    except Exception as e:
        return jsonify({'error': str(e)}), 500
154
+
155
+
156
@students_bp.route('/questions/personality', methods=['GET'])
def get_personality_questions():
    """Return the 20 personality questions used by the survey."""
    from models.personality_responses import PersonalityResponses
    payload = {'success': True, 'data': PersonalityResponses.get_questions()}
    return jsonify(payload), 200
164
+
165
+
166
@students_bp.route('/questions/text', methods=['GET'])
def get_text_questions():
    """Return the 3 free-text questions used by the survey."""
    from models.text_responses import TextResponses
    payload = {'success': True, 'data': TextResponses.get_questions()}
    return jsonify(payload), 200
services/README_text_v2.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text Module V2 - Aspect-Based Scoring
2
+
3
+ ## Overview
4
+ Enhanced text analysis using prototype-based aspect extraction with `all-mpnet-base-v2` embeddings.
5
+
6
+ ## Changes from V1
7
+ - **Model**: Upgraded from `all-MiniLM-L6-v2` (384d) to `all-mpnet-base-v2` (768d)
8
+ - **Approach**: Moved from simple reference embeddings to aspect-based prototype scoring
9
+ - **Aspects**: 10 employability aspects (leadership, technical_skills, problem_solving, etc.)
10
+ - **Admin**: Runtime seed updates via REST API
11
+
12
+ ## Configuration
13
+
14
+ ### Model Selection
15
+ Set via environment variable or constructor:
16
+ ```bash
17
+ export ASPECT_MODEL_NAME=all-mpnet-base-v2 # default
18
+ # or
19
+ export ASPECT_MODEL_NAME=all-MiniLM-L6-v2 # fallback
20
+ ```
21
+
22
+ ```python
23
+ from services.text_module_v2 import TextModuleV2
24
+
25
+ # Default (all-mpnet-base-v2)
26
+ text_module = TextModuleV2()
27
+
28
+ # Override model
29
+ text_module = TextModuleV2(model_name='all-MiniLM-L6-v2')
30
+ ```
31
+
32
+ ### Aspect Seeds
33
+ Seeds loaded from `./aspect_seeds.json` (created by default). Edit this file to customize aspect definitions.
34
+
35
+ **Location**: `analytics/backend/aspect_seeds.json`
36
+
37
+ ### Centroids Cache
38
+ Pre-computed centroids saved to `./aspect_centroids.npz` for fast cold starts.
39
+
40
+ ## Usage
41
+
42
+ ### Basic Scoring
43
+ ```python
44
+ text_module = TextModuleV2()
45
+
46
+ text_responses = {
47
+ 'text_q1': "I developed ML pipelines using Python and scikit-learn...",
48
+ 'text_q2': "My career goal is to become a data scientist...",
49
+ 'text_q3': "I led a team of 5 students in a hackathon project..."
50
+ }
51
+
52
+ score, confidence, features = text_module.score(text_responses)
53
+
54
+ print(f"Score: {score:.2f}, Confidence: {confidence:.2f}")
55
+ print(f"Features: {features}")
56
+ ```
57
+
58
+ ### Get Current Seeds
59
+ ```python
60
+ seeds = text_module.get_aspect_seeds()
61
+ print(f"Loaded {len(seeds)} aspects")
62
+ ```
63
+
64
+ ## Admin API
65
+
66
+ ### Setup
67
+ ```python
68
+ from flask import Flask
69
+ from services.text_module_v2 import TextModuleV2, register_admin_seed_endpoint
70
+
71
+ app = Flask(__name__)
72
+ text_module = TextModuleV2()
73
+
74
+ # Register admin endpoints
75
+ register_admin_seed_endpoint(app, text_module)
76
+
77
+ app.run(port=5001)
78
+ ```
79
+
80
+ Set admin token:
81
+ ```bash
82
+ export ADMIN_SEED_TOKEN=your-secret-token
83
+ ```
84
+
85
+ ### Endpoints
86
+
87
+ #### GET /admin/aspect-seeds
88
+ Get current loaded seeds.
89
+
90
+ **Request**:
91
+ ```bash
92
+ curl -H "X-Admin-Token: your-secret-token" \
93
+ http://localhost:5001/admin/aspect-seeds
94
+ ```
95
+
96
+ **Response**:
97
+ ```json
98
+ {
99
+ "success": true,
100
+ "seeds": {
101
+ "leadership": ["led a team", "managed project", ...],
102
+ "technical_skills": [...]
103
+ },
104
+ "num_aspects": 10
105
+ }
106
+ ```
107
+
108
+ #### POST /admin/aspect-seeds
109
+ Update aspect seeds (recomputes centroids).
110
+
111
+ **Request**:
112
+ ```bash
113
+ curl -X POST \
114
+ -H "X-Admin-Token: your-secret-token" \
115
+ -H "Content-Type: application/json" \
116
+ -d '{
117
+ "seeds": {
118
+ "leadership": [
119
+ "led a team",
120
+ "managed stakeholders",
121
+ "organized events"
122
+ ],
123
+ "technical_skills": [
124
+ "developed web API",
125
+ "built ML models"
126
+ ]
127
+ },
128
+ "persist": true
129
+ }' \
130
+ http://localhost:5001/admin/aspect-seeds
131
+ ```
132
+
133
+ **Response**:
134
+ ```json
135
+ {
136
+ "success": true,
137
+ "message": "Aspect seeds updated successfully",
138
+ "stats": {
139
+ "num_aspects": 2,
140
+ "avg_seed_count": 2.5,
141
+ "timestamp": "2025-12-09T10:30:00Z"
142
+ }
143
+ }
144
+ ```
145
+
146
+ ## Advanced: Seed Expansion
147
+
148
+ Suggest new seed phrases from a corpus:
149
+
150
+ ```python
151
+ corpus = [
152
+ "I led the product development team and managed stakeholders",
153
+ "Implemented CI/CD pipelines for automated testing",
154
+ # ... more texts
155
+ ]
156
+
157
+ suggestions = text_module.suggest_seed_expansions(
158
+ corpus_texts=corpus,
159
+ aspect_key='leadership',
160
+ top_n=20
161
+ )
162
+
163
+ print("Suggested seeds:", suggestions)
164
+ ```
165
+
166
+ ## Aspect → Question Mapping
167
+
168
+ ```python
169
+ from services.text_module_v2 import get_relevant_aspects_for_question
170
+
171
+ # Q1: Strengths & skills
172
+ aspects_q1 = get_relevant_aspects_for_question('text_q1')
173
+ # ['technical_skills', 'problem_solving', 'learning_agility', 'initiative', 'communication']
174
+
175
+ # Q2: Career interests
176
+ aspects_q2 = get_relevant_aspects_for_question('text_q2')
177
+ # ['career_alignment', 'learning_agility', 'initiative', 'communication']
178
+
179
+ # Q3: Extracurriculars & leadership
180
+ aspects_q3 = get_relevant_aspects_for_question('text_q3')
181
+ # ['leadership', 'teamwork', 'project_execution', 'internships_experience', 'communication']
182
+ ```
183
+
184
+ ## Files
185
+
186
+ | File | Purpose |
187
+ |------|---------|
188
+ | `services/text_module_v2.py` | Main module implementation |
189
+ | `aspect_seeds.json` | Aspect seed definitions (editable) |
190
+ | `aspect_centroids.npz` | Cached centroids (auto-generated) |
191
+
192
+ ## Performance
193
+
194
+ - **Model Load**: ~3s (first time)
195
+ - **Centroid Build**: ~1s for 10 aspects with 20 seeds each
196
+ - **Text Scoring**: ~200-500ms per 3-question set (CPU)
197
+
198
+ ## Logging
199
+
200
+ Module logs to Python's `logging` system:
201
+ ```python
202
+ import logging
203
+ logging.basicConfig(level=logging.INFO)
204
+ ```
205
+
206
+ Key events logged:
207
+ - Model loading
208
+ - Seed updates (with masked token)
209
+ - Centroid recomputation
210
+ - File I/O operations
services/batch_aggregation.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Batch Aggregation Service - College-level macro analysis
3
+ Aggregates individual student scores into batch-level reports
4
+ """
5
+ import logging
6
+ import numpy as np
7
+ from typing import Dict, List, Any, Optional
8
+ from dataclasses import dataclass, asdict
9
+ from datetime import datetime
10
+ from collections import Counter
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
@dataclass
class AggregateMetrics:
    """Batch-level aggregate metrics"""
    total_students: int  # number of student packets aggregated
    avg_employability_score: float  # mean final score (0-1 scale)
    median_score: float  # median final score (0-1 scale)
    std_dev: float  # standard deviation of final scores
    placement_ready_pct: float  # % with score >= 0.6
    skill_diversity_index: int  # Unique skills count
    avg_cgpa: float  # mean CGPA, denormalized back to a 0-10 scale
    avg_internship_months: float  # mean internship exposure in months
26
+
27
+
28
@dataclass
class AspectDistribution:
    """Distribution stats for an aspect"""
    aspect: str  # aspect key, e.g. 'leadership'
    avg: float
    std: float
    # NOTE: 'min'/'max' shadow the builtins only inside this class body
    min: float
    max: float
    top_10_pct_avg: float  # Avg of top 10%
    bottom_10_pct_avg: float  # Avg of bottom 10%
38
+
39
+
40
@dataclass
class DomainBreakdown:
    """Domain-wise student distribution"""
    domain_id: str  # raw domain key, e.g. 'machine_learning'
    display_name: str  # human-readable name derived from domain_id
    count: int  # students in this domain
    percentage: float  # share of the batch, 0-100
    avg_score: float  # mean final score within the domain
48
+
49
+
50
@dataclass
class SkillGap:
    """Skill gap analysis result"""
    skill: str  # skill key from the industry demand mapping
    demand_score: float  # relative industry demand weight, 0-1
    students_with_skill: int  # count of students reporting the skill
    students_pct: float  # coverage as a percentage of the batch
    gap_severity: str  # 'critical', 'moderate', 'low'
58
+
59
+
60
@dataclass
class BatchRecommendation:
    """Recommendation for batch improvement"""
    category: str  # 'curriculum', 'training', 'industry'
    priority: str  # 'high', 'medium', 'low'
    recommendation: str  # actionable suggestion text
    impact: str  # one-line justification / expected impact
67
+
68
+
69
class BatchAggregationService:
    """
    Aggregates individual student data into college-level macro reports
    """

    # Thresholds
    PLACEMENT_READY_THRESHOLD = 0.60  # final score needed to count as placement-ready
    CRITICAL_GAP_THRESHOLD = 0.30  # < 30% students have skill
    MODERATE_GAP_THRESHOLD = 0.50  # 30-50% coverage -> 'moderate' gap

    def __init__(self):
        # Industry demand mapping (can be loaded from external source)
        # Values are relative demand weights in [0, 1].
        self.industry_demands = {
            'python': 0.90,
            'sql': 0.85,
            'java': 0.80,
            'javascript': 0.75,
            'machine_learning': 0.70,
            'cloud': 0.85,
            'devops': 0.75,
            'data_analysis': 0.70,
            'system_design': 0.65,
            'communication': 0.80,
            'leadership': 0.60,
            'teamwork': 0.75
        }

    def aggregate_batch(self,
                        students: List[Dict[str, Any]],
                        college_name: str = "Unknown College",
                        batch_year: Optional[int] = None) -> Dict[str, Any]:
        """
        Generate comprehensive batch report from student data

        Args:
            students: List of student score packets (from scoring endpoint)
            college_name: Name of the college
            batch_year: Graduation year (defaults to the current year)

        Returns:
            Complete macro analysis report
        """
        if not students:
            return self._empty_report(college_name, batch_year)

        batch_year = batch_year or datetime.now().year

        # Extract scores and features
        scores = []
        cgpas = []
        internship_months = []
        all_skills = []
        domain_counts = Counter()
        aspect_scores = {
            'technical_skills': [],
            'problem_solving': [],
            'leadership': [],
            'communication': [],
            'teamwork': [],
            'learning_agility': []
        }

        for student in students:
            # Final score
            final_score = student.get('final_score', 0)
            scores.append(final_score)

            # Features
            # NOTE(review): assumes each packet carries a
            # 'detailed_features' dict with 'universal'/'text' sub-dicts
            # holding normalized 0-1 values — confirm against the scoring
            # endpoint's output shape.
            features = student.get('detailed_features', {})
            universal = features.get('universal', {})
            text = features.get('text', {})

            cgpas.append(universal.get('cgpa_norm', 0) * 10)  # Denormalize
            internship_months.append(universal.get('internship_exposure', 0) * 12)

            # Domain
            domain = student.get('domain_type') or student.get('detected_domain', 'general')
            domain_counts[domain] += 1

            # Aspect scores: prefer text-module values, fall back to universal
            for aspect in aspect_scores:
                if aspect in text:
                    aspect_scores[aspect].append(text[aspect])
                elif aspect in universal:
                    aspect_scores[aspect].append(universal[aspect])

            # Skills (from raw student data if available)
            if 'skills' in student:
                skills = student['skills']
                if isinstance(skills, str):
                    # Comma-separated string -> lowercase list
                    skills = [s.strip().lower() for s in skills.split(',')]
                all_skills.extend(skills)

        # Compute aggregates
        aggregate = self._compute_aggregate_metrics(
            scores, cgpas, internship_months, all_skills
        )

        # Aspect distributions
        aspects = self._compute_aspect_distributions(aspect_scores)

        # Domain breakdown
        domains = self._compute_domain_breakdown(domain_counts, students)

        # Skill gaps
        skill_gaps = self._analyze_skill_gaps(all_skills, len(students))

        # Recommendations
        recommendations = self._generate_recommendations(
            aggregate, aspects, skill_gaps
        )

        # Build report
        report = {
            'report_id': f"BATCH_{batch_year}_{college_name[:3].upper()}",
            'college_name': college_name,
            'batch_year': batch_year,
            # NOTE(review): datetime.utcnow() is naive and deprecated in
            # Python 3.12 — consider datetime.now(timezone.utc); keeping
            # as-is preserves the exact '...Z' output format.
            'generated_at': datetime.utcnow().isoformat() + 'Z',
            'total_students': len(students),

            'aggregate_metrics': asdict(aggregate),

            'score_distribution': self._compute_score_distribution(scores),

            'aspect_analysis': [asdict(a) for a in aspects],

            'domain_breakdown': [asdict(d) for d in domains],

            'skill_gap_analysis': [asdict(g) for g in skill_gaps],

            'recommendations': [asdict(r) for r in recommendations],

            'percentile_bands': self._compute_percentile_bands(scores)
        }

        return report

    def _compute_aggregate_metrics(self, scores, cgpas, internship_months,
                                   skills) -> AggregateMetrics:
        """Compute high-level aggregate metrics"""
        scores_arr = np.array(scores)

        placement_ready = sum(1 for s in scores if s >= self.PLACEMENT_READY_THRESHOLD)
        placement_pct = (placement_ready / len(scores)) * 100 if scores else 0

        return AggregateMetrics(
            total_students=len(scores),
            avg_employability_score=round(float(np.mean(scores_arr)), 3),
            median_score=round(float(np.median(scores_arr)), 3),
            std_dev=round(float(np.std(scores_arr)), 3),
            placement_ready_pct=round(placement_pct, 1),
            skill_diversity_index=len(set(skills)),
            avg_cgpa=round(float(np.mean(cgpas)) if cgpas else 0, 2),
            avg_internship_months=round(float(np.mean(internship_months)) if internship_months else 0, 1)
        )

    def _compute_aspect_distributions(self, aspect_scores) -> List[AspectDistribution]:
        """Compute distribution stats for each aspect"""
        distributions = []

        for aspect, scores in aspect_scores.items():
            if not scores:
                continue

            arr = np.array(scores)
            # Both indices are max(1, 10% of n): with fewer than 10 samples
            # the "top/bottom 10%" degenerates to the single best/worst value.
            top_10_idx = int(len(arr) * 0.1) or 1
            bottom_10_idx = int(len(arr) * 0.1) or 1

            sorted_arr = np.sort(arr)

            distributions.append(AspectDistribution(
                aspect=aspect,
                avg=round(float(np.mean(arr)), 3),
                std=round(float(np.std(arr)), 3),
                min=round(float(np.min(arr)), 3),
                max=round(float(np.max(arr)), 3),
                top_10_pct_avg=round(float(np.mean(sorted_arr[-top_10_idx:])), 3),
                bottom_10_pct_avg=round(float(np.mean(sorted_arr[:bottom_10_idx])), 3)
            ))

        return distributions

    def _compute_domain_breakdown(self, domain_counts, students) -> List[DomainBreakdown]:
        """Compute domain-wise breakdown"""
        breakdowns = []
        total = len(students)

        for domain, count in domain_counts.most_common():
            # Calculate avg score for this domain
            # (re-resolves each student's domain with the same fallback
            # chain used when counting, so the two stay consistent)
            domain_scores = [
                s.get('final_score', 0) for s in students
                if (s.get('domain_type') or s.get('detected_domain', 'general')) == domain
            ]
            avg_score = np.mean(domain_scores) if domain_scores else 0

            breakdowns.append(DomainBreakdown(
                domain_id=domain,
                display_name=domain.replace('_', ' ').title(),
                count=count,
                percentage=round((count / total) * 100, 1),
                avg_score=round(float(avg_score), 3)
            ))

        return breakdowns

    def _analyze_skill_gaps(self, all_skills, total_students) -> List[SkillGap]:
        """Analyze skill gaps against industry demand"""
        skill_counts = Counter(all_skills)
        gaps = []

        for skill, demand in self.industry_demands.items():
            count = skill_counts.get(skill, 0)
            pct = (count / total_students) * 100 if total_students else 0

            # Determine severity (thresholds are fractions; pct is 0-100)
            if pct < self.CRITICAL_GAP_THRESHOLD * 100:
                severity = 'critical'
            elif pct < self.MODERATE_GAP_THRESHOLD * 100:
                severity = 'moderate'
            else:
                severity = 'low'

            gaps.append(SkillGap(
                skill=skill,
                demand_score=demand,
                students_with_skill=count,
                students_pct=round(pct, 1),
                gap_severity=severity
            ))

        # Sort by demand * (1 - coverage): high-demand, low-coverage first
        gaps.sort(key=lambda g: g.demand_score * (1 - g.students_pct/100), reverse=True)

        return gaps[:10]  # Top 10 gaps

    def _generate_recommendations(self, aggregate, aspects,
                                  skill_gaps) -> List[BatchRecommendation]:
        """Generate actionable recommendations"""
        recommendations = []

        # Critical skill gaps (at most 3 curriculum items)
        critical_gaps = [g for g in skill_gaps if g.gap_severity == 'critical']
        for gap in critical_gaps[:3]:
            recommendations.append(BatchRecommendation(
                category='curriculum',
                priority='high',
                recommendation=f"Add {gap.skill.replace('_', ' ').title()} training to curriculum",
                impact=f"Only {gap.students_pct}% students have this in-demand skill"
            ))

        # Low placement readiness
        if aggregate.placement_ready_pct < 60:
            recommendations.append(BatchRecommendation(
                category='training',
                priority='high',
                recommendation="Implement intensive placement preparation program",
                impact=f"Only {aggregate.placement_ready_pct}% students are placement-ready"
            ))

        # Low internship exposure
        if aggregate.avg_internship_months < 3:
            recommendations.append(BatchRecommendation(
                category='industry',
                priority='medium',
                recommendation="Establish mandatory internship partnerships with industry",
                impact=f"Average internship exposure is only {aggregate.avg_internship_months} months"
            ))

        # Weak aspects (batch average below 0.5)
        for aspect in aspects:
            if aspect.avg < 0.5:
                recommendations.append(BatchRecommendation(
                    category='training',
                    priority='medium',
                    recommendation=f"Conduct workshops on {aspect.aspect.replace('_', ' ').title()}",
                    impact=f"Average score is {aspect.avg:.0%}, below acceptable threshold"
                ))

        return recommendations[:8]  # Limit to 8 recommendations

    def _compute_score_distribution(self, scores) -> Dict[str, int]:
        """Compute score distribution by grade bands"""
        distribution = {
            'A+ (90-100%)': 0,
            'A (80-90%)': 0,
            'B+ (70-80%)': 0,
            'B (60-70%)': 0,
            'C (50-60%)': 0,
            'D (<50%)': 0
        }

        for score in scores:
            pct = score * 100  # scores are 0-1; bands are in percent
            if pct >= 90:
                distribution['A+ (90-100%)'] += 1
            elif pct >= 80:
                distribution['A (80-90%)'] += 1
            elif pct >= 70:
                distribution['B+ (70-80%)'] += 1
            elif pct >= 60:
                distribution['B (60-70%)'] += 1
            elif pct >= 50:
                distribution['C (50-60%)'] += 1
            else:
                distribution['D (<50%)'] += 1

        return distribution

    def _compute_percentile_bands(self, scores) -> Dict[str, float]:
        """Compute percentile thresholds"""
        if not scores:
            return {}

        arr = np.array(scores)
        return {
            'p10': round(float(np.percentile(arr, 10)), 3),
            'p25': round(float(np.percentile(arr, 25)), 3),
            'p50': round(float(np.percentile(arr, 50)), 3),
            'p75': round(float(np.percentile(arr, 75)), 3),
            'p90': round(float(np.percentile(arr, 90)), 3)
        }

    def _empty_report(self, college_name: str, batch_year: int) -> Dict[str, Any]:
        """Generate empty report for no data"""
        return {
            'report_id': f"BATCH_{batch_year or 'UNKNOWN'}_{college_name[:3].upper()}",
            'college_name': college_name,
            'batch_year': batch_year,
            'generated_at': datetime.utcnow().isoformat() + 'Z',
            'total_students': 0,
            'error': 'No student data provided',
            'aggregate_metrics': None,
            'recommendations': []
        }
403
+
404
+
405
+ # Singleton
406
+ _batch_service: Optional[BatchAggregationService] = None
407
+
408
+
409
def get_batch_aggregation_service() -> BatchAggregationService:
    """Return the process-wide BatchAggregationService, creating it lazily.

    The instance is cached in the module-level ``_batch_service`` global,
    so repeated calls always hand back the same object.
    """
    global _batch_service
    if _batch_service is None:
        _batch_service = BatchAggregationService()
    return _batch_service
services/domain_knowledge_base.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Domain Knowledge Base - Dynamic domain-specific aspect prototypes and skill mapping
3
+ """
4
+ import os
5
+ import json
6
+ import logging
7
+ from typing import Dict, List, Optional, Tuple
8
+ from pathlib import Path
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class DomainConfig:
    """Configuration for a single domain, built from a parsed JSON dict."""

    def __init__(self, config_data: Dict):
        """Populate attributes from *config_data*, applying safe defaults."""
        read = config_data.get
        self.domain_id = read('domain_id', 'unknown')
        self.display_name = read('display_name', 'Unknown Domain')
        self.description = read('description', '')
        self.core_skills = read('core_skills', [])
        self.aspect_prototypes = read('aspect_prototypes', {})
        self.industry_benchmarks = read('industry_benchmarks', {})
        self.skill_gaps_mapping = read('skill_gaps_mapping', {})
        self.detection_keywords = read('detection_keywords', [])

    def get_aspect_seeds(self, aspect: str) -> List[str]:
        """Seed phrases for one aspect; empty list when the aspect is unknown."""
        return self.aspect_prototypes.get(aspect, [])

    def get_all_aspect_seeds(self) -> Dict[str, List[str]]:
        """Shallow copy of the full aspect -> seed-phrases mapping."""
        return dict(self.aspect_prototypes)

    def get_skill_gap_info(self, skill: str) -> Optional[Dict]:
        """Gap info (courses, certifications, demand) for *skill*, or None."""
        return self.skill_gaps_mapping.get(skill)

    def get_benchmark(self, key: str, default=None):
        """Industry benchmark value for *key*, falling back to *default*."""
        return self.industry_benchmarks.get(key, default)
41
+
42
+
43
class DomainKnowledgeBase:
    """
    Domain Knowledge Base - loads and manages domain configurations
    Provides domain-specific aspect prototypes for the Fidelity Criteria Transformer
    """

    def __init__(self, domains_dir: Optional[str] = None):
        """
        Initialize DKB with domain configs from directory

        Args:
            domains_dir: Path to directory containing domain JSON files
                         Defaults to ./domains/ relative to this file
        """
        if domains_dir is None:
            domains_dir = os.path.join(os.path.dirname(__file__), 'domains')

        self.domains_dir = Path(domains_dir)
        self.domains: Dict[str, DomainConfig] = {}
        self._keyword_index: Dict[str, str] = {}  # keyword -> domain_id

        self._load_all_domains()
        self._build_keyword_index()

        logger.info(f"DomainKnowledgeBase initialized with {len(self.domains)} domains")

    def _load_all_domains(self):
        """Load all domain configs from directory.

        A bad file is logged and skipped so one malformed JSON cannot
        prevent the remaining domains from loading.
        """
        if not self.domains_dir.exists():
            logger.warning(f"Domains directory not found: {self.domains_dir}")
            return

        for json_file in self.domains_dir.glob('*.json'):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    config_data = json.load(f)

                domain_config = DomainConfig(config_data)
                self.domains[domain_config.domain_id] = domain_config
                logger.info(f"Loaded domain config: {domain_config.display_name}")

            except Exception as e:
                logger.error(f"Failed to load domain config {json_file}: {e}")

    def _build_keyword_index(self):
        """Build keyword -> domain mapping for detection.

        Keys are lowercased; a keyword claimed by several domains keeps
        only the last one loaded (load order is filesystem glob order).
        """
        for domain_id, config in self.domains.items():
            for keyword in config.detection_keywords:
                self._keyword_index[keyword.lower()] = domain_id

    def get_domain(self, domain_id: str) -> Optional[DomainConfig]:
        """Get domain config by ID"""
        return self.domains.get(domain_id)

    def list_domains(self) -> List[str]:
        """List all available domain IDs"""
        return list(self.domains.keys())

    def detect_domain(self, text: str, skills: Optional[List[str]] = None) -> Tuple[str, float]:
        """
        Detect most likely domain from text and/or skills

        Scoring: +0.1 per detection keyword found in the text and +0.15
        per matching core skill, capped at 1.0 per domain.

        Args:
            text: Text content (career goals, descriptions, etc.)
            skills: List of skill keywords

        Returns:
            (domain_id, confidence) tuple; ('general', confidence) when
            nothing scores at least 0.2
        """
        if not text and not skills:
            return ('general', 0.0)

        text_lower = (text or '').lower()
        skills_lower = [s.lower() for s in (skills or [])]

        domain_scores = {}

        for domain_id, config in self.domains.items():
            score = 0.0

            # Keyword matching from text (substring match, case-insensitive)
            for keyword in config.detection_keywords:
                if keyword.lower() in text_lower:
                    score += 0.1

            # Skill matching (exact match against lowercased core skills)
            core_skills_lower = [s.lower() for s in config.core_skills]
            skill_matches = sum(1 for s in skills_lower if s in core_skills_lower)
            score += skill_matches * 0.15

            domain_scores[domain_id] = min(score, 1.0)

        if not domain_scores:
            return ('general', 0.0)

        # Return domain with highest score
        best_domain = max(domain_scores, key=domain_scores.get)
        confidence = domain_scores[best_domain]

        # Minimum confidence threshold
        if confidence < 0.2:
            return ('general', confidence)

        return (best_domain, confidence)

    def get_aspect_prototypes_for_domain(self, domain_id: str) -> Dict[str, List[str]]:
        """Get all aspect prototypes for a domain (empty dict if unknown)"""
        config = self.domains.get(domain_id)
        if config:
            return config.get_all_aspect_seeds()
        return {}

    def get_merged_prototypes(self, detected_domain: str,
                              base_aspects: Dict[str, List[str]]) -> Dict[str, List[str]]:
        """
        Merge domain-specific prototypes with base aspects
        Domain-specific seeds are added to base seeds

        Args:
            detected_domain: Domain ID from detection
            base_aspects: Base aspect seeds (from TextModuleV2 defaults)

        Returns:
            Merged aspect seeds dictionary (inputs are not mutated)
        """
        merged = {k: list(v) for k, v in base_aspects.items()}  # Deep copy

        domain_config = self.domains.get(detected_domain)
        if not domain_config:
            return merged

        # Merge domain-specific prototypes
        for aspect, seeds in domain_config.aspect_prototypes.items():
            if aspect in merged:
                # Prepend domain-specific seeds (higher priority)
                merged[aspect] = seeds + merged[aspect]
            else:
                merged[aspect] = seeds

        return merged

    def analyze_skill_gaps(self, student_skills: List[str],
                           domain_id: str) -> List[Dict]:
        """
        Analyze skill gaps for a student in a given domain

        Args:
            student_skills: List of skills the student has
            domain_id: Target domain

        Returns:
            List of skill gap objects with recommendations, sorted by
            descending demand score (empty if the domain is unknown)
        """
        config = self.domains.get(domain_id)
        if not config:
            return []

        student_skills_lower = [s.lower() for s in student_skills]
        gaps = []

        for skill, gap_info in config.skill_gaps_mapping.items():
            skill_lower = skill.lower()

            # Check if student has this skill
            # (bidirectional substring match so 'aws' matches 'aws cloud')
            has_skill = any(skill_lower in s or s in skill_lower
                            for s in student_skills_lower)

            if not has_skill:
                gaps.append({
                    'skill': skill,
                    'demand_score': gap_info.get('demand_score', 0.5),
                    'recommended_courses': gap_info.get('courses', []),
                    'certifications': gap_info.get('certifications', []),
                    'priority': 'high' if gap_info.get('demand_score', 0) > 0.7 else 'medium'
                })

        # Sort by demand score
        gaps.sort(key=lambda x: x['demand_score'], reverse=True)
        return gaps

    def get_domain_summary(self, domain_id: str) -> Optional[Dict]:
        """Get summary of a domain for reporting (None if unknown)"""
        config = self.domains.get(domain_id)
        if not config:
            return None

        return {
            'domain_id': config.domain_id,
            'display_name': config.display_name,
            'description': config.description,
            'core_skills_count': len(config.core_skills),
            'aspects_count': len(config.aspect_prototypes),
            'benchmarks': config.industry_benchmarks
        }
237
+
238
+
239
+ # Singleton instance
240
+ _dkb_instance: Optional[DomainKnowledgeBase] = None
241
+
242
+
243
def get_domain_knowledge_base(domains_dir: str = None) -> DomainKnowledgeBase:
    """Return the process-wide DomainKnowledgeBase, creating it on first use.

    The instance is cached in the module-level ``_dkb_instance`` global;
    ``domains_dir`` is only honored on the very first call, since later
    calls reuse the cached instance.
    """
    global _dkb_instance
    if _dkb_instance is None:
        _dkb_instance = DomainKnowledgeBase(domains_dir)
    return _dkb_instance
services/domain_plugins/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """Domain-Specific Plugins Module
2
+
3
+ Pluggable architecture for domain-specific scoring (Tech, Business, Creative, Research).
4
+ Each plugin returns domain_score, domain_confidence, and raw features.
5
+ """
6
+
7
+ from .base_plugin import BaseDomainPlugin
8
+ from .plugin_factory import DomainPluginFactory
9
+
10
+ __all__ = ['BaseDomainPlugin', 'DomainPluginFactory']
services/domain_plugins/base_plugin.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Base Domain Plugin Interface"""
2
+ from abc import ABC, abstractmethod
3
+ from typing import Dict, Tuple, List, Optional
4
+ from dataclasses import dataclass
5
+
6
@dataclass
class DomainScore:
    """Standardized domain scoring output."""
    domain_type: str           # e.g. 'tech', 'business'
    score: float               # normalized 0-1
    confidence: float          # normalized 0-1
    raw_features: Dict         # raw feature values for explainability
    processing_time_ms: float  # wall-clock scoring time

    def to_dict(self):
        """Serialize for API responses, rounding numeric fields."""
        payload = dict(
            domain_type=self.domain_type,
            score=round(self.score, 3),
            confidence=round(self.confidence, 3),
            raw_features=self.raw_features,
            processing_time_ms=round(self.processing_time_ms, 2),
        )
        return payload
23
+
24
+
25
class BaseDomainPlugin(ABC):
    """Abstract base class for all domain plugins.

    Subclasses declare a domain identifier, feature weights, and their
    required/optional input fields, and implement ``score()``. Input
    validation, explanation, and confidence estimation are shared here.
    """

    def __init__(self):
        # Cache subclass-provided metadata once at construction time.
        self.domain_type = self._get_domain_type()
        self.feature_weights = self._get_feature_weights()

    @abstractmethod
    def _get_domain_type(self) -> str:
        """Return domain identifier (e.g., 'tech', 'business')"""
        pass

    @abstractmethod
    def _get_feature_weights(self) -> Dict[str, float]:
        """Return feature name to weight mapping"""
        pass

    @abstractmethod
    def get_required_fields(self) -> List[str]:
        """Return list of required input fields for this domain"""
        pass

    @abstractmethod
    def get_optional_fields(self) -> List[str]:
        """Return list of optional input fields"""
        pass

    def validate_inputs(self, evidence_data: Dict) -> Tuple[bool, Optional[str]]:
        """
        Validate input data completeness.

        A required field counts as missing when absent OR falsy
        (empty string/list/0).

        Returns: (is_valid, error_message); error_message is None on success.
        """
        required = self.get_required_fields()
        missing = [f for f in required if not evidence_data.get(f)]

        if missing:
            return False, f"Missing required fields: {', '.join(missing)}"

        return True, None

    @abstractmethod
    def score(self, evidence_data: Dict) -> "DomainScore":
        """
        Main scoring method - must be implemented by each plugin

        Args:
            evidence_data: Dictionary containing domain-specific inputs

        Returns:
            DomainScore object with score, confidence, and features
        """
        pass

    def explain(self, features: Dict) -> Dict:
        """Generate human-readable explanation of scoring.

        Reports up to the top 3 features by value, keeping only those
        whose value exceeds 0.3. 'recommendations' is left empty here
        for subclasses to fill.
        """
        explanations = {
            'top_features': [],
            'recommendations': []
        }

        # Sort features by value, highest first.
        sorted_features = sorted(features.items(), key=lambda x: x[1], reverse=True)

        for feat, val in sorted_features[:3]:
            if val > 0.3:
                explanations['top_features'].append({
                    'feature': feat,
                    'value': round(val, 2),
                    'weight': self.feature_weights.get(feat, 0)
                })

        return explanations

    def calculate_confidence(self, evidence_data: Dict) -> float:
        """
        Calculate confidence based on data completeness.

        Required fields contribute up to 0.7 and optional fields up to
        0.3; a plugin declaring no required (or no optional) fields is
        granted that share in full.

        Returns: 0-1 confidence score
        """
        required_fields = self.get_required_fields()
        optional_fields = self.get_optional_fields()
        # (Removed an unused `total_fields` computation present before.)

        filled_required = sum(1 for f in required_fields if evidence_data.get(f))
        filled_optional = sum(1 for f in optional_fields if evidence_data.get(f))

        # Base confidence from required fields (70%)
        required_confidence = (filled_required / len(required_fields)) * 0.7 if required_fields else 0.7

        # Bonus from optional fields (30%)
        optional_confidence = (filled_optional / len(optional_fields)) * 0.3 if optional_fields else 0.3

        return min(required_confidence + optional_confidence, 1.0)
services/domain_plugins/business_plugin.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Business/Finance Domain Plugin
2
+
3
+ Scores business competency based on:
4
+ - Resume content (ATS-style keyword matching)
5
+ - Case study submission analysis
6
+ - Excel/analytical test scores
7
+ - Internship experience in business domains
8
+ """
9
+ import re
10
+ import time
11
+ import logging
12
+ from typing import Dict, List
13
+ from .base_plugin import BaseDomainPlugin, DomainScore
14
+ from .plugin_factory import register_plugin
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@register_plugin('business')
class BusinessPlugin(BaseDomainPlugin):
    """Business/Finance domain scoring plugin.

    Combines ATS-style resume keyword matching, internship relevance,
    case-study analysis, and an Excel test score into a weighted 0-1
    domain score.
    """

    def __init__(self):
        super().__init__()
        # Business-relevant keywords, grouped by sub-domain category.
        self.business_keywords = {
            'consulting': ['consulting', 'consultant', 'advisory', 'strategy', 'mckinsey', 'bain', 'bcg'],
            'finance': ['finance', 'banking', 'investment', 'equity', 'portfolio', 'analyst', 'goldman', 'morgan'],
            'analytics': ['data analysis', 'business intelligence', 'tableau', 'power bi', 'sql', 'excel'],
            'management': ['project management', 'product management', 'stakeholder', 'agile', 'scrum'],
            'sales': ['sales', 'business development', 'client acquisition', 'revenue', 'crm'],
            'operations': ['operations', 'supply chain', 'logistics', 'process improvement', 'lean', 'six sigma']
        }

    def _get_domain_type(self) -> str:
        return 'business'

    def _get_feature_weights(self) -> Dict[str, float]:
        # Weights sum to 1.0; resume-derived evidence dominates.
        return {
            'resume_keyword_score': 0.30,
            'internship_relevance': 0.25,
            'case_study_score': 0.20,
            'excel_test_score': 0.15,
            'business_depth': 0.10
        }

    def get_required_fields(self) -> List[str]:
        return ['resume_text']  # Resume text (extracted from PDF)

    def get_optional_fields(self) -> List[str]:
        return ['case_study_text', 'excel_test_score', 'internship_descriptions']

    def score(self, evidence_data: Dict) -> DomainScore:
        """Calculate business domain score.

        Args:
            evidence_data: dict with 'resume_text' (required) plus
                optional 'case_study_text', 'excel_test_score' (0-100),
                and 'internship_descriptions'.

        Returns:
            DomainScore with weighted 0-1 score and confidence.
        """
        start_time = time.time()
        features = {}

        # Resume-derived features.
        resume_text = evidence_data.get('resume_text', '')
        if resume_text:
            features['resume_keyword_score'] = self._analyze_resume_keywords(resume_text)
            features['internship_relevance'] = self._extract_internship_relevance(resume_text)
            features['business_depth'] = self._assess_business_depth(resume_text)
        else:
            features['resume_keyword_score'] = 0.0
            features['internship_relevance'] = 0.0
            features['business_depth'] = 0.0

        # Case study analysis.
        case_study = evidence_data.get('case_study_text', '')
        features['case_study_score'] = self._analyze_case_study(case_study) if case_study else 0.0

        # Excel test score: normalize 0-100 to 0-1. Fixed: negative
        # inputs previously produced a negative feature; now clamped.
        excel_score = evidence_data.get('excel_test_score', 0)
        features['excel_test_score'] = min(max(excel_score, 0) / 100, 1.0) if excel_score else 0.0

        # Weighted sum over all features (iterate items, not keys).
        score = sum(value * self.feature_weights[name] for name, value in features.items())

        # Confidence from data completeness.
        confidence = self.calculate_confidence(evidence_data)

        processing_time = (time.time() - start_time) * 1000

        return DomainScore(
            domain_type='business',
            score=min(score, 1.0),
            confidence=confidence,
            raw_features=features,
            processing_time_ms=processing_time
        )

    def _analyze_resume_keywords(self, resume_text: str) -> float:
        """
        ATS-style keyword matching for business roles.

        Each category's score is the fraction of its keywords found in
        the resume; category scores are combined with fixed weights.

        Returns: 0-1 score based on keyword density and relevance
        """
        text_lower = resume_text.lower()

        # Count keywords in each category.
        category_scores = {}
        for category, keywords in self.business_keywords.items():
            matches = sum(1 for kw in keywords if kw in text_lower)
            category_scores[category] = min(matches / len(keywords), 1.0)

        # Category weights sum to 1.0; core business skills weigh more.
        weights = {
            'consulting': 0.20,
            'finance': 0.20,
            'analytics': 0.20,
            'management': 0.15,
            'sales': 0.15,
            'operations': 0.10
        }

        score = sum(category_scores.get(cat, 0) * weight for cat, weight in weights.items())

        logger.info(f"Resume keyword score: {score:.2f} (categories: {category_scores})")
        return score

    def _extract_internship_relevance(self, resume_text: str) -> float:
        """
        Extract and score internship relevance to business.

        Note: re.findall yields the full match string for patterns
        without a capture group; both cases are handled below.

        Returns: 0-1 score based on business-related internships
        """
        text_lower = resume_text.lower()

        # Internship indicators.
        internship_patterns = [
            r'intern(?:ship)?\s+at\s+([^\n]+)',
            r'(?:summer|winter)\s+intern',
            r'([a-z\s]+)\s+intern'
        ]

        internship_mentions = []
        for pattern in internship_patterns:
            matches = re.findall(pattern, text_lower)
            internship_mentions.extend(matches)

        if not internship_mentions:
            return 0.0

        # 0.2 per business-category hit across the first five mentions.
        business_internship_score = 0.0
        for mention in internship_mentions[:5]:
            mention_text = mention if isinstance(mention, str) else ' '.join(mention)
            for category, keywords in self.business_keywords.items():
                if any(kw in mention_text for kw in keywords):
                    business_internship_score += 0.2

        score = min(business_internship_score, 1.0)
        logger.info(f"Internship relevance: {score:.2f}")
        return score

    def _assess_business_depth(self, resume_text: str) -> float:
        """
        Assess overall business knowledge depth.

        Returns: 0-1 score based on advanced business terms found
        (10 or more distinct terms saturates the score).
        """
        text_lower = resume_text.lower()

        # Advanced business terms.
        advanced_terms = [
            'financial modeling', 'valuation', 'dcf', 'market research',
            'competitive analysis', 'business plan', 'roi', 'kpi',
            'p&l', 'balance sheet', 'cash flow', 'stakeholder management',
            'go-to-market', 'pricing strategy', 'market segmentation'
        ]

        term_count = sum(1 for term in advanced_terms if term in text_lower)
        score = min(term_count / 10, 1.0)  # 10+ terms = max

        logger.info(f"Business depth score: {score:.2f} ({term_count} advanced terms)")
        return score

    def _analyze_case_study(self, case_study_text: str) -> float:
        """
        Analyze case study submission quality.

        Structure keywords contribute up to 0.4, analytical terms up to
        0.3, and length (a rough quality proxy) up to 0.3.

        Returns: 0-1 score based on structure and depth
        """
        if not case_study_text or len(case_study_text) < 100:
            return 0.0

        score = 0.0
        text_lower = case_study_text.lower()

        # Structure indicators.
        structure_keywords = ['problem', 'analysis', 'solution', 'recommendation', 'conclusion']
        structure_score = sum(0.1 for kw in structure_keywords if kw in text_lower)
        score += min(structure_score, 0.4)

        # Analytical depth.
        analytical_terms = ['data', 'metric', 'assumption', 'framework', 'hypothesis', 'evidence']
        analytical_score = sum(0.05 for term in analytical_terms if term in text_lower)
        score += min(analytical_score, 0.3)

        # Length (quality proxy).
        length_score = min(len(case_study_text) / 2000, 0.3)  # 2000+ chars = max
        score += length_score

        logger.info(f"Case study score: {score:.2f}")
        return min(score, 1.0)
services/domain_plugins/creative_plugin.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Creative/Design Domain Plugin
2
+
3
+ Scores creative competency based on:
4
+ - Portfolio links (Behance, Dribbble, personal site)
5
+ - Project diversity and quality
6
+ - Design tool proficiency
7
+ - Visual content analysis
8
+ """
9
+ import re
10
+ import time
11
+ import logging
12
+ import requests
13
+ from typing import Dict, List
14
+ from .base_plugin import BaseDomainPlugin, DomainScore
15
+ from .plugin_factory import register_plugin
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@register_plugin('creative')
class CreativePlugin(BaseDomainPlugin):
    """Creative/Design domain scoring plugin.

    Scores portfolio accessibility, design-platform presence, tool
    proficiency, and project diversity/description depth into a
    weighted 0-1 score. Makes network HEAD requests to verify URLs.
    """

    def __init__(self):
        super().__init__()
        # Design tools and platforms used for keyword matching.
        self.design_tools = [
            'figma', 'sketch', 'adobe xd', 'photoshop', 'illustrator',
            'after effects', 'premiere pro', 'blender', 'cinema 4d'
        ]
        self.portfolio_platforms = ['behance', 'dribbble', 'artstation', 'deviantart']

    def _get_domain_type(self) -> str:
        return 'creative'

    def _get_feature_weights(self) -> Dict[str, float]:
        # Weights sum to 1.0; the portfolio itself dominates.
        return {
            'portfolio_quality': 0.35,
            'project_diversity': 0.25,
            'tool_proficiency': 0.20,
            'platform_presence': 0.15,
            'description_depth': 0.05
        }

    def get_required_fields(self) -> List[str]:
        return ['portfolio_url']

    def get_optional_fields(self) -> List[str]:
        return ['behance_url', 'dribbble_url', 'design_tools_text', 'project_description']

    def score(self, evidence_data: Dict) -> DomainScore:
        """Calculate creative domain score.

        Args:
            evidence_data: dict with 'portfolio_url' (required) plus
                optional 'behance_url', 'dribbble_url',
                'design_tools_text', and 'project_description'.

        Returns:
            DomainScore with weighted 0-1 score and confidence.
        """
        start_time = time.time()
        features = {}

        # Portfolio analysis (network check).
        portfolio_url = evidence_data.get('portfolio_url', '')
        if portfolio_url:
            features['portfolio_quality'] = self._analyze_portfolio_quality(portfolio_url)
        else:
            features['portfolio_quality'] = 0.0

        # Platform presence.
        behance_url = evidence_data.get('behance_url', '')
        dribbble_url = evidence_data.get('dribbble_url', '')
        features['platform_presence'] = self._check_platform_presence(behance_url, dribbble_url)

        # Tool proficiency.
        tools_text = evidence_data.get('design_tools_text', '')
        features['tool_proficiency'] = self._assess_tool_proficiency(tools_text)

        # Project diversity and description depth.
        project_desc = evidence_data.get('project_description', '')
        features['project_diversity'] = self._assess_project_diversity(project_desc)
        features['description_depth'] = self._assess_description_depth(project_desc)

        # Weighted sum over all features.
        score = sum(value * self.feature_weights[name] for name, value in features.items())

        confidence = self.calculate_confidence(evidence_data)

        processing_time = (time.time() - start_time) * 1000

        return DomainScore(
            domain_type='creative',
            score=min(score, 1.0),
            confidence=confidence,
            raw_features=features,
            processing_time_ms=processing_time
        )

    def _analyze_portfolio_quality(self, portfolio_url: str) -> float:
        """
        Analyze portfolio website quality via a HEAD request.

        Returns: 0-1 score based on accessibility and professionalism;
        0.2 when the site is unreachable or returns a non-200 status.
        """
        try:
            if not portfolio_url.startswith(('http://', 'https://')):
                portfolio_url = 'https://' + portfolio_url

            response = requests.head(portfolio_url, timeout=5, allow_redirects=True)

            if response.status_code == 200:
                score = 0.6  # Base score for accessible portfolio

                # Bonus for professional platforms
                if any(platform in portfolio_url for platform in self.portfolio_platforms):
                    score += 0.2

                # Bonus for custom domain (not a known free host)
                if not any(free in portfolio_url for free in ['github.io', 'wixsite', 'wordpress.com']):
                    score += 0.2

                logger.info(f"Portfolio quality: {score:.2f}")
                return min(score, 1.0)
            else:
                return 0.2

        except Exception as e:
            logger.error(f"Error analyzing portfolio: {e}")
            return 0.2

    def _check_platform_presence(self, behance_url: str, dribbble_url: str) -> float:
        """
        Check presence on design platforms (Behance/Dribbble).

        Fixed: the two handlers below were bare ``except:`` clauses,
        which also swallowed KeyboardInterrupt/SystemExit; they now
        catch only requests' network/URL errors.

        Returns: 0-1 score based on platform profiles
        """
        score = 0.0

        # Behance presence
        if behance_url and 'behance.net' in behance_url:
            try:
                response = requests.head(behance_url, timeout=5, allow_redirects=True)
                if response.status_code == 200:
                    score += 0.5
            except requests.RequestException:
                score += 0.2  # Partial credit for providing URL

        # Dribbble presence
        if dribbble_url and 'dribbble.com' in dribbble_url:
            try:
                response = requests.head(dribbble_url, timeout=5, allow_redirects=True)
                if response.status_code == 200:
                    score += 0.5
            except requests.RequestException:
                score += 0.2

        logger.info(f"Platform presence: {score:.2f}")
        return min(score, 1.0)

    def _assess_tool_proficiency(self, tools_text: str) -> float:
        """
        Assess design tool proficiency from free text.

        Returns: 0-1 score based on tool mentions (5+ tools saturates),
        with a +0.2 bonus for professional tools (Adobe, Figma, Sketch).
        """
        if not tools_text:
            return 0.0

        text_lower = tools_text.lower()

        # Count tool mentions.
        tool_count = sum(1 for tool in self.design_tools if tool in text_lower)

        # Score based on tool diversity.
        score = min(tool_count / 5, 1.0)  # 5+ tools = max

        # Bonus for professional tools (Adobe, Figma).
        pro_tools = ['figma', 'adobe', 'sketch']
        if any(tool in text_lower for tool in pro_tools):
            score = min(score + 0.2, 1.0)

        logger.info(f"Tool proficiency: {score:.2f} ({tool_count} tools)")
        return score

    def _assess_project_diversity(self, project_desc: str) -> float:
        """
        Assess project type diversity.

        Returns: 0-1 score based on distinct project categories
        mentioned (6+ categories saturates).
        """
        if not project_desc:
            return 0.0

        text_lower = project_desc.lower()

        # Project type categories.
        project_types = [
            'ui design', 'ux design', 'branding', 'logo', 'illustration',
            'animation', '3d', 'web design', 'mobile app', 'poster',
            'packaging', 'typography', 'infographic', 'video editing'
        ]

        type_count = sum(1 for ptype in project_types if ptype in text_lower)
        score = min(type_count / 6, 1.0)  # 6+ types = max

        logger.info(f"Project diversity: {score:.2f} ({type_count} types)")
        return score

    def _assess_description_depth(self, project_desc: str) -> float:
        """
        Assess depth of project descriptions.

        Returns: 0-1 score proportional to length (1000+ chars = max;
        under 50 chars scores 0).
        """
        if not project_desc or len(project_desc) < 50:
            return 0.0

        score = min(len(project_desc) / 1000, 1.0)  # 1000+ chars = max

        logger.info(f"Description depth: {score:.2f}")
        return score
services/domain_plugins/plugin_factory.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Plugin Factory for Domain-Specific Scoring"""
2
+ import logging
3
+ from typing import Dict, Optional, List
4
+ from .base_plugin import BaseDomainPlugin
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class DomainPluginFactory:
    """Registry and factory for domain scoring plugins.

    Plugin classes are registered under a domain-type key and
    instantiated lazily, one singleton instance per domain.
    """

    _plugins: Dict[str, type] = {}
    _instances: Dict[str, BaseDomainPlugin] = {}  # Lazily-built singletons

    @classmethod
    def register(cls, domain_type: str, plugin_class: type):
        """Register a plugin class under *domain_type*."""
        if not issubclass(plugin_class, BaseDomainPlugin):
            raise TypeError(f"{plugin_class} must inherit from BaseDomainPlugin")

        cls._plugins[domain_type] = plugin_class
        logger.info(f"Registered domain plugin: {domain_type}")

    @classmethod
    def get_plugin(cls, domain_type: str) -> Optional[BaseDomainPlugin]:
        """Return the singleton plugin for *domain_type*, or None if unknown."""
        plugin_class = cls._plugins.get(domain_type)
        if plugin_class is None:
            logger.warning(f"Plugin not found: {domain_type}")
            return None

        # Build and cache the instance on first request.
        if domain_type not in cls._instances:
            cls._instances[domain_type] = plugin_class()
        return cls._instances[domain_type]

    @classmethod
    def list_available_domains(cls) -> List[str]:
        """All registered domain-type keys."""
        return list(cls._plugins.keys())

    @classmethod
    def is_domain_available(cls, domain_type: str) -> bool:
        """True when a plugin is registered for *domain_type*."""
        return domain_type in cls._plugins

    @classmethod
    def get_domain_info(cls, domain_type: str) -> Optional[Dict]:
        """Describe a registered plugin's fields and weights, or None."""
        plugin = cls.get_plugin(domain_type)
        if plugin is None:
            return None

        info = {
            'domain_type': plugin.domain_type,
            'required_fields': plugin.get_required_fields(),
            'optional_fields': plugin.get_optional_fields(),
            'feature_weights': plugin.feature_weights
        }
        return info

    @classmethod
    def clear_cache(cls):
        """Drop cached singleton instances (useful for testing)."""
        cls._instances.clear()
65
+
66
+
67
# Auto-registration helper decorator
def register_plugin(domain_type: str):
    """Class decorator registering a plugin with DomainPluginFactory."""
    def _wrap(plugin_class):
        DomainPluginFactory.register(domain_type, plugin_class)
        return plugin_class
    return _wrap
services/domain_plugins/research_plugin.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Research/Science Domain Plugin
2
+
3
+ Scores research competency based on:
4
+ - Publication record (papers, citations)
5
+ - Lab experience and duration
6
+ - Research project depth
7
+ - Thesis/dissertation summaries
8
+ """
9
+ import re
10
+ import time
11
+ import logging
12
+ from typing import Dict, List
13
+ from .base_plugin import BaseDomainPlugin, DomainScore
14
+ from .plugin_factory import register_plugin
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@register_plugin('research')
class ResearchPlugin(BaseDomainPlugin):
    """Research/Science domain scoring plugin.

    Aggregates publication record, lab experience, research-methodology
    depth, and thesis quality into a weighted 0-1 domain score.
    """

    def __init__(self):
        super().__init__()
        # Research indicators used by the keyword analyzers below.
        self.publication_venues = [
            'journal', 'conference', 'proceedings', 'ieee', 'acm',
            'springer', 'elsevier', 'nature', 'science', 'arxiv'
        ]
        self.research_methods = [
            'experiment', 'methodology', 'hypothesis', 'literature review',
            'data collection', 'statistical analysis', 'simulation', 'survey'
        ]

    def _get_domain_type(self) -> str:
        return 'research'

    def _get_feature_weights(self) -> Dict[str, float]:
        # Weights sum to 1.0; publications carry the most weight.
        return {
            'publication_score': 0.35,
            'lab_experience_score': 0.25,
            'research_depth_score': 0.25,
            'thesis_quality_score': 0.15
        }

    def get_required_fields(self) -> List[str]:
        return ['research_description']

    def get_optional_fields(self) -> List[str]:
        return ['publications_text', 'lab_experience_text', 'thesis_summary']

    def score(self, evidence_data: Dict) -> DomainScore:
        """Calculate research domain score."""
        started = time.time()

        # Each analyzer returns 0.0 for missing/short input, so the raw
        # (possibly empty) strings can be fed straight through.
        features = {
            'publication_score': self._analyze_publications(evidence_data.get('publications_text', '')),
            'lab_experience_score': self._analyze_lab_experience(evidence_data.get('lab_experience_text', '')),
            'research_depth_score': self._analyze_research_depth(evidence_data.get('research_description', '')),
            'thesis_quality_score': self._analyze_thesis(evidence_data.get('thesis_summary', '')),
        }

        weighted = sum(features[name] * self.feature_weights[name] for name in features)
        confidence = self.calculate_confidence(evidence_data)

        return DomainScore(
            domain_type='research',
            score=min(weighted, 1.0),
            confidence=confidence,
            raw_features=features,
            processing_time_ms=(time.time() - started) * 1000
        )

    def _analyze_publications(self, publications_text: str) -> float:
        """Score publication record from count and venue mentions (0-1)."""
        if not publications_text or len(publications_text) < 30:
            return 0.0

        lowered = publications_text.lower()

        # Estimate publication count: best-matching pattern among quoted
        # titles, numbered references, and year prefixes.
        patterns = (
            r'"([^"]+)"',   # Quoted titles
            r'\[\d+\]',     # Numbered references
            r'\d{4}\.\s',   # Year format (2023. Title...)
        )
        pub_count = max(len(re.findall(p, publications_text)) for p in patterns)

        total = min(pub_count / 5, 0.6)  # 5+ pubs caps at 0.6

        # Venue quality bonus.
        venue_count = len([v for v in self.publication_venues if v in lowered])
        total += min(venue_count / 3, 0.4)  # 3+ venues caps at 0.4

        logger.info(f"Publication score: {total:.2f} ({pub_count} pubs, {venue_count} venues)")
        return min(total, 1.0)

    def _analyze_lab_experience(self, lab_text: str) -> float:
        """Score lab experience from duration and quality markers (0-1)."""
        if not lab_text or len(lab_text) < 30:
            return 0.0

        lowered = lab_text.lower()

        # Longest stated duration, normalized to months.
        months = 0
        for pattern, factor in ((r'(\d+)\s*years?', 12), (r'(\d+)\s*months?', 1)):
            for hit in re.findall(pattern, lowered):
                months = max(months, int(hit) * factor)

        total = min(months / 12, 0.5)  # a full year caps the duration share

        # Lab quality indicators.
        quality_keywords = ['research lab', 'professor', 'phd', 'equipment', 'experiment', 'protocol']
        quality_count = len([kw for kw in quality_keywords if kw in lowered])
        total += min(quality_count / 4, 0.5)

        logger.info(f"Lab experience: {total:.2f} ({months} months)")
        return min(total, 1.0)

    def _analyze_research_depth(self, research_desc: str) -> float:
        """Score methodology sophistication of the description (0-1)."""
        if not research_desc or len(research_desc) < 50:
            return 0.0

        lowered = research_desc.lower()

        # Research method mentions.
        method_count = len([m for m in self.research_methods if m in lowered])
        total = min(method_count / 4, 0.5)

        # Technical depth indicators.
        technical_terms = [
            'algorithm', 'model', 'framework', 'dataset', 'validation',
            'baseline', 'benchmark', 'evaluation', 'metrics', 'results'
        ]
        tech_count = len([t for t in technical_terms if t in lowered])
        total += min(tech_count / 5, 0.3)

        # Length as a depth proxy.
        total += min(len(research_desc) / 1000, 0.2)

        logger.info(f"Research depth: {total:.2f}")
        return min(total, 1.0)

    def _analyze_thesis(self, thesis_text: str) -> float:
        """Score thesis/dissertation summary structure and rigor (0-1)."""
        if not thesis_text or len(thesis_text) < 100:
            return 0.0

        lowered = thesis_text.lower()

        # Thesis structure keywords.
        structure_keywords = [
            'abstract', 'introduction', 'methodology', 'results',
            'discussion', 'conclusion', 'references', 'chapter'
        ]
        structure_hits = len([kw for kw in structure_keywords if kw in lowered])
        total = min(structure_hits / 5, 0.5)

        # Academic rigor indicators.
        rigor_keywords = [
            'research question', 'objective', 'contribution', 'limitation',
            'future work', 'significance', 'novelty', 'finding'
        ]
        rigor_hits = len([kw for kw in rigor_keywords if kw in lowered])
        total += min(rigor_hits / 4, 0.3)

        # Length bonus.
        total += min(len(thesis_text) / 2000, 0.2)

        logger.info(f"Thesis quality: {total:.2f}")
        return min(total, 1.0)
services/domain_plugins/tech_plugin.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tech/CS Domain Plugin
2
+
3
+ Scores technical competency based on:
4
+ - GitHub activity (commits, repos, stars, descriptions)
5
+ - LeetCode profile (problems solved, ranking)
6
+ - Portfolio links (project depth analysis)
7
+ """
8
+ import re
9
+ import time
10
+ import logging
11
+ import requests
12
+ from typing import Dict, List
13
+ from .base_plugin import BaseDomainPlugin, DomainScore
14
+ from .plugin_factory import register_plugin
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@register_plugin('tech')
class TechPlugin(BaseDomainPlugin):
    """Technical domain scoring plugin.

    Combines GitHub profile activity, repository quality, commit recency,
    LeetCode presence (placeholder) and portfolio reachability into one
    weighted tech-competency score.

    NOTE(review): all GitHub requests are unauthenticated (60 req/hour per
    IP rate limit) — consider an auth token plus response caching before
    production use.
    """

    def _get_domain_type(self) -> str:
        """Domain identifier used by the plugin registry."""
        return 'tech'

    def _get_feature_weights(self) -> Dict[str, float]:
        """Per-feature weights (sum to 1.0); keys match features in score()."""
        return {
            'github_activity_score': 0.30,
            'github_repo_quality': 0.20,
            'leetcode_score': 0.25,
            'portfolio_depth': 0.15,
            'recent_activity': 0.10
        }

    def get_required_fields(self) -> List[str]:
        return ['github_url']  # At least GitHub is required

    def get_optional_fields(self) -> List[str]:
        return ['leetcode_handle', 'portfolio_url', 'linkedin_url']

    def score(self, evidence_data: Dict) -> DomainScore:
        """Calculate tech domain score.

        Args:
            evidence_data: dict that may contain 'github_url',
                'leetcode_handle' and 'portfolio_url'.

        Returns:
            DomainScore with the weighted score (capped at 1.0), confidence,
            raw per-feature values, and processing time in milliseconds.
        """
        start_time = time.time()
        features = {}

        # GitHub analysis — all three GitHub features default to 0 when absent
        github_url = evidence_data.get('github_url', '')
        if github_url:
            features['github_activity_score'] = self._analyze_github_activity(github_url)
            features['github_repo_quality'] = self._analyze_repo_quality(github_url)
            features['recent_activity'] = self._check_recent_commits(github_url)
        else:
            features['github_activity_score'] = 0.0
            features['github_repo_quality'] = 0.0
            features['recent_activity'] = 0.0

        # LeetCode analysis (placeholder heuristic — see _analyze_leetcode)
        leetcode_handle = evidence_data.get('leetcode_handle', '')
        features['leetcode_score'] = (
            self._analyze_leetcode(leetcode_handle) if leetcode_handle else 0.0
        )

        # Portfolio analysis (reachability check)
        portfolio_url = evidence_data.get('portfolio_url', '')
        features['portfolio_depth'] = (
            self._analyze_portfolio(portfolio_url) if portfolio_url else 0.0
        )

        # Weighted sum of all features
        score = sum(features[k] * self.feature_weights[k] for k in features)

        confidence = self.calculate_confidence(evidence_data)
        processing_time = (time.time() - start_time) * 1000

        return DomainScore(
            domain_type='tech',
            score=min(score, 1.0),
            confidence=confidence,
            raw_features=features,
            processing_time_ms=processing_time
        )

    def _analyze_github_activity(self, github_url: str) -> float:
        """Score profile-level GitHub activity.

        Returns 0-1 from public repo count, followers and follower+following
        engagement. Falls back to 0.3 on API errors so a transient network
        failure does not zero out a provided profile.
        """
        try:
            username = self._extract_github_username(github_url)
            if not username:
                return 0.0

            api_url = f"https://api.github.com/users/{username}"
            headers = {'Accept': 'application/vnd.github.v3+json'}

            response = requests.get(api_url, headers=headers, timeout=5)

            if response.status_code != 200:
                logger.warning(f"GitHub API error for {username}: {response.status_code}")
                return 0.3  # Fallback score if API fails

            data = response.json()

            public_repos = data.get('public_repos', 0)
            followers = data.get('followers', 0)
            following = data.get('following', 0)

            # Simple heuristic: each component saturates at its cap.
            repo_score = min(public_repos / 20, 1.0) * 0.5       # 20+ repos = max
            follower_score = min(followers / 50, 1.0) * 0.3      # 50+ followers = max
            engagement_score = min((followers + following) / 100, 1.0) * 0.2

            total_score = repo_score + follower_score + engagement_score

            logger.info(f"GitHub activity for {username}: {total_score:.2f}")
            return total_score

        except Exception as e:
            logger.error(f"Error analyzing GitHub activity: {e}")
            return 0.3  # Fallback score

    def _analyze_repo_quality(self, github_url: str) -> float:
        """Score quality of the user's most-starred repositories.

        FIX: the previous request used '?sort=stars', which is not a valid
        sort for the list-user-repos endpoint (only created/updated/pushed/
        full_name), so GitHub silently returned the default ordering. We now
        fetch up to 100 repos and rank by stargazers_count client-side.
        """
        try:
            username = self._extract_github_username(github_url)
            if not username:
                return 0.0

            api_url = f"https://api.github.com/users/{username}/repos?per_page=100"
            headers = {'Accept': 'application/vnd.github.v3+json'}

            response = requests.get(api_url, headers=headers, timeout=5)

            if response.status_code != 200:
                return 0.3

            repos = response.json()

            if not repos:
                return 0.0

            # Rank by stars locally, then analyze the top 5
            repos.sort(key=lambda r: r.get('stargazers_count', 0), reverse=True)
            top = repos[:5]

            total_stars = sum(r.get('stargazers_count', 0) for r in top)
            total_forks = sum(r.get('forks_count', 0) for r in top)
            has_descriptions = sum(1 for r in top if r.get('description'))
            # NOTE(review): has_wiki/has_pages are documentation proxies, not
            # actual READMEs (this endpoint does not expose README presence).
            has_docs = sum(1 for r in top if r.get('has_wiki') or r.get('has_pages'))

            star_score = min(total_stars / 50, 1.0) * 0.4   # 50+ stars = max
            fork_score = min(total_forks / 20, 1.0) * 0.2   # 20+ forks = max
            desc_score = (has_descriptions / 5) * 0.2
            docs_score = (has_docs / 5) * 0.2

            total_score = star_score + fork_score + desc_score + docs_score

            logger.info(f"GitHub repo quality for {username}: {total_score:.2f}")
            return total_score

        except Exception as e:
            logger.error(f"Error analyzing repo quality: {e}")
            return 0.3

    def _check_recent_commits(self, github_url: str) -> float:
        """Score recency of activity (push events within the last 90 days).

        FIX: GitHub timestamps are UTC ('...Z'); the previous code compared
        them against naive local datetime.now(), skewing the 90-day window by
        the machine's UTC offset. Both sides are now timezone-aware UTC.
        """
        try:
            username = self._extract_github_username(github_url)
            if not username:
                return 0.0

            api_url = f"https://api.github.com/users/{username}/events/public?per_page=30"
            headers = {'Accept': 'application/vnd.github.v3+json'}

            response = requests.get(api_url, headers=headers, timeout=5)

            if response.status_code != 200:
                return 0.5  # Neutral fallback

            events = response.json()

            from datetime import datetime, timedelta, timezone
            cutoff = datetime.now(timezone.utc) - timedelta(days=90)

            recent_commits = 0
            for event in events:
                if event.get('type') == 'PushEvent':
                    created_at = datetime.strptime(
                        event['created_at'], '%Y-%m-%dT%H:%M:%SZ'
                    ).replace(tzinfo=timezone.utc)
                    if created_at > cutoff:
                        recent_commits += 1

            # Score based on commit frequency
            score = min(recent_commits / 20, 1.0)  # 20+ commits in 90 days = max

            logger.info(f"Recent activity for {username}: {score:.2f} ({recent_commits} commits)")
            return score

        except Exception as e:
            logger.error(f"Error checking recent activity: {e}")
            return 0.5

    def _analyze_leetcode(self, leetcode_handle: str) -> float:
        """
        Analyze LeetCode profile.

        Returns: 0-1 score based on problems solved and ranking.

        Note: LeetCode has no official public API, so this is a heuristic
        placeholder. In production, consider an unofficial API or scraping
        with proper rate limiting.
        """
        try:
            # Placeholder: a provided handle earns a neutral score.
            logger.info(f"LeetCode handle provided: {leetcode_handle}")
            return 0.5  # Neutral score when handle exists

        except Exception as e:
            logger.error(f"Error analyzing LeetCode: {e}")
            return 0.0

    def _analyze_portfolio(self, portfolio_url: str) -> float:
        """
        Analyze portfolio website reachability.

        Returns 0-1: 0.7 for a reachable site, +0.3 bonus for a custom
        domain, 0.2 partial credit for an unreachable-but-provided URL.

        NOTE(review): some hosts reject HEAD requests; a GET fallback would
        reduce false negatives — confirm before relying on the 0.2 path.
        """
        try:
            # Basic URL normalization
            if not portfolio_url.startswith(('http://', 'https://')):
                portfolio_url = 'https://' + portfolio_url

            # Check if URL is accessible
            response = requests.head(portfolio_url, timeout=5, allow_redirects=True)

            if response.status_code == 200:
                # Portfolio exists and is accessible
                score = 0.7

                # Bonus for custom domain (not github.io, netlify.app, etc.)
                if not any(host in portfolio_url for host in ['github.io', 'netlify.app', 'vercel.app', 'repl.it']):
                    score += 0.3

                logger.info(f"Portfolio accessible: {portfolio_url} (score: {score})")
                return min(score, 1.0)
            else:
                logger.warning(f"Portfolio not accessible: {portfolio_url}")
                return 0.2  # Some credit for providing URL

        except Exception as e:
            logger.error(f"Error analyzing portfolio: {e}")
            return 0.2

    def _extract_github_username(self, github_url: str) -> str:
        """Extract username from a GitHub URL; '' when no match.

        Handles formats: https://github.com/username or github.com/username.
        """
        pattern = r'github\.com/([a-zA-Z0-9_-]+)'
        match = re.search(pattern, github_url)
        return match.group(1) if match else ''
services/fidelity_transformer.py ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fidelity Criteria Transformer - Enhanced aspect extraction with semantic validation
3
+ """
4
+ import os
5
+ import logging
6
+ import numpy as np
7
+ from typing import Dict, List, Tuple, Optional, Any
8
+ from dataclasses import dataclass
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ # Try importing sentence transformers
13
+ try:
14
+ from sentence_transformers import SentenceTransformer
15
+ HAS_SBERT = True
16
+ except ImportError:
17
+ HAS_SBERT = False
18
+ logger.warning("sentence-transformers not installed, using fallback")
19
+
20
+ # Try importing domain knowledge base
21
+ try:
22
+ from .domain_knowledge_base import get_domain_knowledge_base, DomainKnowledgeBase
23
+ HAS_DKB = True
24
+ except ImportError:
25
+ HAS_DKB = False
26
+ logger.warning("DomainKnowledgeBase not available")
27
+
28
+
29
@dataclass
class FidelityScore:
    """Fidelity assessment result produced by FidelityScorer.score()."""
    score: float      # 0-1 overall fidelity: 0.30*depth + 0.35*coherence + 0.35*coverage
    coherence: float  # 0-1 semantic coherence (sentence structure, vocabulary diversity)
    coverage: float   # 0-1 coverage of expected aspects (or strongest-aspect proxy)
    depth: float      # 0-1 content depth derived from word count
    issues: List[str]  # Human-readable problems detected (short text, generic phrases, ...)
37
+
38
+
39
@dataclass
class AspectExtractionResult:
    """Result of aspect extraction for a single text."""
    aspects: Dict[str, float]              # aspect name -> 0-1 similarity score
    chunk_evidence: Dict[str, List[str]]   # aspect name -> text chunks supporting it
    fidelity: FidelityScore                # quality assessment of the analyzed text
    detected_domain: str                   # domain id, or 'general' when undetected
    domain_confidence: float               # 0-1 confidence of the domain detection
47
+
48
+
49
class FidelityScorer:
    """
    Semantic fidelity scoring for text responses.

    Validates response quality against expected patterns: content depth
    (word count), coherence (sentence structure, vocabulary diversity,
    generic-phrase penalty) and aspect coverage.
    """

    # Generic/copy-paste patterns to detect
    GENERIC_PATTERNS = [
        "i am a hard worker",
        "i have good communication skills",
        "i am a team player",
        "i want to learn and grow",
        "i am passionate about",
        "looking for opportunities",
        "seeking challenging role"
    ]

    # Minimum thresholds
    MIN_WORD_COUNT = 30
    IDEAL_WORD_COUNT = 150
    MAX_WORD_COUNT = 500

    def __init__(self):
        self.generic_patterns = [p.lower() for p in self.GENERIC_PATTERNS]

    def score(self, text: str, aspect_scores: Dict[str, float],
              expected_aspects: List[str] = None) -> FidelityScore:
        """
        Compute fidelity score for a text response.

        Args:
            text: The text to evaluate.
            aspect_scores: Scores from aspect extraction.
            expected_aspects: Aspects expected in this response.

        Returns:
            FidelityScore with detailed breakdown (depth/coherence/coverage
            blended 0.30/0.35/0.35, plus a list of detected issues).
        """
        issues = []

        # Guard: nothing substantive to evaluate.
        if not text or len(text.strip()) < 10:
            return FidelityScore(
                score=0.0, coherence=0.0, coverage=0.0, depth=0.0,
                issues=["Response is too short or empty"]
            )

        lowered = text.lower()
        n_words = len(text.split())

        # --- 1. Content depth, from word count ---
        if n_words < self.MIN_WORD_COUNT:
            depth = 0.2
            issues.append(f"Response too short ({n_words} words, minimum {self.MIN_WORD_COUNT})")
        elif n_words < self.IDEAL_WORD_COUNT:
            # Linear ramp from 0.5 up to 0.8 between MIN and IDEAL
            depth = 0.5 + 0.3 * (n_words - self.MIN_WORD_COUNT) / (self.IDEAL_WORD_COUNT - self.MIN_WORD_COUNT)
        elif n_words <= self.MAX_WORD_COUNT:
            depth = 1.0
        else:
            depth = 0.9  # Slightly penalize overly long responses
            issues.append("Response is longer than recommended")

        # --- 2. Generic/copy-paste phrase detection ---
        n_generic = sum(p in lowered for p in self.generic_patterns)
        penalty = min(n_generic * 0.1, 0.4)
        if n_generic > 2:
            issues.append(f"Contains {n_generic} generic phrases")

        # --- 3. Coherence: sentence structure and vocabulary diversity ---
        sentences = [frag.strip() for frag in text.split('.') if len(frag.strip()) > 10]
        if len(sentences) < 2:
            coherence = 0.4
            issues.append("Response lacks proper sentence structure")
        else:
            tokens = lowered.split()
            unique_ratio = len(set(tokens)) / max(len(tokens), 1)
            coherence = 0.6 + 0.2 * unique_ratio

        # Apply the generic-phrase penalty, floored at 0.2
        coherence = max(0.2, coherence - penalty)

        # --- 4. Aspect coverage ---
        if expected_aspects and aspect_scores:
            covered = sum(aspect_scores.get(a, 0) > 0.4 for a in expected_aspects)
            coverage = covered / len(expected_aspects)
            if coverage < 0.5:
                issues.append(f"Only {covered}/{len(expected_aspects)} expected aspects covered")
        elif aspect_scores:
            # No expectations given: having any one strong aspect is good
            coverage = min(max(aspect_scores.values()) + 0.2, 1.0)
        else:
            coverage = 0.3

        # --- 5. Overall fidelity blend ---
        overall = (
            0.30 * depth +
            0.35 * coherence +
            0.35 * coverage
        )

        return FidelityScore(
            score=round(overall, 3),
            coherence=round(coherence, 3),
            coverage=round(coverage, 3),
            depth=round(depth, 3),
            issues=issues
        )
165
+
166
+
167
class FidelityCriteriaTransformer:
    """
    Enhanced aspect extraction with domain-aware prototypes and fidelity validation.

    Key improvements over TextModuleV2:
    1. Domain-specific aspect prototypes from DomainKnowledgeBase
    2. Fidelity scoring for response quality validation
    3. Unified extraction interface with rich output

    Degrades gracefully: without sentence-transformers the encoder is None and
    extraction returns empty aspect maps; without the DomainKnowledgeBase the
    domain falls back to 'general' with built-in seed phrases.
    """

    def __init__(self,
                 model_name: str = None,
                 domains_dir: str = None,
                 use_gpu: bool = False):
        """
        Initialize FCT.

        Args:
            model_name: Sentence transformer model name; defaults to the
                FCT_MODEL_NAME env var, then 'all-mpnet-base-v2'.
            domains_dir: Path to domain config directory (passed to DKB).
            use_gpu: Whether to use GPU ('cuda') for encoding.
        """
        self.model_name = model_name or os.getenv('FCT_MODEL_NAME', 'all-mpnet-base-v2')
        self.device = 'cuda' if use_gpu else 'cpu'

        # Initialize encoder; None means fallback mode (no aspect scoring)
        if HAS_SBERT:
            logger.info(f"Loading FCT model: {self.model_name}")
            self.encoder = SentenceTransformer(self.model_name, device=self.device)
        else:
            self.encoder = None
            logger.warning("Running in fallback mode without sentence transformers")

        # Initialize domain knowledge base (None when module unavailable)
        if HAS_DKB:
            self.dkb = get_domain_knowledge_base(domains_dir)
        else:
            self.dkb = None

        # Initialize fidelity scorer
        self.fidelity_scorer = FidelityScorer()

        # Cache for centroids (domain -> aspect -> centroid vector)
        self._centroid_cache: Dict[str, Dict[str, np.ndarray]] = {}

        # Default aspects (fallback when no domain detected)
        self.default_aspects = [
            'technical_skills', 'problem_solving', 'leadership',
            'communication', 'teamwork', 'initiative', 'learning_agility'
        ]

        logger.info("FidelityCriteriaTransformer initialized")

    def _get_centroids(self, domain_id: str,
                       aspect_seeds: Dict[str, List[str]]) -> Dict[str, np.ndarray]:
        """Get or compute unit-normalized embedding centroids for aspects.

        NOTE(review): the cache key is the domain id only — assumes the seed
        phrases for a given domain never change within a process; confirm if
        DKB configs can be reloaded at runtime.
        Returns {} when no encoder is available.
        """
        cache_key = domain_id

        if cache_key in self._centroid_cache:
            return self._centroid_cache[cache_key]

        if not self.encoder:
            return {}

        centroids = {}
        for aspect, seeds in aspect_seeds.items():
            if not seeds:
                continue

            # Encode seed phrases for this aspect
            embeddings = self.encoder.encode(seeds, convert_to_tensor=False,
                                             show_progress_bar=False)
            embeddings = np.array(embeddings, dtype=np.float32)

            # Compute normalized centroid (epsilon guards zero norm)
            centroid = np.mean(embeddings, axis=0)
            centroid = centroid / (np.linalg.norm(centroid) + 1e-8)
            centroids[aspect] = centroid

        self._centroid_cache[cache_key] = centroids
        return centroids

    def _split_text(self, text: str, max_chunks: int = 20) -> List[str]:
        """Split text into up to max_chunks chunks for similarity scoring.

        Sentence split first; falls back to a 50-word sliding window (step 25)
        when fewer than 3 usable sentences are found.
        """
        import re

        # Split by sentence-ending punctuation
        sentences = re.split(r'[.!?]+', text)
        chunks = [s.strip() for s in sentences if len(s.strip()) > 20]

        # If too few sentences, use sliding word windows instead
        if len(chunks) < 3:
            words = text.split()
            window_size = 50
            step = 25
            chunks = []
            for i in range(0, max(1, len(words) - window_size + 1), step):
                chunk = ' '.join(words[i:i+window_size])
                if len(chunk) > 20:
                    chunks.append(chunk)

        return chunks[:max_chunks]

    def extract_aspects(self,
                        text: str,
                        domain_hint: str = None,
                        skills: List[str] = None,
                        expected_aspects: List[str] = None) -> AspectExtractionResult:
        """
        Extract aspects from text with fidelity validation.

        Args:
            text: Text to analyze.
            domain_hint: Optional domain ID to use (trusted with confidence 1.0
                when it is a known DKB domain).
            skills: Optional list of skills for domain detection.
            expected_aspects: Optional list of expected aspects, forwarded to
                the fidelity scorer for coverage checks.

        Returns:
            AspectExtractionResult with per-aspect scores, evidence chunks,
            fidelity assessment and the detected domain.
        """
        # 1. Domain Detection — hint wins if valid, else DKB, else 'general'
        if domain_hint and self.dkb and domain_hint in self.dkb.list_domains():
            detected_domain = domain_hint
            domain_confidence = 1.0
        elif self.dkb:
            detected_domain, domain_confidence = self.dkb.detect_domain(text, skills)
        else:
            detected_domain = 'general'
            domain_confidence = 0.0

        # 2. Get aspect prototype seed phrases for the domain
        if self.dkb and detected_domain != 'general':
            aspect_seeds = self.dkb.get_aspect_prototypes_for_domain(detected_domain)
        else:
            aspect_seeds = {}

        # 3. Fallback to built-in minimal seeds when the DKB has none
        if not aspect_seeds:
            aspect_seeds = {
                'technical_skills': ['developed software', 'built systems', 'coded in python'],
                'problem_solving': ['solved problems', 'debugged issues', 'optimized performance'],
                'leadership': ['led team', 'managed project', 'organized event'],
                'communication': ['presented to', 'wrote documentation', 'explained to']
            }

        # 4. Get/compute centroids (cached per domain)
        centroids = self._get_centroids(detected_domain, aspect_seeds)

        # 5. Score text chunks against each aspect centroid
        aspect_scores = {}
        chunk_evidence = {aspect: [] for aspect in centroids.keys()}

        if not text or len(text) < 20 or not self.encoder:
            # No scorable text or no encoder: fidelity only, empty aspects
            fidelity = self.fidelity_scorer.score(text, {}, expected_aspects)
            return AspectExtractionResult(
                aspects={},
                chunk_evidence={},
                fidelity=fidelity,
                detected_domain=detected_domain,
                domain_confidence=domain_confidence
            )

        # Split and encode text
        chunks = self._split_text(text)
        if not chunks:
            fidelity = self.fidelity_scorer.score(text, {}, expected_aspects)
            return AspectExtractionResult(
                aspects={},
                chunk_evidence={},
                fidelity=fidelity,
                detected_domain=detected_domain,
                domain_confidence=domain_confidence
            )

        chunk_embeddings = self.encoder.encode(chunks, convert_to_tensor=False,
                                               show_progress_bar=False)
        chunk_embeddings = np.array(chunk_embeddings, dtype=np.float32)

        # Score each aspect against all chunks
        for aspect, centroid in centroids.items():
            # Cosine similarities between every chunk and the centroid
            sims = np.dot(chunk_embeddings, centroid) / (
                np.linalg.norm(chunk_embeddings, axis=1) * np.linalg.norm(centroid) + 1e-8
            )

            # Scoring: blend of best chunk and mean of top-k chunks
            max_sim = float(np.max(sims))
            top_k = 3
            topk_sims = np.partition(sims, -min(top_k, len(sims)))[-top_k:]
            mean_topk = float(np.mean(topk_sims))

            # Map cosine range [-1, 1] to [0, 1]
            raw_score = 0.6 * max_sim + 0.4 * mean_topk
            normalized = (raw_score + 1) / 2
            aspect_scores[aspect] = float(np.clip(normalized, 0, 1))

            # Collect chunks whose raw similarity clears the evidence threshold
            threshold = 0.35
            for i, sim in enumerate(sims):
                if sim > threshold:
                    chunk_evidence[aspect].append(chunks[i])

        # 6. Fidelity scoring over the full text and extracted aspects
        fidelity = self.fidelity_scorer.score(text, aspect_scores, expected_aspects)

        return AspectExtractionResult(
            aspects=aspect_scores,
            chunk_evidence=chunk_evidence,
            fidelity=fidelity,
            detected_domain=detected_domain,
            domain_confidence=domain_confidence
        )

    def score_student_text(self, text_responses: Dict[str, str],
                           domain_hint: str = None,
                           skills: List[str] = None) -> Dict[str, Any]:
        """
        Score all text responses for a student.

        Args:
            text_responses: Dict with keys text_q1, text_q2, text_q3.
            domain_hint: Optional domain ID.
            skills: Optional skills list for domain detection.

        Returns:
            Dict with 'score' (weighted aspect blend), 'confidence'
            (average fidelity), detected domain info, per-aspect scores,
            per-question fidelity, and all detected issues.
        """
        text_q1 = text_responses.get('text_q1', '')
        text_q2 = text_responses.get('text_q2', '')
        text_q3 = text_responses.get('text_q3', '')

        # Combined text (currently unused; domain comes from q1's extraction)
        combined_text = f"{text_q1} {text_q2} {text_q3}"

        # Expected aspects per question (used for fidelity coverage checks)
        q1_aspects = ['technical_skills', 'problem_solving', 'learning_agility']
        q2_aspects = ['career_alignment', 'initiative', 'learning_agility']
        q3_aspects = ['leadership', 'teamwork', 'communication']

        # Extract aspects per question
        q1_result = self.extract_aspects(text_q1, domain_hint, skills, q1_aspects)
        q2_result = self.extract_aspects(text_q2, domain_hint, skills, q2_aspects)
        q3_result = self.extract_aspects(text_q3, domain_hint, skills, q3_aspects)

        # Aggregate: best score across questions for each aspect
        all_aspects = {}
        for result in [q1_result, q2_result, q3_result]:
            for aspect, score in result.aspects.items():
                if aspect in all_aspects:
                    all_aspects[aspect] = max(all_aspects[aspect], score)
                else:
                    all_aspects[aspect] = score

        # Overall fidelity is the mean across the three questions
        avg_fidelity = np.mean([
            q1_result.fidelity.score,
            q2_result.fidelity.score,
            q3_result.fidelity.score
        ])

        # Fixed aspect weights for the final text score (sum to 1.0)
        weights = {
            'technical_skills': 0.15,
            'problem_solving': 0.10,
            'leadership': 0.20,
            'communication': 0.15,
            'teamwork': 0.10,
            'learning_agility': 0.10,
            'initiative': 0.10,
            'career_alignment': 0.10
        }

        # Missing aspects contribute a neutral 0.3 rather than 0
        weighted_score = sum(
            all_aspects.get(aspect, 0.3) * weight
            for aspect, weight in weights.items()
        )

        # Confidence based on fidelity
        confidence = avg_fidelity

        return {
            'score': round(weighted_score, 3),
            'confidence': round(confidence, 3),
            'detected_domain': q1_result.detected_domain,
            'domain_confidence': round(q1_result.domain_confidence, 3),
            'aspects': {k: round(v, 3) for k, v in all_aspects.items()},
            'fidelity': {
                'overall': round(avg_fidelity, 3),
                'q1': round(q1_result.fidelity.score, 3),
                'q2': round(q2_result.fidelity.score, 3),
                'q3': round(q3_result.fidelity.score, 3)
            },
            'issues': (
                q1_result.fidelity.issues +
                q2_result.fidelity.issues +
                q3_result.fidelity.issues
            )
        }
467
+
468
+
469
# Singleton — one process-wide transformer so the encoder model loads once
_fct_instance: Optional[FidelityCriteriaTransformer] = None


def get_fidelity_transformer(model_name: str = None,
                             domains_dir: str = None) -> FidelityCriteriaTransformer:
    """Get or create the singleton FCT instance.

    Args:
        model_name: Used only on the first call; later calls return the
            cached instance regardless of arguments.
        domains_dir: Same first-call-only semantics as model_name.

    Returns:
        The shared FidelityCriteriaTransformer instance.
    """
    global _fct_instance

    if _fct_instance is None:
        _fct_instance = FidelityCriteriaTransformer(model_name, domains_dir)

    return _fct_instance
services/fusion.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fusion Engine - Confidence-weighted Score Fusion"""
2
+ from typing import Dict, Tuple, Optional
3
+ from config import Config
4
+
5
class FusionEngine:
    """Combines scores from all modules with confidence weighting.

    Each module contributes (configured weight * module confidence); the
    weighted sum of module scores is normalized by the total effective
    weight, so a low-confidence module pulls the final score toward the
    other modules rather than toward zero.
    """

    def __init__(self):
        # Base weights (when no domain evidence)
        self.base_weights = {
            'universal': Config.UNIVERSAL_WEIGHT,
            'personality': Config.PERSONALITY_WEIGHT,
            'text': Config.TEXT_WEIGHT
        }

        # Extended weights (when domain evidence exists)
        self.extended_weights = {
            'universal': 0.30,   # Reduced from base
            'personality': 0.25,
            'text': 0.25,
            'domain': 0.20       # New domain component
        }

    def fuse_scores(
        self,
        universal_score: float,
        universal_confidence: float,
        personality_score: float,
        personality_confidence: float,
        text_score: float,
        text_confidence: float,
        domain_score: Optional[float] = None,
        domain_confidence: Optional[float] = None
    ) -> Tuple[float, Dict]:
        """
        Fuse module scores with confidence weighting.

        Supports an optional domain score for pluggable domain evidence;
        domain participates only when both domain_score and a positive
        domain_confidence are supplied.

        Returns:
            (final_score, breakdown) where breakdown holds component scores,
            confidences, normalized effective weights and the weight set used.
        """
        # Determine which weight set applies
        has_domain = (
            domain_score is not None
            and domain_confidence is not None
            and domain_confidence > 0
        )
        weights = self.extended_weights if has_domain else self.base_weights

        # Effective weight = configured weight scaled by module confidence
        effective_weights = {
            'universal': weights['universal'] * universal_confidence,
            'personality': weights['personality'] * personality_confidence,
            'text': weights['text'] * text_confidence
        }
        if has_domain:
            effective_weights['domain'] = weights['domain'] * domain_confidence

        # Sum of effective weights (for normalization)
        total_effective_weight = sum(effective_weights.values())

        # Degenerate case: every participating module reported zero
        # confidence. Note has_domain is necessarily False here — a positive
        # domain confidence would have made the total nonzero — so the
        # previous dead "add domain entries" branch has been removed.
        if total_effective_weight == 0:
            breakdown = {
                'final_score': 0.0,
                'component_scores': {
                    'universal': 0.0,
                    'personality': 0.0,
                    'text': 0.0
                },
                'confidences': {
                    'universal': 0.0,
                    'personality': 0.0,
                    'text': 0.0
                },
                'effective_weights': effective_weights,
                'has_domain': has_domain
            }
            return 0.0, breakdown

        # Confidence-weighted average of the module scores
        fused_score = (
            effective_weights['universal'] * universal_score +
            effective_weights['personality'] * personality_score +
            effective_weights['text'] * text_score
        )
        if has_domain:
            fused_score += effective_weights['domain'] * domain_score

        fused_score /= total_effective_weight

        # Prepare breakdown for reporting/debugging
        breakdown = {
            'final_score': round(fused_score, 4),
            'component_scores': {
                'universal': round(universal_score, 4),
                'personality': round(personality_score, 4),
                'text': round(text_score, 4)
            },
            'confidences': {
                'universal': round(universal_confidence, 4),
                'personality': round(personality_confidence, 4),
                'text': round(text_confidence, 4)
            },
            'effective_weights': {
                k: round(v / total_effective_weight, 4)
                for k, v in effective_weights.items()
            },
            'base_weights': weights,
            'has_domain': has_domain
        }

        if has_domain:
            breakdown['component_scores']['domain'] = round(domain_score, 4)
            breakdown['confidences']['domain'] = round(domain_confidence, 4)

        return fused_score, breakdown

    def get_grade(self, final_score: float) -> str:
        """Convert a 0-1 score to a letter grade (A+ down to D)."""
        if final_score >= 0.9:
            return 'A+'
        elif final_score >= 0.85:
            return 'A'
        elif final_score >= 0.8:
            return 'A-'
        elif final_score >= 0.75:
            return 'B+'
        elif final_score >= 0.7:
            return 'B'
        elif final_score >= 0.65:
            return 'B-'
        elif final_score >= 0.6:
            return 'C+'
        elif final_score >= 0.55:
            return 'C'
        elif final_score >= 0.5:
            return 'C-'
        else:
            return 'D'

    def get_percentile(self, final_score: float) -> int:
        """Estimate percentile (mock for MVP; capped at 99).

        In production this would query the actual score distribution.
        """
        return min(int(final_score * 100), 99)
services/personality_module.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Personality Module - Big Five Trait Scoring"""
2
+ import numpy as np
3
+ from typing import Dict, Tuple
4
+
5
class PersonalityModule:
    """Score a student's Big Five personality profile from Likert responses.

    Twenty questions (p_q1..p_q20, values 1-5) map onto five traits; each
    trait has three normally-scored items and one reverse-scored item.
    """

    def __init__(self):
        # Question ids per trait; the *_r lists hold reverse-scored items
        # whose answers must be inverted before normalisation.
        self.trait_mapping = {
            'openness': ['p_q1', 'p_q3', 'p_q4'],
            'openness_r': ['p_q2'],  # Reversed
            'conscientiousness': ['p_q5', 'p_q7', 'p_q8'],
            'conscientiousness_r': ['p_q6'],  # Reversed
            'extraversion': ['p_q9', 'p_q11', 'p_q12'],
            'extraversion_r': ['p_q10'],  # Reversed
            'agreeableness': ['p_q13', 'p_q15', 'p_q16'],
            'agreeableness_r': ['p_q14'],  # Reversed
            'stability': ['p_q17', 'p_q19', 'p_q20'],
            'stability_r': ['p_q18']  # Reversed
        }

        # Employability weighting per trait (sums to 1.0);
        # conscientiousness carries the largest share.
        self.trait_weights = {
            'openness': 0.20,
            'conscientiousness': 0.30,
            'extraversion': 0.20,
            'agreeableness': 0.15,
            'stability': 0.15
        }

    def score(self, responses: Dict[str, int]) -> Tuple[float, float, Dict]:
        """
        Calculate personality score from 20 questions
        Returns: (score, confidence, trait_scores)

        Each answered item is normalised from the 1-5 Likert scale to
        [0, 1] (reversed items inverted first); a trait with no answered
        items falls back to a neutral 0.5.
        """
        trait_scores = {}

        for trait_name in ('openness', 'conscientiousness', 'extraversion',
                           'agreeableness', 'stability'):
            direct_items = self.trait_mapping[trait_name]
            reverse_items = self.trait_mapping[f'{trait_name}_r']

            # Normalise 1-5 to 0-1; higher is better for direct items.
            normalised = [
                (responses[q] - 1) / 4.0
                for q in direct_items if q in responses
            ]
            # Reversed items: lower raw answer = better, so invert first.
            normalised += [
                (5 - responses[q]) / 4.0
                for q in reverse_items if q in responses
            ]

            trait_scores[trait_name] = np.mean(normalised) if normalised else 0.5

        # Weighted blend of the five trait scores.
        overall = sum(
            trait_scores[t] * self.trait_weights[t]
            for t in trait_scores
        )

        # Confidence = fraction of the 20 questions present.
        # NOTE(review): this counts *every* key in responses; assumes callers
        # pass only the p_q* keys -- confirm upstream.
        total_questions = 20
        confidence = len(responses) / total_questions

        return overall, confidence, trait_scores

    def explain(self, trait_scores: Dict) -> Dict:
        """Summarise the strongest and weakest traits for display."""
        ranked = sorted(trait_scores.items(), key=lambda item: item[1], reverse=True)

        def _entry(trait, value):
            # Shared payload shape for both lists.
            return {
                'trait': trait.capitalize(),
                'score': round(value, 2),
                'description': self._get_trait_description(trait, value)
            }

        explanations = {
            'top_strengths': [],
            'areas_for_growth': []
        }

        # Strongest two traits, reported only above the 0.6 bar.
        for trait, value in ranked[:2]:
            if value > 0.6:
                explanations['top_strengths'].append(_entry(trait, value))

        # Weakest two traits, flagged only below 0.5.
        for trait, value in ranked[-2:]:
            if value < 0.5:
                explanations['areas_for_growth'].append(_entry(trait, value))

        return explanations

    def _get_trait_description(self, trait: str, score: float) -> str:
        """Return a one-line description of *trait* at its high/low level."""
        descriptions = {
            'openness': {
                'high': "Highly creative, curious, and open to new experiences",
                'low': "Prefers routine and traditional approaches"
            },
            'conscientiousness': {
                'high': "Very organized, reliable, and goal-oriented",
                'low': "May benefit from improved organization and planning"
            },
            'extraversion': {
                'high': "Energetic, sociable, and thrives in team environments",
                'low': "Prefers independent work and smaller groups"
            },
            'agreeableness': {
                'high': "Cooperative, empathetic, and team-oriented",
                'low': "Independent thinker, comfortable with competition"
            },
            'stability': {
                'high': "Emotionally stable, handles stress well",
                'low': "May experience stress in high-pressure situations"
            }
        }

        # Scores above 0.6 read as the 'high' variant of the trait.
        level = 'high' if score > 0.6 else 'low'
        return descriptions.get(trait, {}).get(level, f"{trait} score: {score:.2f}")
services/student_output.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Student Output Service - Individual student JSON formatting
3
+ Provides structured analysis output for single students
4
+ """
5
+ import logging
6
+ from typing import Dict, List, Any, Optional
7
+ from datetime import datetime
8
+ from dataclasses import dataclass, asdict
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
@dataclass
class SkillRecommendation:
    """Skill gap recommendation for student"""
    # NOTE(review): StudentOutputService builds plain dicts with these same
    # keys instead of instantiating this dataclass -- confirm intended use.
    skill: str                      # skill name the student should acquire
    priority: str                   # e.g. 'high' (value used by _recommend_skills)
    recommended_courses: List[str]  # course titles that close the gap
    certifications: List[str]       # relevant certifications (may be empty)
20
+
21
+
22
@dataclass
class CareerPath:
    """Suggested career path"""
    # NOTE(review): StudentOutputService emits plain dicts with these same
    # keys rather than this dataclass -- confirm intended use.
    role: str                    # human-readable role title
    fit_score: float             # 0-1 fit against role requirements (capped at 1.0)
    requirements_met: List[str]  # required skills the student already has
    requirements_gap: List[str]  # required skills still missing
29
+
30
+
31
class StudentOutputService:
    """
    Formats individual student analysis into structured JSON
    """

    # Grade thresholds as (min_score, letter, description), scanned top-down
    # by _get_grade -- keep sorted in descending score order.
    # NOTE(review): this scale (A+ at 0.90, A at 0.80, ...) is coarser than
    # the fusion scorer's get_grade scale -- confirm they are meant to differ.
    GRADE_THRESHOLDS = [
        (0.90, 'A+', 'Outstanding'),
        (0.80, 'A', 'Excellent'),
        (0.70, 'B+', 'Very Good'),
        (0.60, 'B', 'Good'),
        (0.50, 'C', 'Average'),
        (0.40, 'D', 'Below Average'),
        (0.00, 'F', 'Needs Improvement')
    ]

    def __init__(self):
        # Role requirements mapping: 'required' skills drive the base fit
        # score; 'preferred' skills add a bonus (see _suggest_careers).
        self.career_requirements = {
            'software_engineer': {
                'required': ['python', 'sql', 'git', 'problem_solving'],
                'preferred': ['cloud', 'docker', 'system_design']
            },
            'data_scientist': {
                'required': ['python', 'sql', 'statistics', 'machine_learning'],
                'preferred': ['deep_learning', 'spark', 'mlops']
            },
            'product_manager': {
                'required': ['communication', 'leadership', 'analytics'],
                'preferred': ['sql', 'strategic_thinking', 'stakeholder_management']
            },
            'mechanical_engineer': {
                'required': ['cad', 'engineering_drawing', 'manufacturing'],
                'preferred': ['fea', 'cfd', 'automation']
            }
        }

    def format_student_output(self,
                              student_id: str,
                              score_packet: Dict[str, Any],
                              domain_analysis: Optional[Dict[str, Any]] = None,
                              raw_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Format comprehensive student analysis JSON

        Args:
            student_id: Student identifier
            score_packet: Output from scoring endpoint
            domain_analysis: Output from FCT (optional)
            raw_data: Original student data (optional)

        Returns:
            Structured student JSON
        """
        # Extract core scores
        final_score = score_packet.get('final_score', 0)
        grade, grade_desc = self._get_grade(final_score)

        # Component scores
        component_scores = score_packet.get('scores', {}).get('component_scores', {})
        confidences = score_packet.get('scores', {}).get('confidences', {})
        detailed_features = score_packet.get('detailed_features', {})

        # Domain info: prefer the FCT detection, fall back to the packet.
        detected_domain = (
            domain_analysis.get('detected_domain') if domain_analysis
            else score_packet.get('domain_type', 'general')
        )

        # Build output
        # NOTE(review): datetime.utcnow() is naive and deprecated since
        # Python 3.12; switching to datetime.now(timezone.utc) would change
        # the isoformat string (adds +00:00), so adjust the 'Z' suffix too.
        output = {
            'student_id': student_id,
            'generated_at': datetime.utcnow().isoformat() + 'Z',

            'summary': {
                'final_score': round(final_score, 3),
                'grade': grade,
                'grade_description': grade_desc,
                'percentile': score_packet.get('percentile', 50),
                # 0.60 is the placement-readiness cutoff (grade 'B' or better).
                'placement_ready': final_score >= 0.60
            },

            'scores': {
                'universal': {
                    'score': round(component_scores.get('universal', 0), 3),
                    'confidence': round(confidences.get('universal', 0), 3),
                    'features': detailed_features.get('universal', {})
                },
                'personality': {
                    'score': round(component_scores.get('personality', 0), 3),
                    'confidence': round(confidences.get('personality', 0), 3),
                    'traits': detailed_features.get('personality', {})
                },
                'text': {
                    'score': round(component_scores.get('text', 0), 3),
                    'confidence': round(confidences.get('text', 0), 3),
                    'aspects': detailed_features.get('text', {})
                }
            },

            'domain_analysis': self._format_domain_analysis(
                detected_domain, domain_analysis, raw_data
            ),

            'strengths': self._identify_strengths(detailed_features),

            'improvement_areas': self._identify_improvements(detailed_features),

            'career_suggestions': self._suggest_careers(
                detected_domain, detailed_features, raw_data
            ),

            'skill_recommendations': self._recommend_skills(
                detected_domain, raw_data
            ),

            'explanations': score_packet.get('explanations', {})
        }

        # Add fidelity if available
        if domain_analysis and 'fidelity' in domain_analysis:
            output['fidelity_assessment'] = domain_analysis['fidelity']

        return output

    def _get_grade(self, score: float) -> tuple:
        """Get grade and description for score.

        Returns the first (grade, description) pair whose threshold the
        score meets; the final return is a safety net for scores < 0.
        """
        for threshold, grade, desc in self.GRADE_THRESHOLDS:
            if score >= threshold:
                return (grade, desc)
        return ('F', 'Needs Improvement')

    def _format_domain_analysis(self, detected_domain: str,
                                domain_analysis: Optional[Dict],
                                raw_data: Optional[Dict]) -> Dict[str, Any]:
        """Format domain-specific analysis.

        Always returns the detected domain plus a display name; confidence,
        aspects and current skills are included only when available.
        """
        result = {
            'detected_domain': detected_domain,
            'display_name': detected_domain.replace('_', ' ').title()
        }

        if domain_analysis:
            result['domain_confidence'] = domain_analysis.get('domain_confidence', 0)
            result['aspects'] = domain_analysis.get('aspects', {})

        # Skill gaps from raw data; a comma-separated string is normalised
        # to a lowercase list.
        if raw_data and 'skills' in raw_data:
            skills = raw_data.get('skills', [])
            if isinstance(skills, str):
                skills = [s.strip().lower() for s in skills.split(',')]
            result['current_skills'] = skills

        return result

    def _identify_strengths(self, features: Dict) -> List[Dict]:
        """Identify top strengths from features.

        Scans universal, personality and text features against fixed
        thresholds and returns the five highest-scoring strengths.
        """
        strengths = []

        # Universal features
        universal = features.get('universal', {})
        if universal.get('cgpa_norm', 0) > 0.8:
            strengths.append({
                'area': 'Academic Excellence',
                'score': universal['cgpa_norm'],
                'description': 'Strong academic performance with high CGPA'
            })

        if universal.get('internship_exposure', 0) > 0.7:
            strengths.append({
                'area': 'Industry Experience',
                'score': universal['internship_exposure'],
                'description': 'Significant practical experience through internships'
            })

        # Personality traits: any trait above 0.75 counts as a strength.
        personality = features.get('personality', {})
        for trait, score in personality.items():
            if score > 0.75:
                strengths.append({
                    'area': trait.title(),
                    'score': score,
                    'description': self._get_trait_description(trait, 'high')
                })

        # Text aspects
        text = features.get('text', {})
        if text.get('leadership_score', 0) > 0.7:
            strengths.append({
                'area': 'Leadership',
                'score': text['leadership_score'],
                'description': 'Demonstrated leadership abilities with concrete examples'
            })

        if text.get('technical_skills', 0) > 0.7:
            strengths.append({
                'area': 'Technical Skills',
                'score': text['technical_skills'],
                'description': 'Strong technical competencies'
            })

        # Sort by score and return top 5
        strengths.sort(key=lambda x: x['score'], reverse=True)
        return strengths[:5]

    def _identify_improvements(self, features: Dict) -> List[Dict]:
        """Identify areas needing improvement.

        Low universal/text feature scores (below fixed cutoffs) become
        suggestions; the four weakest areas are returned, lowest first.
        """
        improvements = []

        # Universal features
        universal = features.get('universal', {})
        if universal.get('ec_quality', 0) < 0.4:
            improvements.append({
                'area': 'Extracurricular Activities',
                'current_score': universal.get('ec_quality', 0),
                'suggestion': 'Join clubs, participate in competitions, or take leadership roles'
            })

        if universal.get('cert_quality', 0) < 0.4:
            improvements.append({
                'area': 'Professional Certifications',
                'current_score': universal.get('cert_quality', 0),
                'suggestion': 'Pursue industry-recognized certifications in your domain'
            })

        # Text aspects
        text = features.get('text', {})
        if text.get('communication', 0) < 0.5:
            improvements.append({
                'area': 'Communication Skills',
                'current_score': text.get('communication', 0),
                'suggestion': 'Practice public speaking, write detailed project documentation'
            })

        if text.get('career_alignment', 0) < 0.5:
            improvements.append({
                'area': 'Career Clarity',
                'current_score': text.get('career_alignment', 0),
                'suggestion': 'Define clear short-term and long-term career goals'
            })

        # Sort by score (lowest first)
        improvements.sort(key=lambda x: x['current_score'])
        return improvements[:4]

    def _suggest_careers(self, domain: str, features: Dict,
                         raw_data: Optional[Dict]) -> List[Dict]:
        """Suggest career paths based on profile.

        Fit score = fraction of required skills met, plus up to a 0.3
        bonus for preferred skills, capped at 1.0. Roles scoring above
        0.3 are returned, best three first.
        """
        suggestions = []

        # Get student skills (comma-string or list, lower-cased either way).
        skills = []
        if raw_data and 'skills' in raw_data:
            skills_raw = raw_data.get('skills', [])
            if isinstance(skills_raw, str):
                skills = [s.strip().lower() for s in skills_raw.split(',')]
            else:
                skills = [s.lower() for s in skills_raw]

        # Text features for soft skills
        text = features.get('text', {})

        for role, reqs in self.career_requirements.items():
            # Calculate fit score: a requirement counts if it is a listed
            # skill OR a soft skill evidenced by the text analysis.
            required_met = sum(1 for r in reqs['required']
                               if r in skills or self._has_soft_skill(r, text))
            preferred_met = sum(1 for p in reqs['preferred']
                                if p in skills or self._has_soft_skill(p, text))

            total_reqs = len(reqs['required'])
            fit_score = (required_met / total_reqs) if total_reqs else 0
            fit_score += (preferred_met / len(reqs['preferred'])) * 0.3 if reqs['preferred'] else 0
            fit_score = min(fit_score, 1.0)

            if fit_score > 0.3:  # Minimum threshold
                suggestions.append({
                    'role': role.replace('_', ' ').title(),
                    'fit_score': round(fit_score, 2),
                    'requirements_met': [r for r in reqs['required']
                                         if r in skills or self._has_soft_skill(r, text)],
                    'requirements_gap': [r for r in reqs['required']
                                         if r not in skills and not self._has_soft_skill(r, text)]
                })

        # Sort by fit score
        suggestions.sort(key=lambda x: x['fit_score'], reverse=True)
        return suggestions[:3]

    def _has_soft_skill(self, skill: str, text_features: Dict) -> bool:
        """Check if student has a soft skill based on text analysis.

        Maps requirement names onto text-feature keys; a feature value
        above 0.6 counts as having the skill. Unknown names return False.
        """
        skill_mapping = {
            'communication': 'communication',
            'leadership': 'leadership_score',
            'problem_solving': 'problem_solving',
            'teamwork': 'teamwork'
        }

        if skill in skill_mapping:
            return text_features.get(skill_mapping[skill], 0) > 0.6
        return False

    def _recommend_skills(self, domain: str, raw_data: Optional[Dict]) -> List[Dict]:
        """Recommend skills to acquire.

        Suggests up to four domain skills the student does not already
        list, each with course recommendations.
        """
        recommendations = []

        # Domain-specific recommendations.
        # NOTE(review): these keys ('software_engineering', ...) do not match
        # the career_requirements keys ('software_engineer', ...) -- confirm
        # which naming the `domain` argument actually uses; unknown domains
        # fall back to the software_engineering list below.
        domain_skills = {
            'software_engineering': [
                {'skill': 'cloud', 'courses': ['AWS Solutions Architect', 'GCP Fundamentals']},
                {'skill': 'system_design', 'courses': ['Grokking System Design']},
                {'skill': 'devops', 'courses': ['Docker Mastery', 'Kubernetes']}
            ],
            'data_science': [
                {'skill': 'deep_learning', 'courses': ['Deep Learning Specialization']},
                {'skill': 'mlops', 'courses': ['MLOps for Production']},
                {'skill': 'statistics', 'courses': ['Statistics with Python']}
            ],
            'mechanical_engineering': [
                {'skill': 'ev_powertrain', 'courses': ['Electric Vehicle Technology']},
                {'skill': 'automation', 'courses': ['Industrial Automation', 'PLC']}
            ]
        }

        # Get current skills.
        # NOTE(review): unlike _suggest_careers, a list-valued 'skills'
        # field is ignored here (only the comma-string form is parsed) --
        # confirm whether list input should also be handled.
        current_skills = []
        if raw_data and 'skills' in raw_data:
            skills_raw = raw_data.get('skills', [])
            if isinstance(skills_raw, str):
                current_skills = [s.strip().lower() for s in skills_raw.split(',')]

        # Recommend missing skills
        domain_recs = domain_skills.get(domain, domain_skills.get('software_engineering', []))

        for rec in domain_recs:
            if rec['skill'] not in current_skills:
                recommendations.append({
                    'skill': rec['skill'].replace('_', ' ').title(),
                    'priority': 'high',
                    'recommended_courses': rec['courses'],
                    'certifications': []
                })

        return recommendations[:4]

    def _get_trait_description(self, trait: str, level: str) -> str:
        """Get description for personality trait.

        `level` is 'high' or 'low'; unknown traits get a generic fallback.
        """
        descriptions = {
            'openness': {
                'high': 'Creative, curious, and open to new experiences',
                'low': 'Practical and focused on concrete tasks'
            },
            'conscientiousness': {
                'high': 'Organized, disciplined, and reliable',
                'low': 'Flexible and adaptable to changing situations'
            },
            'extraversion': {
                'high': 'Energetic, sociable, and thrives in team settings',
                'low': 'Focused, reflective, and excels in independent work'
            },
            'agreeableness': {
                'high': 'Cooperative, empathetic, and team-oriented',
                'low': 'Independent thinker, comfortable with competition'
            },
            'stability': {
                'high': 'Emotionally resilient and handles stress well',
                'low': 'Sensitive and responsive to feedback'
            }
        }

        return descriptions.get(trait, {}).get(level, f"Strong {trait}")
400
+
401
+
402
# Singleton
_student_output_service: Optional[StudentOutputService] = None


def get_student_output_service() -> StudentOutputService:
    """Return the lazily-created, process-wide StudentOutputService."""
    global _student_output_service
    service = _student_output_service
    if service is None:
        # First call: build and cache the shared instance.
        service = StudentOutputService()
        _student_output_service = service
    return service
411
+ return _student_output_service
services/text_module.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text Embeddings Module - NLP-based Scoring"""
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ from typing import Dict, Tuple
5
+ import re
6
+
7
class TextModule:
    """Score free-text answers with SBERT embeddings plus simple heuristics.

    Three responses are expected: strengths (text_q1), career interests
    (text_q2) and extracurriculars (text_q3).
    """

    def __init__(self):
        # Sentence-BERT encoder used for semantic similarity.
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

        # Pre-encoded "ideal answer" prototypes per question theme.
        self.reference_embeddings = {
            'strengths': self.model.encode([
                "I have strong technical skills in programming, problem-solving, and software development",
                "My strengths include leadership, communication, and analytical thinking",
                "I excel at teamwork, project management, and innovative solutions"
            ]),
            'career': self.model.encode([
                "I am interested in software engineering and technology innovation",
                "I want to work in data science and machine learning",
                "My goal is to become a product manager and lead technical teams"
            ])
        }

        # Substrings that signal leadership experience.
        self.leadership_keywords = [
            'lead', 'leader', 'leadership', 'managed', 'organized', 'president',
            'head', 'coordinator', 'captain', 'founded', 'initiated', 'directed'
        ]

    def score(self, text_responses: Dict[str, str]) -> Tuple[float, float, Dict]:
        """
        Calculate text score from 3 textual responses
        Returns: (score, confidence, features)
        """
        answer_strengths = text_responses.get('text_q1', '')
        answer_career = text_responses.get('text_q2', '')
        answer_extras = text_responses.get('text_q3', '')

        features = {
            # text_q1 (strengths): surface writing quality.
            'writing_quality': self._assess_writing_quality(answer_strengths),
            # text_q2 (career interests): semantic coherence vs prototypes.
            'intent_coherence': self._assess_intent_coherence(answer_career),
            # text_q3 (extracurriculars): leadership keyword evidence.
            'leadership_score': self._assess_leadership(answer_extras),
            # All three answers: overall volume/depth.
            'content_depth': self._assess_content_depth(
                answer_strengths, answer_career, answer_extras),
        }

        # Weighted blend; leadership carries the largest share (0.30).
        overall = (
            features['writing_quality'] * 0.25 +
            features['intent_coherence'] * 0.25 +
            features['leadership_score'] * 0.30 +
            features['content_depth'] * 0.20
        )

        confidence = self._calculate_confidence(
            answer_strengths, answer_career, answer_extras)

        return overall, confidence, features

    def _assess_writing_quality(self, text: str) -> float:
        """Heuristic 0-1 writing-quality score for a single response."""
        if not text or len(text) < 50:
            return 0.2  # too short to judge

        quality = 0.5  # base score for any non-trivial answer

        # Length band: 150-300 words is ideal.
        word_count = len(text.split())
        if 150 <= word_count <= 300:
            quality += 0.3
        elif 100 <= word_count < 150 or 300 < word_count <= 400:
            quality += 0.2
        else:
            quality += 0.1

        # Several sentences suggest structured writing.
        if len(re.split(r'[.!?]+', text)) >= 5:
            quality += 0.1

        # Starts with a capital letter.
        if text[0].isupper():
            quality += 0.05

        # Vocabulary variety: reward low repetition.
        tokens = text.lower().split()
        if tokens and len(set(tokens)) / len(tokens) > 0.6:
            quality += 0.05

        return min(quality, 1.0)

    def _assess_intent_coherence(self, text: str) -> float:
        """Similarity of the career answer to prototype career statements."""
        if not text or len(text) < 50:
            return 0.2

        embedding = self.model.encode([text])[0]
        response_norm = np.linalg.norm(embedding)

        # Cosine similarity against every 'career' prototype.
        similarities = [
            np.dot(embedding, proto) / (response_norm * np.linalg.norm(proto))
            for proto in self.reference_embeddings['career']
        ]

        best = max(similarities) if similarities else 0

        # Map cosine range [-1, 1] onto [0, 1].
        return (best + 1) / 2

    def _assess_leadership(self, text: str) -> float:
        """Keyword-based leadership score for the extracurricular answer."""
        if not text or len(text) < 50:
            return 0.2

        lowered = text.lower()

        # Substring hits across the leadership vocabulary.
        hits = sum(keyword in lowered for keyword in self.leadership_keywords)

        # Three or more hits saturate the score; zero still earns 0.3.
        if hits >= 3:
            result = 1.0
        elif hits == 2:
            result = 0.8
        elif hits == 1:
            result = 0.6
        else:
            result = 0.3

        # Explicit team-lead phrasing gets a small capped bonus.
        if 'led a team' in lowered or 'team lead' in lowered:
            result = min(result + 0.1, 1.0)

        return result

    def _assess_content_depth(self, text_q1: str, text_q2: str, text_q3: str) -> float:
        """Score combined volume of all three answers (450+ words = 1.0)."""
        total_words = sum(len(part.split()) for part in (text_q1, text_q2, text_q3))

        # Step function over total word count; 450 = 150+ words each.
        for floor, value in ((450, 1.0), (300, 0.8), (200, 0.6), (100, 0.4)):
            if total_words >= floor:
                return value
        return 0.2

    def _calculate_confidence(self, text_q1: str, text_q2: str, text_q3: str) -> float:
        """Average completeness (0-1) of the three answers, judged by length."""
        def completeness(answer: str) -> float:
            # Empty -> 0, short -> 0.3, medium -> 0.6, full -> 1.0.
            if not answer:
                return 0
            if len(answer) < 50:
                return 0.3
            if len(answer) < 100:
                return 0.6
            return 1.0

        return np.mean([completeness(answer) for answer in (text_q1, text_q2, text_q3)])

    def explain(self, features: Dict) -> Dict:
        """Turn feature scores into human-readable highlights and suggestions."""
        highlights = []
        suggestions = []

        # Highlights: features comfortably above 0.7.
        if features.get('writing_quality', 0) > 0.7:
            highlights.append("Strong writing quality with clear communication")
        if features.get('leadership_score', 0) > 0.7:
            highlights.append("Demonstrated leadership experience and initiative")
        if features.get('intent_coherence', 0) > 0.7:
            highlights.append("Clear and coherent career goals")

        # Suggestions: features below 0.5.
        if features.get('writing_quality', 0) < 0.5:
            suggestions.append("Provide more detailed responses (aim for 150-300 words each)")
        if features.get('leadership_score', 0) < 0.5:
            suggestions.append("Highlight specific leadership roles and their impact")
        if features.get('content_depth', 0) < 0.5:
            suggestions.append("Include more specific examples and achievements")

        return {'highlights': highlights, 'suggestions': suggestions}
services/text_module_v2.py ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text Embeddings Module V2 - Aspect-based Prototype Extraction"""
2
+ import os
3
+ import json
4
+ import logging
5
+ import numpy as np
6
+ from datetime import datetime
7
+ from typing import Dict, Tuple, List, Optional
8
+ from sentence_transformers import SentenceTransformer
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ # Default aspect seeds (built-in fallback)
13
+ DEFAULT_ASPECT_SEEDS = {
14
+ "leadership": [
15
+ "led a team", "was team lead", "managed a project", "supervised interns",
16
+ "coordinated a cross-functional team", "organized the club", "president of the society",
17
+ "captain of the team", "ran weekly standups", "delegated tasks", "mentored junior members",
18
+ "headed the project", "oversaw project timelines", "chaired the committee",
19
+ "led end-to-end delivery", "directed project milestones", "led a 5-person team",
20
+ "managed stakeholders", "took ownership of the initiative", "led code reviews",
21
+ "organized campus events", "led product demo sessions", "led recruitment for volunteers",
22
+ "managed vendor relationships", "spearheaded the outreach program"
23
+ ],
24
+ "technical_skills": [
25
+ "developed a web API", "implemented RESTful services", "coded in python",
26
+ "built machine learning models", "trained neural networks", "implemented data pipelines",
27
+ "used pandas for ETL", "designed database schemas", "built microservices",
28
+ "deployed models using docker", "worked with FastAPI", "implemented CI/CD",
29
+ "wrote unit tests", "optimized SQL queries", "used scikit-learn",
30
+ "developed recommendation systems", "built feature engineering pipelines",
31
+ "deployed to cloud", "developed ETL jobs", "worked with Kafka",
32
+ "implemented caching layers", "used TensorFlow or PyTorch", "built backend services",
33
+ "wrote production-grade code", "integrated third-party APIs"
34
+ ],
35
+ "problem_solving": [
36
+ "solved complex problem", "debugged production issues", "optimized an algorithm",
37
+ "reduced latency of service", "designed a scalable solution", "investigated root cause",
38
+ "improved system reliability", "created a novel solution", "troubleshot integration issues",
39
+ "automated manual tasks", "reduced memory usage", "resolved data pipeline failures",
40
+ "refactored critical code", "handled edge cases", "iterated on prototypes",
41
+ "performed A/B testing to decide", "diagnosed performance bottlenecks",
42
+ "designed fallback strategies", "resolved deployment failures", "created monitoring & alerts"
43
+ ],
44
+ "internships_experience": [
45
+ "summer internship", "industrial training", "interned at", "worked as an intern",
46
+ "internship project", "internship in data science", "interned at a startup",
47
+ "completed internship at", "interned with the engineering team", "intern experience",
48
+ "interned at an e-commerce company", "industrial internship", "co-op placement",
49
+ "paid internship", "research internship", "interned as a software engineer",
50
+ "on-the-job training", "worked under mentor", "internship-driven project",
51
+ "corporate internship"
52
+ ],
53
+ "communication": [
54
+ "presented to stakeholders", "gave a presentation", "wrote documentation",
55
+ "authored reports", "explained results to non-technical", "public speaking",
56
+ "delivered demo", "prepared slides", "wrote user guides", "communicated with clients",
57
+ "collaborated across teams", "conducted knowledge transfer", "wrote clear emails",
58
+ "explained technical concepts", "presented project outcomes", "led demo sessions",
59
+ "created onboarding docs", "contributed to team discussions", "led workshops",
60
+ "hosted training sessions"
61
+ ],
62
+ "teamwork": [
63
+ "collaborated with team", "worked in a cross-functional team", "paired programming",
64
+ "contributed to group project", "supported teammates", "collaborated on design",
65
+ "worked with designers and PMs", "helped teammates debug", "co-authored project",
66
+ "mentored peers", "shared responsibilities", "worked effectively in group",
67
+ "contributed in agile team", "participated in sprints", "assisted in integration"
68
+ ],
69
+ "project_execution": [
70
+ "delivered project on time", "met project deadlines", "managed milestones",
71
+ "handled project planning", "released production features", "coordinated deployment",
72
+ "delivered MVP", "tracked KPIs", "managed scope", "created project timeline",
73
+ "ran retrospectives", "managed feature rollout", "ensured on-time delivery",
74
+ "performed release validations", "deployed analytics dashboard", "iterated based on feedback"
75
+ ],
76
+ "initiative": [
77
+ "initiated a project", "proposed a new idea", "took initiative", "started a side project",
78
+ "built a proof of concept", "started a campus chapter", "created an automation",
79
+ "improved an existing process", "volunteered to lead", "identified improvement areas",
80
+ "launched a mini-product", "ran a pilot program", "created onboarding scripts",
81
+ "led process improvements", "started a mentoring circle"
82
+ ],
83
+ "learning_agility": [
84
+ "quick learner", "self-taught", "learned new framework", "picked up new language",
85
+ "adapted to new tech", "completed online courses", "upskilled via projects",
86
+ "transitioned domains", "learned on the job", "rapidly onboarded", "attended workshops",
87
+ "completed bootcamp", "took certification courses", "learned through documentation",
88
+ "scaled knowledge quickly", "adapted to changing scope"
89
+ ],
90
+ "career_alignment": [
91
+ "career goal is", "aspire to become", "interested in data science",
92
+ "pursue a role in product", "long-term goal", "want to specialize in",
93
+ "career objective", "planning to pursue masters", "aim to work in industry",
94
+ "seek product management roles", "interested in research", "want to join a startup",
95
+ "targeting roles in ML engineering", "aiming for consulting roles",
96
+ "career path is focused on"
97
+ ]
98
+ }
99
+
100
# Question to aspects mapping
# Maps each free-text question id (text_q1..text_q3) to the aspect keys that
# are scored for that answer in TextModuleV2.score(). Keys must exist in the
# loaded seed dictionary to produce a centroid; missing keys are silently
# dropped when the per-question centroid subsets are built.
# NOTE(review): "leadership" (text_q3) is not among the defaults visible in
# this chunk — presumably defined earlier in DEFAULT_ASPECT_SEEDS; confirm.
QUESTION_ASPECT_MAP = {
    "text_q1": ["technical_skills", "problem_solving", "learning_agility", "initiative", "communication"],
    "text_q2": ["career_alignment", "learning_agility", "initiative", "communication"],
    "text_q3": ["leadership", "teamwork", "project_execution", "internships_experience", "communication"]
}
106
+
107
+
108
class TextModuleV2:
    """Enhanced text scoring using aspect-based prototypes with all-mpnet-base-v2.

    Free-text answers are split into chunks, embedded with a
    SentenceTransformer, and scored by cosine similarity against per-aspect
    centroid vectors built from seed phrases. Centroids are cached on disk
    (.npz) and seeds can be hot-swapped via update_aspect_seeds().
    """

    def __init__(self, model_name: str = None, seeds_path: str = "./aspect_seeds.json",
                 centroids_path: str = "./aspect_centroids.npz"):
        # Config: allow model override via env or param
        self.model_name = model_name or os.getenv('ASPECT_MODEL_NAME', 'all-mpnet-base-v2')
        self.seeds_path = seeds_path  # JSON file: {aspect_key: [seed phrase, ...]}
        self.centroids_path = centroids_path  # .npz cache: {aspect_key: unit vector}

        # Load model (CPU-pinned; encoding below uses convert_to_tensor=False)
        logger.info(f"Loading sentence transformer model: {self.model_name}")
        self.model = SentenceTransformer(self.model_name, device='cpu')

        # Load seeds
        self.aspect_seeds = self._load_seeds()

        # Load or build centroids
        self.centroids = self._load_or_build_centroids()

        logger.info(f"TextModuleV2 initialized with {len(self.aspect_seeds)} aspects")

    def _load_seeds(self) -> Dict[str, List[str]]:
        """Load aspect seeds from JSON or use defaults.

        Any failure (missing file, bad JSON) falls back to
        DEFAULT_ASPECT_SEEDS rather than raising.
        """
        if os.path.exists(self.seeds_path):
            try:
                with open(self.seeds_path, 'r', encoding='utf-8') as f:
                    seeds = json.load(f)
                logger.info(f"Loaded aspect seeds from {self.seeds_path}")
                return seeds
            except Exception as e:
                logger.warning(f"Failed to load seeds from {self.seeds_path}: {e}. Using defaults.")
        # NOTE(review): shallow copy — the inner seed lists are still shared
        # with the module-level default dict; mutating them in place would
        # leak into DEFAULT_ASPECT_SEEDS.
        return DEFAULT_ASPECT_SEEDS.copy()

    def _load_or_build_centroids(self) -> Dict[str, np.ndarray]:
        """Load cached centroids or build from seeds.

        A corrupt/unreadable cache logs a warning and triggers a rebuild
        (which also rewrites the cache file).
        """
        if os.path.exists(self.centroids_path):
            try:
                data = np.load(self.centroids_path)
                centroids = {key: data[key] for key in data.files}
                logger.info(f"Loaded centroids from {self.centroids_path}")
                return centroids
            except Exception as e:
                logger.warning(f"Failed to load centroids: {e}. Rebuilding.")

        return self.build_prototypes(self.aspect_seeds, self.model)

    def build_prototypes(self, aspect_seeds: Dict[str, List[str]],
                         model: SentenceTransformer) -> Dict[str, np.ndarray]:
        """Build centroid prototypes from seed phrases.

        Each centroid is the L2-normalized mean of the seed embeddings.
        The result is persisted to self.centroids_path (best effort —
        a save failure is logged, not raised).
        """
        logger.info("Building aspect centroids...")
        centroids = {}

        for aspect, seeds in aspect_seeds.items():
            if not seeds:
                logger.warning(f"Aspect '{aspect}' has no seeds, skipping")
                continue

            # Encode seeds (CPU, convert_to_tensor=False)
            embeddings = model.encode(seeds, convert_to_tensor=False, show_progress_bar=False)
            embeddings = np.array(embeddings, dtype=np.float32)

            # Compute centroid
            centroid = np.mean(embeddings, axis=0)
            centroid = centroid / np.linalg.norm(centroid)  # Normalize to unit length
            centroids[aspect] = centroid

        # Save centroids
        try:
            np.savez(self.centroids_path, **centroids)
            logger.info(f"Saved centroids to {self.centroids_path}")
        except Exception as e:
            logger.error(f"Failed to save centroids: {e}")

        return centroids

    def score_text_aspects(self, text: str, centroids: Dict[str, np.ndarray],
                           top_k: int = 3) -> Tuple[Dict[str, float], Dict[str, List[str]], float]:
        """
        Score text against aspect centroids
        Returns: (aspect_scores, chunk_assignments, confidence)

        aspect_scores: {aspect: score in [0, 1]}
        chunk_assignments: {aspect: [chunks whose raw cosine sim > 0.3]}
        confidence: overall confidence in [0, 1] from _calculate_aspect_confidence
        Texts shorter than 20 chars yield ({}, {}, 0.0).
        """
        if not text or len(text) < 20:
            return {}, {}, 0.0

        # Split into chunks (sentences or 50-word windows)
        chunks = self._split_text(text)
        if not chunks:
            return {}, {}, 0.0

        # Encode chunks
        chunk_embeddings = self.model.encode(chunks, convert_to_tensor=False, show_progress_bar=False)
        chunk_embeddings = np.array(chunk_embeddings, dtype=np.float32)

        # Score each aspect
        aspect_scores = {}
        chunk_assignments = {aspect: [] for aspect in centroids.keys()}

        for aspect, centroid in centroids.items():
            # Compute cosine similarities (epsilon guards zero-norm vectors)
            sims = np.dot(chunk_embeddings, centroid) / (
                np.linalg.norm(chunk_embeddings, axis=1) * np.linalg.norm(centroid) + 1e-8
            )

            # Scoring formula: 0.6 * max_sim + 0.4 * mean_topk
            max_sim = np.max(sims)
            # partition puts the top-k sims at the tail; the [-top_k:] slice
            # simply takes the whole array when there are fewer than top_k chunks
            topk_sims = np.partition(sims, -min(top_k, len(sims)))[-top_k:]
            mean_topk = np.mean(topk_sims)

            # Map from [-1,1] to [0,1]
            raw_score = 0.6 * max_sim + 0.4 * mean_topk
            normalized_score = (raw_score + 1) / 2
            aspect_scores[aspect] = float(np.clip(normalized_score, 0, 1))

            # Assign chunks with sim > threshold (threshold is on the RAW
            # cosine similarity, not the normalized score)
            threshold = 0.3
            for i, sim in enumerate(sims):
                if sim > threshold:
                    chunk_assignments[aspect].append(chunks[i])

        # Calculate confidence
        confidence = self._calculate_aspect_confidence(text, aspect_scores)

        return aspect_scores, chunk_assignments, confidence

    def _split_text(self, text: str) -> List[str]:
        """Split text into scorable chunks.

        Sentence split first; if fewer than 3 sentences survive the
        20-char minimum, falls back to 50-word windows with 25-word step.
        At most 20 chunks are returned.
        """
        import re
        # Split by sentences
        sentences = re.split(r'[.!?]+', text)
        chunks = [s.strip() for s in sentences if len(s.strip()) > 20]

        # If too few sentences, use sliding window
        if len(chunks) < 3:
            words = text.split()
            window_size = 50
            step = 25
            chunks = []
            # max(1, ...) guarantees at least one window for short texts
            for i in range(0, max(1, len(words) - window_size + 1), step):
                chunk = ' '.join(words[i:i+window_size])
                if len(chunk) > 20:
                    chunks.append(chunk)

        return chunks[:20]  # Limit to 20 chunks

    def _calculate_aspect_confidence(self, text: str, aspect_scores: Dict[str, float]) -> float:
        """Calculate confidence based on text quality and score distribution.

        Blend: 0.4 * length (saturates at 150 words)
             + 0.3 * score spread (higher std across aspects = clearer signal)
             + 0.3 * best aspect score.
        """
        if not aspect_scores:
            return 0.0

        # Text length factor
        word_count = len(text.split())
        length_factor = min(word_count / 150, 1.0)

        # Score variance factor (higher variance = more confident signal)
        scores = list(aspect_scores.values())
        score_std = np.std(scores)
        variance_factor = min(score_std * 2, 1.0)

        # Max score factor
        max_score = max(scores)

        confidence = 0.4 * length_factor + 0.3 * variance_factor + 0.3 * max_score
        return float(np.clip(confidence, 0, 1))

    def score(self, text_responses: Dict[str, str]) -> Tuple[float, float, Dict]:
        """
        Main scoring function - backward compatible interface
        Returns: (score, confidence, features)

        Scores each question against its QUESTION_ASPECT_MAP aspects, then
        aggregates into named features. Missing aspect scores default to 0.3.
        Note: 'writing_quality' and 'internships_experience' are returned in
        features but are NOT part of the weighted text_score below.
        """
        text_q1 = text_responses.get('text_q1', '')
        text_q2 = text_responses.get('text_q2', '')
        text_q3 = text_responses.get('text_q3', '')

        # Score each question with relevant aspects
        q1_aspects = QUESTION_ASPECT_MAP['text_q1']
        q2_aspects = QUESTION_ASPECT_MAP['text_q2']
        q3_aspects = QUESTION_ASPECT_MAP['text_q3']

        # Restrict centroids to the aspects relevant to each question;
        # aspects without a centroid are silently skipped
        q1_centroids = {k: self.centroids[k] for k in q1_aspects if k in self.centroids}
        q2_centroids = {k: self.centroids[k] for k in q2_aspects if k in self.centroids}
        q3_centroids = {k: self.centroids[k] for k in q3_aspects if k in self.centroids}

        q1_scores, _, q1_conf = self.score_text_aspects(text_q1, q1_centroids)
        q2_scores, _, q2_conf = self.score_text_aspects(text_q2, q2_centroids)
        q3_scores, _, q3_conf = self.score_text_aspects(text_q3, q3_centroids)

        # Aggregate features
        features = {}

        # Technical skills from Q1
        features['technical_skills'] = q1_scores.get('technical_skills', 0.3)
        features['problem_solving'] = q1_scores.get('problem_solving', 0.3)

        # Career alignment from Q2
        features['career_alignment'] = q2_scores.get('career_alignment', 0.3)
        features['learning_agility'] = max(
            q1_scores.get('learning_agility', 0.3),
            q2_scores.get('learning_agility', 0.3)
        )

        # Leadership from Q3
        features['leadership_score'] = q3_scores.get('leadership', 0.3)
        features['teamwork'] = q3_scores.get('teamwork', 0.3)
        features['internships_experience'] = q3_scores.get('internships_experience', 0.3)

        # Communication (averaged across all)
        comm_scores = [
            q1_scores.get('communication', 0.3),
            q2_scores.get('communication', 0.3),
            q3_scores.get('communication', 0.3)
        ]
        # NOTE(review): np.mean returns np.float64, not a builtin float —
        # confirm downstream serialization (e.g. jsonify) accepts it
        features['communication'] = np.mean(comm_scores)

        # Writing quality (heuristic)
        features['writing_quality'] = self._assess_writing_quality(text_q1)

        # Content depth
        features['content_depth'] = self._assess_content_depth(text_q1, text_q2, text_q3)

        # Calculate overall score (weighted combination; weights sum to 1.0)
        text_score = (
            features['technical_skills'] * 0.15 +
            features['problem_solving'] * 0.10 +
            features['leadership_score'] * 0.20 +
            features['career_alignment'] * 0.10 +
            features['communication'] * 0.15 +
            features['teamwork'] * 0.10 +
            features['learning_agility'] * 0.10 +
            features['content_depth'] * 0.10
        )

        # Overall confidence
        confidence = np.mean([q1_conf, q2_conf, q3_conf])

        return text_score, confidence, features

    def _assess_writing_quality(self, text: str) -> float:
        """Heuristic writing quality assessment.

        Base 0.5 plus bonuses for word count in the 150-300 sweet spot,
        sentence count, leading capital, and vocabulary diversity.
        Capped at 1.0; texts under 50 chars score a flat 0.2.
        """
        if not text or len(text) < 50:
            return 0.2

        score = 0.5
        word_count = len(text.split())

        if 150 <= word_count <= 300:
            score += 0.3
        elif 100 <= word_count < 150 or 300 < word_count <= 400:
            score += 0.2
        else:
            score += 0.1

        import re
        sentences = re.split(r'[.!?]+', text)
        if len(sentences) >= 5:
            score += 0.1

        if text[0].isupper():
            score += 0.05

        # Vocabulary diversity: unique-word ratio
        words = text.lower().split()
        unique_ratio = len(set(words)) / len(words) if words else 0
        if unique_ratio > 0.6:
            score += 0.05

        return min(score, 1.0)

    def _assess_content_depth(self, text_q1: str, text_q2: str, text_q3: str) -> float:
        """Assess content depth from the combined word count of all answers."""
        total_words = len(text_q1.split()) + len(text_q2.split()) + len(text_q3.split())

        if total_words >= 450:
            return 1.0
        elif total_words >= 300:
            return 0.8
        elif total_words >= 200:
            return 0.6
        elif total_words >= 100:
            return 0.4
        else:
            return 0.2

    def explain(self, features: Dict) -> Dict:
        """Generate explanations.

        Returns {'highlights': [...], 'suggestions': [...]} built from
        fixed 0.7 (highlight) and 0.5 (suggestion) thresholds on features.
        """
        explanations = {
            'highlights': [],
            'suggestions': []
        }

        if features.get('technical_skills', 0) > 0.7:
            explanations['highlights'].append("Strong technical skills demonstrated")

        if features.get('leadership_score', 0) > 0.7:
            explanations['highlights'].append("Clear leadership experience")

        if features.get('career_alignment', 0) > 0.7:
            explanations['highlights'].append("Well-defined career goals")

        if features.get('communication', 0) > 0.7:
            explanations['highlights'].append("Excellent communication skills")

        if features.get('writing_quality', 0) < 0.5:
            explanations['suggestions'].append("Provide more detailed responses (150-300 words each)")

        if features.get('leadership_score', 0) < 0.5:
            explanations['suggestions'].append("Highlight leadership roles with specific examples")

        if features.get('technical_skills', 0) < 0.5:
            explanations['suggestions'].append("Describe technical projects and skills in detail")

        return explanations

    # Admin functions
    def get_aspect_seeds(self) -> Dict[str, List[str]]:
        """Return current loaded seeds (shallow copy of the dict)."""
        return self.aspect_seeds.copy()

    def update_aspect_seeds(self, new_seeds: Dict[str, List[str]],
                            persist: bool = True) -> Dict:
        """
        Update aspect seeds and recompute centroids
        Returns: stats dict

        Raises ValueError on malformed input (non-dict, non-string keys,
        empty or non-string seed lists). Centroids are rebuilt immediately;
        persistence to self.seeds_path is best effort.
        """
        # Validate
        if not isinstance(new_seeds, dict):
            raise ValueError("new_seeds must be a dict")

        for key, seeds in new_seeds.items():
            if not isinstance(key, str):
                raise ValueError(f"Aspect key must be string, got {type(key)}")
            if not isinstance(seeds, list) or not seeds:
                raise ValueError(f"Seeds for '{key}' must be non-empty list")
            if not all(isinstance(s, str) for s in seeds):
                raise ValueError(f"All seeds for '{key}' must be strings")

        # Update seeds
        self.aspect_seeds = new_seeds.copy()

        # Recompute centroids
        logger.info("Recomputing centroids after seed update")
        self.centroids = self.build_prototypes(self.aspect_seeds, self.model)

        # Persist
        if persist:
            try:
                with open(self.seeds_path, 'w', encoding='utf-8') as f:
                    json.dump(new_seeds, f, indent=2, ensure_ascii=False)
                logger.info(f"Persisted new seeds to {self.seeds_path}")
            except Exception as e:
                logger.error(f"Failed to persist seeds: {e}")

        # Stats
        # NOTE(review): avg_seed_count is np.float64 and utcnow() is
        # deprecated since Python 3.12 (prefer datetime.now(timezone.utc)) —
        # confirm before serializing stats to JSON
        stats = {
            "num_aspects": len(new_seeds),
            "avg_seed_count": np.mean([len(seeds) for seeds in new_seeds.values()]),
            "timestamp": datetime.utcnow().isoformat() + 'Z'
        }

        logger.info(f"Aspect seeds updated: {stats}")
        return stats

    def suggest_seed_expansions(self, corpus_texts: List[str], aspect_key: str,
                                top_n: int = 20) -> List[str]:
        """
        Suggest seed expansions from corpus
        Uses TF-IDF + cosine similarity for lightweight extraction

        Extracts 2-5 word n-grams from up to 100 corpus texts, keeps the
        200 most frequent candidates, and returns the top_n most similar
        to the aspect centroid. Unknown aspect_key returns [].
        """
        if aspect_key not in self.centroids:
            return []

        centroid = self.centroids[aspect_key]

        # Extract candidate phrases from corpus
        from collections import Counter
        import re

        candidates = []
        for text in corpus_texts[:100]:  # Limit corpus
            # Extract 2-5 word n-grams
            words = text.lower().split()
            for n in range(2, 6):
                for i in range(len(words) - n + 1):
                    phrase = ' '.join(words[i:i+n])
                    # Drop short phrases and those containing long digit runs
                    if len(phrase) > 10 and not re.search(r'\d{3,}', phrase):
                        candidates.append(phrase)

        # Count frequency
        phrase_counts = Counter(candidates)
        top_candidates = [phrase for phrase, _ in phrase_counts.most_common(200)]

        if not top_candidates:
            return []

        # Encode and rank by similarity
        candidate_embeddings = self.model.encode(top_candidates, convert_to_tensor=False,
                                                 show_progress_bar=False)
        candidate_embeddings = np.array(candidate_embeddings, dtype=np.float32)

        sims = np.dot(candidate_embeddings, centroid) / (
            np.linalg.norm(candidate_embeddings, axis=1) * np.linalg.norm(centroid) + 1e-8
        )

        # Return top_n, most similar first
        top_indices = np.argsort(sims)[-top_n:][::-1]
        suggestions = [top_candidates[i] for i in top_indices]

        return suggestions
515
+
516
+
517
def get_relevant_aspects_for_question(question_id: str) -> List[str]:
    """Return the aspect keys mapped to *question_id*; [] when unmapped."""
    try:
        return QUESTION_ASPECT_MAP[question_id]
    except KeyError:
        return []
520
+
521
+
522
+ # Flask admin blueprint
523
def register_admin_seed_endpoint(app, text_module: TextModuleV2):
    """Register admin endpoints for seed management.

    Adds two routes under /admin:
      GET  /admin/aspect-seeds  -> current aspect seeds
      POST /admin/aspect-seeds  -> replace seeds; body {"seeds": {...}, "persist": bool}

    Both routes require an X-Admin-Token header matching the
    ADMIN_SEED_TOKEN environment variable.
    """
    import hmac  # local import: only needed once this endpoint is registered
    from flask import Blueprint, request, jsonify

    admin_bp = Blueprint('admin_aspects', __name__, url_prefix='/admin')

    def check_admin_token():
        """Return a (response, 401) pair when auth fails, else None."""
        # `or ''` guards against a missing header (None would make
        # compare_digest raise TypeError).
        token = request.headers.get('X-Admin-Token') or ''
        expected = os.getenv('ADMIN_SEED_TOKEN', 'admin-secret-token')
        # Constant-time comparison avoids leaking token prefixes through
        # response-timing differences (the original `!=` short-circuits).
        if not hmac.compare_digest(token, expected):
            return jsonify({'error': 'Unauthorized'}), 401
        return None

    @admin_bp.route('/aspect-seeds', methods=['GET'])
    def get_seeds():
        """Get current aspect seeds"""
        auth_err = check_admin_token()
        if auth_err:
            return auth_err

        seeds = text_module.get_aspect_seeds()
        return jsonify({
            'success': True,
            'seeds': seeds,
            'num_aspects': len(seeds)
        })

    @admin_bp.route('/aspect-seeds', methods=['POST'])
    def update_seeds():
        """Update aspect seeds"""
        auth_err = check_admin_token()
        if auth_err:
            return auth_err

        # get_json(silent=True) yields None for a missing or invalid JSON
        # body instead of raising, so a malformed request reaches the clean
        # 400 below rather than a 500 (request.json could also be None,
        # making the original data.get(...) raise AttributeError).
        data = request.get_json(silent=True) or {}
        new_seeds = data.get('seeds')
        persist = data.get('persist', True)

        if not new_seeds:
            return jsonify({'error': 'Missing seeds field'}), 400

        try:
            stats = text_module.update_aspect_seeds(new_seeds, persist=persist)
            return jsonify({
                'success': True,
                'message': 'Aspect seeds updated successfully',
                'stats': stats
            })
        except Exception as e:
            # ValueError from validation and unexpected rebuild errors both
            # surface as a 400 with the message, matching the original.
            logger.error(f"Failed to update seeds: {e}")
            return jsonify({'error': str(e)}), 400

    app.register_blueprint(admin_bp)
    logger.info("Registered admin aspect-seed endpoints at /admin/aspect-seeds")
services/universal_module.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Universal Module - Academic & Experience Scoring"""
2
+ import numpy as np
3
+ import re
4
+ from typing import Dict, Tuple
5
+
6
class UniversalModule:
    """Scores based on academic performance and experience.

    Produces a weighted score in [0, 1] from academic fields (CGPA, SGPA
    history, 10th/12th percentages) plus keyword heuristics over free-text
    internship / extracurricular / certification descriptions.
    """

    def __init__(self):
        # Relative contribution of each feature to the final score.
        # The weights sum to 1.0 and the key set must match the features
        # dict built in score().
        self.feature_weights = {
            'cgpa_norm': 0.30,
            'sgpa_trend': 0.15,
            'sgpa_consistency': 0.10,
            'marks_consistency': 0.10,
            'academic_improvement': 0.10,
            'internship_exposure': 0.10,
            'ec_quality': 0.08,
            'cert_quality': 0.07
        }

    def score(self, student_data: Dict) -> Tuple[float, float, Dict]:
        """
        Calculate universal score
        Returns: (score, confidence, features_dict)

        student_data keys read: cgpa, sgpa_sem1..sgpa_sem8, tenth_pct,
        twelfth_pct, internship_text, extracurricular_text,
        certifications_text. Missing numeric fields fall back to a neutral
        0.5 feature value; confidence reflects how many fields were usable.
        """
        features = {}

        # CGPA normalization (0-10 scale)
        # NOTE(review): assumes 'cgpa' is numeric when present — a stored
        # None would raise here before the cgpa > 0 guard below; confirm
        # the caller sanitizes input.
        cgpa = student_data.get('cgpa', 0)
        features['cgpa_norm'] = min(cgpa / 10.0, 1.0)

        # SGPA trend (improvement across semesters) - filter out null values
        sgpa_values = []
        for sem_num in range(1, 9):
            sem_val = student_data.get(f'sgpa_sem{sem_num}')
            if sem_val is not None and sem_val > 0:  # Ignore null/zero values
                sgpa_values.append(sem_val)

        if len(sgpa_values) >= 2:
            # Calculate trend from first to last available semester
            trend = (sgpa_values[-1] - sgpa_values[0]) / 10.0  # Normalize
            features['sgpa_trend'] = max(0, min(trend + 0.5, 1.0))  # Center at 0.5
        else:
            features['sgpa_trend'] = 0.5  # Neutral if insufficient data

        # SGPA consistency (lower std = more consistent = better)
        if len(sgpa_values) >= 3:
            std_dev = np.std(sgpa_values)
            features['sgpa_consistency'] = max(0, 1 - (std_dev / 3.0))  # Inverse relationship
        else:
            features['sgpa_consistency'] = 0.5

        # Marks consistency across 10th, 12th, CGPA
        # (truthiness check: a 0 percentage is treated the same as missing)
        tenth = student_data.get('tenth_pct')
        twelfth = student_data.get('twelfth_pct')

        if tenth and twelfth and cgpa:
            cgpa_pct = (cgpa / 10.0) * 100
            marks_std = np.std([tenth, twelfth, cgpa_pct])
            features['marks_consistency'] = max(0, 1 - (marks_std / 30.0))
        else:
            features['marks_consistency'] = 0.5

        # Academic improvement flag: 1.0 for monotone improvement across
        # 10th -> 12th -> degree, 0.7 for partial, 0.3 for none
        if tenth and twelfth and cgpa:
            cgpa_pct = (cgpa / 10.0) * 100
            if cgpa_pct > twelfth and twelfth > tenth:
                features['academic_improvement'] = 1.0
            elif cgpa_pct > twelfth or twelfth > tenth:
                features['academic_improvement'] = 0.7
            else:
                features['academic_improvement'] = 0.3
        else:
            features['academic_improvement'] = 0.5

        # Extract features from text responses (handle None values)
        internship_text = student_data.get('internship_text') or ''
        ec_text = student_data.get('extracurricular_text') or ''
        cert_text = student_data.get('certifications_text') or ''

        # Internship exposure - extract from text
        features['internship_exposure'] = self._assess_internship_quality(internship_text)

        # Extracurricular quality - extract from text
        features['ec_quality'] = self._assess_extracurricular_quality(ec_text)

        # Certification quality - extract from text
        features['cert_quality'] = self._assess_certification_quality(cert_text)

        # Calculate weighted score (features and feature_weights share the
        # same 8 keys, so this covers every weight exactly once)
        score = sum(features[k] * self.feature_weights[k] for k in features.keys())

        # Calculate confidence based on data completeness
        total_fields = 8
        filled_fields = sum([
            1 if cgpa > 0 else 0,
            1 if len(sgpa_values) >= 2 else 0,
            1 if len(sgpa_values) >= 3 else 0,
            1 if tenth and twelfth else 0,
            1 if tenth and twelfth and cgpa else 0,
            1 if len(internship_text) > 20 else 0,
            1 if len(ec_text) > 20 else 0,
            1 if len(cert_text) > 20 else 0
        ])
        confidence = filled_fields / total_fields

        return score, confidence, features

    def explain(self, features: Dict) -> Dict:
        """Generate explanation for scores.

        Returns up to 3 features above 0.6 as positives and up to 3 below
        0.4 as negatives, each with value and human-readable description.
        """
        explanations = {
            'top_positive_features': [],
            'top_negative_features': []
        }

        # Sort features by value
        sorted_features = sorted(features.items(), key=lambda x: x[1], reverse=True)

        # Top 3 positive
        for feat, val in sorted_features[:3]:
            if val > 0.6:
                explanations['top_positive_features'].append({
                    'feature': feat,
                    'value': round(val, 2),
                    'description': self._get_feature_description(feat, val)
                })

        # Top 3 negative
        for feat, val in sorted_features[-3:]:
            if val < 0.4:
                explanations['top_negative_features'].append({
                    'feature': feat,
                    'value': round(val, 2),
                    'description': self._get_feature_description(feat, val)
                })

        return explanations

    def _assess_internship_quality(self, text: str) -> float:
        """Extract internship quality from text.

        Blend: 0.4 * duration signal + 0.4 * quality keywords + 0.2 * detail
        length. Texts under 20 chars score 0.0.
        """
        if not text or len(text) < 20:
            return 0.0

        score = 0.0
        text_lower = text.lower()

        # Duration indicators: (pattern, multiplier). Numeric patterns are
        # scaled by the multiplier then normalized against 6 months;
        # non-numeric ones contribute the multiplier directly.
        duration_patterns = [
            (r'\b(\d+)\s*months?\b', 1.0),
            (r'\b(\d+)\s*weeks?\b', 0.25),  # ~0.25 converts weeks toward months
            (r'summer\s+internship', 0.5),
            (r'year\s+long|full\s+year|annual', 1.0),
        ]

        max_duration_score = 0.0
        for pattern, multiplier in duration_patterns:
            matches = re.findall(pattern, text_lower)
            if matches:
                # startswith on the raw pattern string distinguishes the two
                # capturing numeric patterns from the keyword-only ones
                if pattern.startswith(r'\b(\d+)'):
                    duration = max([int(m) for m in matches]) * multiplier
                    max_duration_score = max(max_duration_score, min(duration / 6.0, 1.0))
                else:
                    max_duration_score = max(max_duration_score, multiplier)

        score += max_duration_score * 0.4

        # Quality indicators
        quality_keywords = ['company', 'startup', 'corporation', 'project', 'developed',
                            'implemented', 'built', 'deployed', 'managed', 'led']
        quality_count = sum(1 for kw in quality_keywords if kw in text_lower)
        score += min(quality_count / len(quality_keywords), 1.0) * 0.4

        # Length indicates detail
        score += min(len(text) / 500, 1.0) * 0.2

        return min(score, 1.0)

    def _assess_extracurricular_quality(self, text: str) -> float:
        """Extract extracurricular quality from text.

        Blend: 0.4 * leadership keywords + 0.4 * activity keywords +
        0.2 * detail length. Texts under 20 chars score 0.0.
        """
        if not text or len(text) < 20:
            return 0.0

        score = 0.0
        text_lower = text.lower()

        # Leadership indicators (substring match; saturates at 3 hits)
        leadership_keywords = ['led', 'organized', 'president', 'captain', 'head',
                               'coordinator', 'managed', 'founded']
        leadership_count = sum(1 for kw in leadership_keywords if kw in text_lower)
        score += min(leadership_count / 3, 1.0) * 0.4

        # Activity types (saturates at 4 hits)
        activity_keywords = ['club', 'society', 'competition', 'hackathon', 'event',
                             'volunteer', 'sports', 'cultural', 'technical']
        activity_count = sum(1 for kw in activity_keywords if kw in text_lower)
        score += min(activity_count / 4, 1.0) * 0.4

        # Detail level
        score += min(len(text) / 400, 1.0) * 0.2

        return min(score, 1.0)

    def _assess_certification_quality(self, text: str) -> float:
        """Extract certification quality from text.

        Blend: 0.4 * reputable-platform keywords + 0.4 * technical-skill
        keywords + 0.2 * detail length. Texts under 20 chars score 0.0.
        """
        if not text or len(text) < 20:
            return 0.0

        score = 0.0
        text_lower = text.lower()

        # Platform indicators (reputable sources; saturates at 3 hits)
        platform_keywords = ['coursera', 'udemy', 'edx', 'linkedin', 'google',
                             'microsoft', 'aws', 'azure', 'ibm', 'oracle']
        platform_count = sum(1 for kw in platform_keywords if kw in text_lower)
        score += min(platform_count / 3, 1.0) * 0.4

        # Technical skills (saturates at 4 hits)
        tech_keywords = ['python', 'java', 'machine learning', 'data science', 'cloud',
                         'programming', 'development', 'database', 'web', 'mobile']
        tech_count = sum(1 for kw in tech_keywords if kw in text_lower)
        score += min(tech_count / 4, 1.0) * 0.4

        # Detail level
        score += min(len(text) / 400, 1.0) * 0.2

        return min(score, 1.0)

    def _get_feature_description(self, feature: str, value: float) -> str:
        """Get human-readable description of feature.

        Unknown feature keys fall back to the raw key name.
        """
        descriptions = {
            'cgpa_norm': f"CGPA performance: {value*10:.1f}/10",
            'sgpa_trend': "Strong upward trend in semester grades" if value > 0.6 else "Declining semester grades",
            'sgpa_consistency': "Very consistent semester performance" if value > 0.7 else "Inconsistent semester performance",
            'marks_consistency': "Consistent performance across academics" if value > 0.7 else "Variable academic performance",
            'academic_improvement': "Clear improvement over time" if value > 0.7 else "Limited academic growth",
            'internship_exposure': "Strong internship experience" if value > 0.6 else "Limited internship exposure",
            'ec_quality': "Excellent extracurricular involvement" if value > 0.6 else "Limited extracurricular activities",
            'cert_quality': "Strong certification portfolio" if value > 0.6 else "Few professional certifications"
        }
        return descriptions.get(feature, feature)