Spaces:
Sleeping
Sleeping
File size: 25,328 Bytes
0583f91 8098153 f4552a1 0583f91 8098153 f4552a1 8098153 f4552a1 0583f91 8098153 f4552a1 8098153 0583f91 8098153 0583f91 8098153 c133164 8098153 0583f91 8098153 0583f91 c133164 0583f91 f4552a1 0583f91 8098153 0583f91 f4552a1 8098153 f4552a1 8098153 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 8098153 0583f91 8098153 0583f91 8098153 0583f91 8098153 0583f91 8098153 0583f91 8098153 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 8098153 0583f91 8098153 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 0583f91 f4552a1 8098153 0583f91 8098153 0583f91 8098153 0583f91 8098153 0583f91 8098153 0583f91 8098153 0583f91 8098153 f4552a1 8098153 0583f91 8098153 0583f91 f4552a1 0583f91 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 |
# rag_system.py - Enhanced for deeper analysis
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import JsonOutputParser
from prompts import REPORT_PROMPT, QA_PROMPT, StudentReport, RESUME_TAILORING_PROMPT
import json
from langchain_core.prompts import PromptTemplate
import os
import re
import logging
from youtube_search_tool import YouTubeSearchTool
from job_scraper import JobApplicationAnalyzer
from dashboard_analyzer import get_dashboard_metrics
# Module-level logger; the explicit name 'rag_system' mirrors this file's name.
logger = logging.getLogger('rag_system')
# JSON file that StudentApiRAG.__init__ loads into memory. The path is relative,
# so it is resolved against the process's current working directory.
DATA_PATH = "final_cleaned_student_data.json"
class StudentApiRAG:
    """LLM-backed analysis service over an in-memory student dataset.

    The instance loads every student profile from ``DATA_PATH`` once and then
    serves reports, Q&A, resume tailoring, job-fit analysis and dashboard
    metrics by looking profiles up by enrollment number.
    """

    # Keyword groups used by _determine_sources_from_query (substring match
    # against the lower-cased query).
    _CODING_KEYWORDS = (
        "dsa", "problem solving", "coding", "leetcode", "codeforces",
        "resume", "cv", "skills", "video", "youtube", "tutorial",
    )
    _GITHUB_KEYWORDS = ("project", "experience", "github", "code", "repository")
    _ACADEMIC_KEYWORDS = (
        "academic", "grade", "gpa", "cgpa", "subject", "marks", "semester",
    )
    _DEFAULT_SOURCES = ("academic_profile", "coding_profiles", "resume")

    # Category returned when no predefined topic or keyword matches.
    _FALLBACK_CATEGORY = "Computer Science Fundamentals"

    # Prompt for _identify_learning_topics. Kept flush-left so no indentation
    # whitespace leaks into the text sent to the model; doubled braces are
    # PromptTemplate escapes for literal JSON braces.
    _TOPIC_PROMPT_TEMPLATE = """
As an expert learning advisor, analyze this student's profile and identify 4-6 specific,
actionable learning topics where they need the most improvement.
Student Profile Analysis:
- DSA Proficiency: {dsa_orientation_score}/10
- Development Skills: {dev_orientation_score}/10
- Key Strengths: {strengths}
- Areas for Growth: {weaknesses}
For each recommended topic:
1. Choose a SPECIFIC, searchable topic (e.g., "Dynamic Programming Patterns", "React Hooks", "System Design Basics")
2. Explain WHY this is critical for the student's growth (50-100 words)
3. Ensure the topic has quality YouTube content available
4. Prioritize high-impact areas that address their weaknesses
Return ONLY valid JSON in this EXACT format:
[
{{
"topic": "Binary Search and Two Pointers",
"reason": "Your LeetCode profile shows only 15% accuracy on searching problems. These patterns are fundamental building blocks appearing in 30% of technical interviews. Mastering binary search variants and two-pointer techniques will unlock solutions to 50+ common problem types and significantly improve your problem-solving speed."
}},
{{
"topic": "React State Management with Redux",
"reason": "Your projects show basic React knowledge but lack complex state management. As applications scale, Redux becomes essential. Learning this now will make your projects production-ready and is a must-have skill for 70% of frontend positions at product companies."
}}
]
Requirements:
- Return 4-6 topics
- Use double quotes for all JSON keys and values
- Each reason should be 50-100 words
- Topics must be specific and searchable
- No extra text before or after the JSON array
"""

    def __init__(self):
        """Create the LLM clients and helper tools and load the dataset.

        Raises:
            ValueError: If the ``GOOGLE_API_KEY`` environment variable is not
                set. Errors from opening or parsing ``DATA_PATH`` propagate
                unchanged.
        """
        print("🚀 Initializing Enhanced RAG System with Deep Analysis...")
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY environment variable not set!")

        # Primary LLM: higher temperature for detailed free-form answers.
        self.llm = ChatGoogleGenerativeAI(
            model="models/gemini-2.5-flash",
            google_api_key=api_key,
            temperature=0.4,
            top_p=0.95,
            top_k=40,
        )
        # Secondary LLM: lower temperature for consistent JSON structure.
        self.structured_llm = ChatGoogleGenerativeAI(
            model="models/gemini-2.5-flash",
            google_api_key=api_key,
            temperature=0.2,
            top_p=0.9,
        )
        self.youtube_tool = YouTubeSearchTool()

        print("📚 Loading student data into memory...")
        with open(DATA_PATH, 'r', encoding='utf-8') as f:
            self.student_data = json.load(f)
        print(f"✅ Loaded data for {len(self.student_data)} students.")
        print("🎯 Enhanced analysis engine ready for comprehensive reports!")

        self.job_analyzer = JobApplicationAnalyzer()
        # Category -> known topic names; consulted by _determine_topic_category.
        self.topic_categories = {
            "DSA": [
                "Arrays", "Strings", "Linked Lists", "Stacks", "Queues",
                "Trees", "Graphs", "Heaps", "Hashing", "Binary Search",
                "Dynamic Programming", "Greedy Algorithms", "Backtracking",
                "Bit Manipulation", "Math", "Sorting", "Searching",
                "AIDS303", "AIDS353",
            ],
            "Web Development": [
                "HTML", "CSS", "JavaScript", "React", "Angular", "Vue",
                "Node.js", "Express", "Django", "Flask", "REST APIs",
                "TypeScript", "Webpack", "Babel", "CSS Frameworks",
            ],
            "Programming Languages": [
                "Python", "Java", "C++", "C#", "JavaScript", "TypeScript",
                "Go", "Rust", "Ruby", "PHP", "Swift", "Kotlin",
            ],
            "Computer Science Fundamentals": [
                "Operating Systems", "Computer Networks", "Database Systems",
                "Compilers", "Computer Architecture", "Distributed Systems",
                "Artificial Intelligence", "Machine Learning", "Data Science",
                "Cloud Computing", "Cybersecurity",
            ],
        }

    def _determine_sources_from_query(self, query: str) -> list:
        """Pick the profile sections relevant to a question.

        Matching is by plain substring on the lower-cased query, so e.g.
        "leetcode" also triggers the "code" keyword of the GitHub group.

        Returns:
            Source names in a stable order, or the default trio
            (academic profile, coding profiles, resume) when nothing matches.
        """
        lowered = query.lower()
        sources = []
        if any(keyword in lowered for keyword in self._CODING_KEYWORDS):
            sources.extend(["leetcode", "codeforces", "resume"])
        if any(keyword in lowered for keyword in self._GITHUB_KEYWORDS):
            sources.append("github")
        if any(keyword in lowered for keyword in self._ACADEMIC_KEYWORDS):
            sources.append("academic_profile")
        if not sources:
            return list(self._DEFAULT_SOURCES)
        # De-duplicate while keeping insertion order (a set would make the
        # printed order vary between runs).
        return list(dict.fromkeys(sources))

    @staticmethod
    def _extract_orientation_scores(student_report: dict) -> tuple:
        """Return ``(dsa_score, dev_score)`` read from ``detailed_scores``.

        Both default to 5. Entries that are not dicts or lack a string
        ``parameter`` are ignored instead of raising.
        """
        dsa_score = 5
        dev_score = 5
        for entry in student_report.get("detailed_scores") or []:
            if not isinstance(entry, dict):
                continue
            parameter = entry.get("parameter")
            if not isinstance(parameter, str):
                continue
            if "Development" in parameter or "Project" in parameter:
                dev_score = entry.get("score", dev_score)
            if "DSA" in parameter or "Problem" in parameter:
                dsa_score = entry.get("score", dsa_score)
        return dsa_score, dev_score

    def _parse_topics_response(self, response_text: str) -> list:
        """Turn the model's reply into validated topic dicts.

        Returns the default topics when no JSON array can be recovered or no
        entry passes validation.
        """
        json_start = response_text.find('[')
        json_end = response_text.rfind(']') + 1
        if json_start == -1 or json_end <= json_start:
            logger.error("No JSON array found in topic identification")
            return self._get_default_topics()

        json_text = response_text[json_start:json_end]
        json_text = json_text.replace('\n', ' ').replace('\r', '')
        try:
            topics_data = json.loads(json_text)
        except json.JSONDecodeError as e:
            logger.error("JSON parsing error: %s", e)
            # Last resort for single-quoted pseudo-JSON. This also rewrites
            # apostrophes inside text, so it is only tried after a failure.
            try:
                topics_data = json.loads(json_text.replace("'", '"'))
            except json.JSONDecodeError:
                return self._get_default_topics()

        if not isinstance(topics_data, list):
            logger.error("Topic data is not a list")
            return self._get_default_topics()

        valid_topics = []
        for item in topics_data[:6]:  # Max 6 topics
            if not isinstance(item, dict):
                continue
            topic = item.get("topic")
            reason = item.get("reason")
            if not isinstance(topic, str) or not isinstance(reason, str):
                continue
            topic = topic.strip()
            reason = reason.strip()
            # Require a substantial reason, not a one-liner.
            if topic and len(reason) > 30:
                valid_topics.append({
                    "topic": topic,
                    "reason": reason,
                    "category": self._determine_topic_category(topic),
                })

        if not valid_topics:
            logger.warning("No valid topics identified, using defaults")
            return self._get_default_topics()
        return valid_topics

    def _identify_learning_topics(self, student_report: dict) -> list:
        """Have the AI identify topic areas where the student needs improvement.

        Args:
            student_report: Report dict; ``analysis.strengths``,
                ``analysis.weaknesses`` and ``detailed_scores`` are read.

        Returns:
            A list of ``{"topic", "reason", "category"}`` dicts. Any failure
            (LLM error, unparsable reply) yields the default topics.
        """
        print(" 🎯 Identifying personalized learning topics...")
        analysis = student_report.get("analysis") or {}
        weaknesses = analysis.get("weaknesses") or []
        strengths = analysis.get("strengths") or []
        dsa_orientation_score, dev_orientation_score = (
            self._extract_orientation_scores(student_report)
        )
        placeholder = 'None specifically identified'

        try:
            chain = PromptTemplate(
                template=self._TOPIC_PROMPT_TEMPLATE,
                input_variables=[
                    "dsa_orientation_score", "dev_orientation_score",
                    "strengths", "weaknesses",
                ],
            ) | self.llm
            response = chain.invoke({
                "dsa_orientation_score": dsa_orientation_score,
                "dev_orientation_score": dev_orientation_score,
                "strengths": ', '.join(map(str, strengths[:3])) if strengths else placeholder,
                "weaknesses": ', '.join(map(str, weaknesses[:3])) if weaknesses else placeholder,
            })
            topics = self._parse_topics_response(response.content)
            print(f" ✅ Identified {len(topics)} personalized learning topics")
            return topics
        except Exception as e:
            # Best-effort feature: never let topic discovery break a report.
            logger.error("Error identifying topics: %s", e)
            return self._get_default_topics()

    def generate_tailored_resume(self, enrollment_no: str, job_description: str) -> str:
        """Generate a tailored resume in Markdown for the job description.

        Returns:
            The LLM's text, or a string starting with ``"Error"`` when the
            student is unknown or the LLM call fails.
        """
        print(f"📝 Generating tailored resume for {enrollment_no}...")
        student_profile = self.student_data.get(enrollment_no)
        if not student_profile:
            return "Error: Student profile not found."

        # The profile dict is embedded in the prompt as pretty-printed JSON.
        prompt = RESUME_TAILORING_PROMPT.format(
            student_profile=json.dumps(student_profile, indent=2),
            job_description=job_description,
        )
        try:
            response = self.llm.invoke(prompt)
            return response.content
        except Exception as e:
            logger.error("Error generating resume: %s", e)
            return f"Error generating resume: {str(e)}"

    def _determine_topic_category(self, topic: str) -> str:
        """Determine the most appropriate category for a topic.

        A predefined topic matches when either string contains the other
        (case-insensitive); otherwise keyword lists are tried. Matching is by
        substring, so short names such as "Go" can match inside longer words.
        """
        topic_lower = topic.lower().strip()
        if not topic_lower:
            # "" is a substring of everything and would always yield the
            # first category; treat it as uncategorised instead.
            return self._FALLBACK_CATEGORY

        for category, topics in self.topic_categories.items():
            for predefined_topic in topics:
                predefined_lower = predefined_topic.lower()
                if predefined_lower in topic_lower or topic_lower in predefined_lower:
                    return category

        # Fallback categorization
        if any(kw in topic_lower for kw in ["algorithm", "data structure", "dsa", "binary", "dynamic", "greedy", "tree", "graph", "array", "string"]):
            return "DSA"
        if any(kw in topic_lower for kw in ["web", "react", "angular", "vue", "node", "express", "api", "html", "css", "javascript"]):
            return "Web Development"
        if any(kw in topic_lower for kw in ["python", "java", "c++", "c#", "javascript", "go", "rust"]):
            return "Programming Languages"
        return self._FALLBACK_CATEGORY

    def _get_default_topics(self) -> list:
        """Return default topics with detailed reasons."""
        return [
            {
                "topic": "Dynamic Programming Fundamentals",
                "reason": "Dynamic Programming is a critical problem-solving technique that appears in 25-30% of technical interviews at top companies. It's essential for optimization problems and demonstrates strong algorithmic thinking. Mastering DP patterns will significantly improve your problem-solving arsenal and interview success rate.",
                "category": "DSA",
            },
            {
                "topic": "Binary Search Variations",
                "reason": "Binary search is a fundamental algorithm that extends beyond simple array searching. Understanding its variations (rotated arrays, finding boundaries, search in unknown size arrays) unlocks solutions to 40+ LeetCode problems and is frequently tested in interviews. This skill demonstrates strong understanding of time complexity optimization.",
                "category": "DSA",
            },
            {
                "topic": "React Advanced Patterns",
                "reason": "Moving beyond basics to advanced React patterns (custom hooks, context API, render props, compound components) is crucial for building scalable applications. These patterns are used in production codebases at major companies and demonstrate senior-level frontend skills that command premium salaries.",
                "category": "Web Development",
            },
        ]

    @staticmethod
    def _format_videos(videos) -> list:
        """Project raw video dicts onto the shape used in reports.

        Assumes each item has ``title``, ``url``, ``embed_url`` and
        ``description`` keys, as the YouTube tool is used here - confirm
        against YouTubeSearchTool.
        """
        return [
            {
                "title": video["title"],
                "url": video["url"],
                "embed_url": video["embed_url"],
                "reason": video["description"],
            }
            for video in videos
        ]

    @staticmethod
    def _build_topic_module(topic_info: dict, videos: list) -> dict:
        """Combine a topic entry with its videos into one learning module."""
        return {
            "topic": topic_info["topic"],
            "reason": topic_info["reason"],
            "category": topic_info["category"],
            "videos": videos,
        }

    def _get_youtube_recommendations(self, student_report: dict) -> list:
        """Generate YouTube video recommendations per learning topic.

        A failed search for one topic falls back to the tool's fallback
        videos for that topic; the topic still appears exactly once.
        """
        print(" 📺 Generating personalized YouTube recommendations...")
        learning_topics = self._identify_learning_topics(student_report)
        topic_recommendations = []
        for topic_info in learning_topics:
            topic = topic_info["topic"]
            category = topic_info["category"]
            print(f" 🔍 Searching videos for: '{topic}' ({category})")
            try:
                found = self.youtube_tool.run({
                    "query": topic,
                    "max_results": 5,
                    "topic_category": category,
                })
                topic_videos = self._format_videos(found)
            except Exception as e:
                print(f" ⚠️ Error fetching videos for '{topic}': {e}")
                topic_videos = self._format_videos(
                    self.youtube_tool._get_fallback_videos(topic, 5, category)
                )
            else:
                print(f" ✅ Found {len(topic_videos)} high-quality videos")
            # Appended outside the try so a topic can never be added twice.
            topic_recommendations.append(
                self._build_topic_module(topic_info, topic_videos)
            )
        print(f" ✅ Generated {len(topic_recommendations)} comprehensive learning modules")
        return topic_recommendations

    @staticmethod
    def _build_cgpa_trend(student_profile: dict):
        """Return ``{"labels", "values"}`` built from semester SGPAs, or None.

        None is returned when there is no semester data or it is malformed.
        """
        try:
            semester_performance = student_profile.get("academic_profile", {}).get("semester_performance", [])
            if not semester_performance:
                return None
            trend = {
                "labels": [f"Sem {sem['semester']}" for sem in semester_performance],
                "values": [sem['sgpa'] for sem in semester_performance],
            }
            print(f" 📈 Injected CGPA trend: {len(trend['values'])} semesters")
            return trend
        except Exception as e:
            print(f" ⚠️ CGPA trend extraction failed: {e}")
            return None

    def _build_error_report(self) -> dict:
        """Return the placeholder report used when generation fails."""
        return {
            "error": "Failed to generate report",
            "overall_summary": "Report generation encountered an error. Please try again.",
            "executive_summary": "Error generating analysis.",
            "detailed_scores": [],
            "analysis": {
                "strengths": ["System error occurred"],
                "weaknesses": ["Unable to analyze due to technical issue"],
                "hidden_talents": [],
            },
            "actionable_advice": {
                "recommendations": [{
                    "title": "System Error",
                    "description": "Please try generating the report again or contact support.",
                    "priority": "HIGH",
                    "estimated_time": "N/A",
                    "expected_impact": "N/A",
                    "mermaid_flowchart": "",
                }],
            },
            "resume_analysis": {
                "summary": "Analysis unavailable",
                "key_skills": [],
                "professional_links": [],
                "missing_elements": [],
                "ats_score": 0,
                "improvement_suggestions": [],
            },
            "skills": [],
            "learning_path": [],
            "career_insights": {
                "current_trajectory": "Analysis unavailable",
                "potential_roles": [],
                "salary_range": "N/A",
                "competitive_advantage": "N/A",
                "market_positioning": "N/A",
            },
            "youtube_recommendations": self._get_default_topic_recommendations(),
        }

    def generate_structured_report(self, enrollment_no: str) -> dict:
        """Generate the structured student report.

        Returns:
            The parsed report with ``cgpa_trend`` and
            ``youtube_recommendations`` added; ``{"error": ...}`` when the
            student is unknown; or a placeholder report (with an ``error``
            key) when generation fails.
        """
        print(f"\n{'='*80}")
        print(f"📊 GENERATING COMPREHENSIVE REPORT FOR: {enrollment_no}")
        print(f"{'='*80}\n")
        student_profile = self.student_data.get(enrollment_no)
        if not student_profile:
            return {"error": "No data found for this student."}

        context = json.dumps(student_profile, indent=2)
        # The low-temperature LLM is used so the JSON parser sees stable output.
        parser = JsonOutputParser(pydantic_object=StudentReport)
        prompt_with_format = REPORT_PROMPT.partial(
            format_instructions=parser.get_format_instructions()
        )
        chain = prompt_with_format | self.structured_llm | parser

        try:
            print("🤖 AI analyzing student profile comprehensively...")
            report_dict = chain.invoke({"context": context})
            report_dict["cgpa_trend"] = self._build_cgpa_trend(student_profile)

            # Video recommendations are optional; fall back to defaults.
            try:
                print("\n📹 Generating personalized learning resources...")
                youtube_recommendations = self._get_youtube_recommendations(report_dict)
                report_dict["youtube_recommendations"] = youtube_recommendations
                print(f" ✅ Added {len(youtube_recommendations)} curated learning modules")
            except Exception as e:
                print(f" ⚠️ Video recommendations failed: {e}")
                report_dict["youtube_recommendations"] = self._get_default_topic_recommendations()

            print(f"\n{'='*80}")
            print("✅ COMPREHENSIVE REPORT GENERATION COMPLETE!")
            print(f"{'='*80}\n")
            return report_dict
        except Exception as e:
            logger.error("Report generation error: %s", e, exc_info=True)
            print(f"\n❌ ERROR: {e}\n")
            return self._build_error_report()

    def _get_default_topic_recommendations(self) -> list:
        """Return the default topics, each paired with fallback videos."""
        return [
            self._build_topic_module(
                topic_info,
                self._format_videos(
                    self.youtube_tool._get_fallback_videos(
                        topic_info["topic"], 5, topic_info["category"]
                    )
                ),
            )
            for topic_info in self._get_default_topics()
        ]

    def analyze_job_application(self, job_application_link: str, enrollment_no: str) -> dict:
        """Analyze a student profile against a job posting.

        Returns:
            The job analyzer's result, or an error-shaped dict with empty
            sections when the student is unknown.
        """
        print(f"\n🎯 Starting comprehensive job analysis...")
        print(f" Student: {enrollment_no}")
        print(f" Job Link: {job_application_link}\n")
        student_profile = self.student_data.get(enrollment_no)
        if not student_profile:
            logger.error("Student not found: %s", enrollment_no)
            return {
                "error": "Student data not found",
                "strategic_overview": {
                    "summary": "Error: Student data unavailable",
                    "your_key_opportunity": "Please verify enrollment number",
                },
                "your_core_strengths_for_this_role": [],
                "strategic_areas_for_growth": [],
                "video_recommendations": [],
            }
        analysis_result = self.job_analyzer.analyze(job_application_link, student_profile)
        print("✅ Job analysis complete!\n")
        return analysis_result

    def get_student_dashboard_metrics(self, enrollment_no: str) -> dict:
        """Return dashboard metrics for a student, or ``{"error": ...}``."""
        print(f"📊 Calculating comprehensive metrics for: {enrollment_no}")
        student_profile = self.student_data.get(enrollment_no)
        if not student_profile:
            logger.error("Student not found: %s", enrollment_no)
            return {"error": "Student data not found"}
        metrics = get_dashboard_metrics(student_profile)
        print("✅ Dashboard metrics calculated\n")
        return metrics

    def answer_question(self, query: str, enrollment_no: str) -> str:
        """Answer a free-form question about one student.

        Only the profile sections chosen by ``_determine_sources_from_query``
        are sent to the LLM. Errors from the LLM call propagate to the caller.

        Returns:
            The LLM's answer, or a message starting with "❌" when the student
            or any relevant data is missing.
        """
        print(f"\n💬 Answering question for {enrollment_no}")
        print(f" Query: {query}\n")
        student_profile = self.student_data.get(enrollment_no)
        if not student_profile:
            return "❌ Could not find data for the selected student."

        sources_to_use = self._determine_sources_from_query(query)
        print(f" 📂 Using data sources: {', '.join(sources_to_use)}")

        # Build targeted context
        targeted_context = {}
        if "academic_profile" in sources_to_use:
            targeted_context["academic_profile"] = student_profile.get("academic_profile")

        # `or {}` tolerates a profile whose coding_profiles value is None.
        all_coding = student_profile.get("coding_profiles") or {}
        coding_profiles = {
            platform: all_coding.get(platform)
            for platform in ("leetcode", "github", "codeforces")
            if platform in sources_to_use
        }
        if coding_profiles:
            targeted_context["coding_profiles"] = coding_profiles
        elif "coding_profiles" in sources_to_use:
            # Default source list: include every coding profile as-is.
            targeted_context["coding_profiles"] = student_profile.get("coding_profiles")

        if "resume" in sources_to_use:
            targeted_context["resume"] = student_profile.get("resume")

        if not targeted_context:
            return "❌ Could not find relevant information in the student's profile to answer that question."

        context_str = json.dumps(targeted_context, indent=2)
        chain = QA_PROMPT | self.llm
        result = chain.invoke({"context": context_str, "question": query})
        print(" ✅ Response generated\n")
        return result.content

    def get_all_students_summary(self) -> list:
        """Return one summary dict per student.

        Each entry has ``enrollment_no``, ``name``, ``cgpa`` and up to three
        ``key_skills``; missing or None sections fall back to "Unknown",
        "N/A" and an empty list.
        """
        summaries = []
        for enrollment_no, data in self.student_data.items():
            personal_info = data.get("personal_info") or {}
            # NOTE(review): other methods read "academic_profile"; confirm
            # that "academic_performance" is the intended key here.
            academics = data.get("academic_performance") or {}
            skills = data.get("skills") or {}
            summaries.append({
                "enrollment_no": enrollment_no,
                "name": personal_info.get("name", "Unknown"),
                "cgpa": academics.get("current_cgpa", "N/A"),
                "key_skills": (skills.get("technical_skills") or [])[:3],  # Top 3 skills
            })
        return summaries