Spaces:

akshit7093
/

Student_Analyzer

Sleeping

App Files Files Community

Student_Analyzer / rag_system.py

joker7094

optimize HF storage with caching and switch to gemini-2.5-flash

c133164 2 months ago

raw

history blame contribute delete

25.3 kB

	# rag_system.py - Enhanced for deeper analysis

	from langchain_google_genai import ChatGoogleGenerativeAI
	from langchain_core.output_parsers import JsonOutputParser
	from prompts import REPORT_PROMPT, QA_PROMPT, StudentReport, RESUME_TAILORING_PROMPT
	import json
	from langchain_core.prompts import PromptTemplate
	import os
	import re
	import logging
	from youtube_search_tool import YouTubeSearchTool
	from job_scraper import JobApplicationAnalyzer
	from dashboard_analyzer import get_dashboard_metrics

	# Module-level logger; the name is hard-coded to 'rag_system' rather than derived from __name__.
	logger = logging.getLogger('rag_system')
	# Relative path of the JSON dataset opened by StudentApiRAG.__init__ (resolved against the current working directory).
	DATA_PATH = "final_cleaned_student_data.json"

	class StudentApiRAG:
	    """Answer questions and build reports about students using Gemini LLMs.

	    The whole student dataset is loaded into memory once at construction time
	    and every public method looks a student up by enrollment number.

	    Attributes:
	        llm: Chat model used for free-form answers, resumes and topic discovery.
	        structured_llm: Lower-temperature chat model used where JSON is parsed.
	        youtube_tool: Helper used to search (or fall back to canned) videos.
	        job_analyzer: Helper that compares a profile with a job posting.
	        student_data: Loaded dataset; accessed as a mapping keyed by
	            enrollment number.
	        topic_categories: Category name -> list of known topic names, used by
	            ``_determine_topic_category``.
	    """

	    def __init__(self):
	        """Create the LLM clients, helper tools and load the dataset.

	        Raises:
	            ValueError: If the ``GOOGLE_API_KEY`` environment variable is unset.
	        """
	        print("🚀 Initializing Enhanced RAG System with Deep Analysis...")
	        api_key = os.getenv("GOOGLE_API_KEY")
	        if not api_key:
	            raise ValueError("GOOGLE_API_KEY environment variable not set!")

	        # Primary model: a slightly higher temperature for detailed, nuanced
	        # prose. Both clients currently point at the same flash model.
	        self.llm = ChatGoogleGenerativeAI(
	            model="models/gemini-2.5-flash",
	            google_api_key=api_key,
	            temperature=0.4,
	            top_p=0.95,
	            top_k=40,
	        )

	        # Secondary model: lower temperature for a consistent JSON structure.
	        self.structured_llm = ChatGoogleGenerativeAI(
	            model="models/gemini-2.5-flash",
	            google_api_key=api_key,
	            temperature=0.2,
	            top_p=0.9,
	        )

	        self.youtube_tool = YouTubeSearchTool()

	        print("📚 Loading student data into memory...")
	        with open(DATA_PATH, 'r', encoding='utf-8') as f:
	            self.student_data = json.load(f)
	        print(f"✅ Loaded data for {len(self.student_data)} students.")
	        print("🎯 Enhanced analysis engine ready for comprehensive reports!")

	        self.job_analyzer = JobApplicationAnalyzer()

	        # Dict order matters: categories are scanned in this order and the
	        # first match wins in _determine_topic_category.
	        self.topic_categories = {
	            "DSA": [
	                "Arrays", "Strings", "Linked Lists", "Stacks", "Queues",
	                "Trees", "Graphs", "Heaps", "Hashing", "Binary Search",
	                "Dynamic Programming", "Greedy Algorithms", "Backtracking",
	                "Bit Manipulation", "Math", "Sorting", "Searching", "AIDS303", "AIDS353",
	            ],
	            "Web Development": [
	                "HTML", "CSS", "JavaScript", "React", "Angular", "Vue",
	                "Node.js", "Express", "Django", "Flask", "REST APIs",
	                "TypeScript", "Webpack", "Babel", "CSS Frameworks",
	            ],
	            "Programming Languages": [
	                "Python", "Java", "C++", "C#", "JavaScript", "TypeScript",
	                "Go", "Rust", "Ruby", "PHP", "Swift", "Kotlin",
	            ],
	            "Computer Science Fundamentals": [
	                "Operating Systems", "Computer Networks", "Database Systems",
	                "Compilers", "Computer Architecture", "Distributed Systems",
	                "Artificial Intelligence", "Machine Learning", "Data Science",
	                "Cloud Computing", "Cybersecurity",
	            ],
	        }

	    def _determine_sources_from_query(self, query: str) -> list:
	        """Pick which profile sections are relevant to a free-text question.

	        Matching is a case-insensitive substring test, so e.g. "leetcode" also
	        triggers the "code" keyword of the GitHub rule.

	        Args:
	            query: The user's question.

	        Returns:
	            Source names in rule order without duplicates, or the default
	            ``["academic_profile", "coding_profiles", "resume"]`` when no
	            keyword matches.
	        """
	        query = query.lower()
	        rules = (
	            (
	                ("dsa", "problem solving", "coding", "leetcode", "codeforces",
	                 "resume", "cv", "skills", "video", "youtube", "tutorial"),
	                ("leetcode", "codeforces", "resume"),
	            ),
	            (
	                ("project", "experience", "github", "code", "repository"),
	                ("github",),
	            ),
	            (
	                ("academic", "grade", "gpa", "cgpa", "subject", "marks", "semester"),
	                ("academic_profile",),
	            ),
	        )

	        sources = []
	        for keywords, mapped_sources in rules:
	            if any(keyword in query for keyword in keywords):
	                sources.extend(mapped_sources)

	        if not sources:
	            return ["academic_profile", "coding_profiles", "resume"]
	        # dict.fromkeys de-duplicates while keeping a deterministic order
	        # (list(set(...)) returned an arbitrary order).
	        return list(dict.fromkeys(sources))

	    def _identify_learning_topics(self, student_report: dict) -> list:
	        """Have the AI identify specific topic areas where the student needs improvement.

	        Args:
	            student_report: Report dict; ``analysis.strengths``,
	                ``analysis.weaknesses`` and ``detailed_scores`` are read.

	        Returns:
	            Up to six ``{"topic", "reason", "category"}`` dicts. Falls back to
	            ``_get_default_topics()`` whenever the LLM call fails or yields
	            nothing usable.
	        """
	        print(" 🎯 Identifying personalized learning topics...")

	        analysis = student_report.get("analysis") or {}
	        weaknesses = analysis.get("weaknesses") or []
	        strengths = analysis.get("strengths") or []

	        # Neutral mid-scale defaults, used when the report has no matching score.
	        dev_orientation_score = 5
	        dsa_orientation_score = 5

	        for score in student_report.get("detailed_scores") or []:
	            # Skip malformed entries instead of raising KeyError/TypeError.
	            if not isinstance(score, dict) or "score" not in score:
	                continue
	            parameter = str(score.get("parameter") or "")
	            if "Development" in parameter or "Project" in parameter:
	                dev_orientation_score = score["score"]
	            if "DSA" in parameter or "Problem" in parameter:
	                dsa_orientation_score = score["score"]

	        # Doubled braces are literal braces for PromptTemplate's f-string format.
	        prompt_template = """
	        As an expert learning advisor, analyze this student's profile and identify 4-6 specific,
	        actionable learning topics where they need the most improvement.

	        Student Profile Analysis:
	        - DSA Proficiency: {dsa_orientation_score}/10
	        - Development Skills: {dev_orientation_score}/10
	        - Key Strengths: {strengths}
	        - Areas for Growth: {weaknesses}

	        For each recommended topic:
	        1. Choose a SPECIFIC, searchable topic (e.g., "Dynamic Programming Patterns", "React Hooks", "System Design Basics")
	        2. Explain WHY this is critical for the student's growth (50-100 words)
	        3. Ensure the topic has quality YouTube content available
	        4. Prioritize high-impact areas that address their weaknesses

	        Return ONLY valid JSON in this EXACT format:
	        [
	        {{
	        "topic": "Binary Search and Two Pointers",
	        "reason": "Your LeetCode profile shows only 15% accuracy on searching problems. These patterns are fundamental building blocks appearing in 30% of technical interviews. Mastering binary search variants and two-pointer techniques will unlock solutions to 50+ common problem types and significantly improve your problem-solving speed."
	        }},
	        {{
	        "topic": "React State Management with Redux",
	        "reason": "Your projects show basic React knowledge but lack complex state management. As applications scale, Redux becomes essential. Learning this now will make your projects production-ready and is a must-have skill for 70% of frontend positions at product companies."
	        }}
	        ]

	        Requirements:
	        - Return 4-6 topics
	        - Use double quotes for all JSON keys and values
	        - Each reason should be 50-100 words
	        - Topics must be specific and searchable
	        - No extra text before or after the JSON array
	        """

	        try:
	            chain = PromptTemplate(
	                template=prompt_template,
	                input_variables=["dsa_orientation_score", "dev_orientation_score", "strengths", "weaknesses"],
	            ) | self.llm

	            response = chain.invoke({
	                "dsa_orientation_score": dsa_orientation_score,
	                "dev_orientation_score": dev_orientation_score,
	                "strengths": ', '.join(map(str, strengths[:3])) if strengths else 'None specifically identified',
	                "weaknesses": ', '.join(map(str, weaknesses[:3])) if weaknesses else 'None specifically identified',
	            })

	            valid_topics = self._parse_learning_topics(response.content)
	        except Exception as e:
	            # Best effort: any LLM or parsing failure degrades to defaults.
	            logger.error("Error identifying topics: %s", e)
	            return self._get_default_topics()

	        if valid_topics is None:
	            # The helper has already logged the reason.
	            return self._get_default_topics()

	        if not valid_topics:
	            logger.warning("No valid topics identified, using defaults")
	            return self._get_default_topics()

	        print(f" ✅ Identified {len(valid_topics)} personalized learning topics")
	        return valid_topics

	    def _parse_learning_topics(self, response_text: str):
	        """Extract and validate the topic list from raw LLM output.

	        Args:
	            response_text: Text that should contain one JSON array of
	                ``{"topic": ..., "reason": ...}`` objects.

	        Returns:
	            A list (possibly empty) of validated topic dicts with a
	            ``category`` key added, or ``None`` when no JSON list could be
	            parsed at all.
	        """
	        # Use the outermost [...] span so surrounding chatter is ignored.
	        json_start = response_text.find('[')
	        json_end = response_text.rfind(']') + 1

	        if json_start == -1 or json_end == 0:
	            logger.error("No JSON array found in topic identification")
	            return None

	        json_text = response_text[json_start:json_end]
	        json_text = json_text.replace('\n', ' ').replace('\r', '')

	        try:
	            topics_data = json.loads(json_text)
	        except json.JSONDecodeError as e:
	            logger.error("JSON parsing error: %s", e)
	            # Common failure: single quotes used instead of double quotes.
	            try:
	                topics_data = json.loads(json_text.replace("'", '"'))
	            except json.JSONDecodeError:
	                return None

	        if not isinstance(topics_data, list):
	            logger.error("Topic data is not a list")
	            return None

	        valid_topics = []
	        for item in topics_data[:6]:  # Max 6 topics
	            # One malformed element must not discard the valid ones.
	            if not isinstance(item, dict):
	                continue
	            topic = str(item.get("topic") or "").strip()
	            reason = str(item.get("reason") or "").strip()

	            if topic and len(reason) > 30:  # Ensure substantial reason
	                valid_topics.append({
	                    "topic": topic,
	                    "reason": reason,
	                    "category": self._determine_topic_category(topic),
	                })
	        return valid_topics

	    def generate_tailored_resume(self, enrollment_no: str, job_description: str) -> str:
	        """Generate a tailored resume in Markdown format for a job description.

	        Args:
	            enrollment_no: Key of the student in the loaded dataset.
	            job_description: Text of the job posting to tailor against.

	        Returns:
	            The LLM's response text, or a string starting with ``"Error"``
	            when the student is unknown or the LLM call fails.
	        """
	        print(f"📄 Generating tailored resume for {enrollment_no}...")

	        student_profile = self.student_data.get(enrollment_no)
	        if not student_profile:
	            return "Error: Student profile not found."

	        # The profile is embedded in the prompt as pretty-printed JSON.
	        prompt = RESUME_TAILORING_PROMPT.format(
	            student_profile=json.dumps(student_profile, indent=2),
	            job_description=job_description,
	        )

	        try:
	            response = self.llm.invoke(prompt)
	            return response.content
	        except Exception as e:
	            logger.error("Error generating resume: %s", e)
	            return f"Error generating resume: {str(e)}"

	    def _determine_topic_category(self, topic: str) -> str:
	        """Determine the most appropriate category for a topic.

	        Resolution order: exact (case-insensitive) match against the known
	        topics, then substring match in either direction, then keyword
	        heuristics, then ``"Computer Science Fundamentals"``.

	        Args:
	            topic: Free-text topic name.

	        Returns:
	            A category name.
	        """
	        topic_lower = topic.strip().lower()
	        if not topic_lower:
	            # An empty string is a substring of everything; do not let it
	            # match the first category by accident.
	            return "Computer Science Fundamentals"

	        # Exact matches first, so "Java" is not captured by "JavaScript"
	        # (Web Development) or "Go" by "Greedy Algorithms" (DSA).
	        for category, topics in self.topic_categories.items():
	            if any(topic_lower == known.lower() for known in topics):
	                return category

	        for category, topics in self.topic_categories.items():
	            for predefined_topic in topics:
	                known_lower = predefined_topic.lower()
	                if known_lower in topic_lower or topic_lower in known_lower:
	                    return category

	        # Fallback categorization
	        if any(kw in topic_lower for kw in ["algorithm", "data structure", "dsa", "binary", "dynamic", "greedy", "tree", "graph", "array", "string"]):
	            return "DSA"
	        if any(kw in topic_lower for kw in ["web", "react", "angular", "vue", "node", "express", "api", "html", "css", "javascript"]):
	            return "Web Development"
	        if any(kw in topic_lower for kw in ["python", "java", "c++", "c#", "javascript", "go", "rust"]):
	            return "Programming Languages"

	        return "Computer Science Fundamentals"

	    def _get_default_topics(self) -> list:
	        """Return the canned learning topics used when personalization fails.

	        Returns:
	            Three ``{"topic", "reason", "category"}`` dicts.
	        """
	        return [
	            {
	                "topic": "Dynamic Programming Fundamentals",
	                "reason": "Dynamic Programming is a critical problem-solving technique that appears in 25-30% of technical interviews at top companies. It's essential for optimization problems and demonstrates strong algorithmic thinking. Mastering DP patterns will significantly improve your problem-solving arsenal and interview success rate.",
	                "category": "DSA",
	            },
	            {
	                "topic": "Binary Search Variations",
	                "reason": "Binary search is a fundamental algorithm that extends beyond simple array searching. Understanding its variations (rotated arrays, finding boundaries, search in unknown size arrays) unlocks solutions to 40+ LeetCode problems and is frequently tested in interviews. This skill demonstrates strong understanding of time complexity optimization.",
	                "category": "DSA",
	            },
	            {
	                "topic": "React Advanced Patterns",
	                "reason": "Moving beyond basics to advanced React patterns (custom hooks, context API, render props, compound components) is crucial for building scalable applications. These patterns are used in production codebases at major companies and demonstrate senior-level frontend skills that command premium salaries.",
	                "category": "Web Development",
	            },
	        ]

	    @staticmethod
	    def _build_learning_module(topic_info: dict, videos) -> dict:
	        """Combine a topic description and raw video records into one module.

	        Args:
	            topic_info: Dict with ``topic``, ``reason`` and ``category`` keys.
	            videos: Iterable of dicts with ``title``, ``url``, ``embed_url``
	                and ``description`` keys.

	        Returns:
	            ``{"topic", "reason", "category", "videos"}`` where each video's
	            ``description`` is exposed under the key ``reason``.

	        Raises:
	            KeyError: If a required key is missing.
	        """
	        return {
	            "topic": topic_info["topic"],
	            "reason": topic_info["reason"],
	            "category": topic_info["category"],
	            "videos": [
	                {
	                    "title": video["title"],
	                    "url": video["url"],
	                    "embed_url": video["embed_url"],
	                    "reason": video["description"],
	                }
	                for video in videos
	            ],
	        }

	    def _get_youtube_recommendations(self, student_report: dict) -> list:
	        """Generate YouTube video recommendations for the student's weak areas.

	        Args:
	            student_report: Report dict passed on to ``_identify_learning_topics``.

	        Returns:
	            One learning module per identified topic; topics whose live search
	            fails use the tool's fallback videos instead.
	        """
	        print(" 📺 Generating personalized YouTube recommendations...")

	        learning_topics = self._identify_learning_topics(student_report)
	        topic_recommendations = []

	        for topic_info in learning_topics:
	            topic = topic_info["topic"]
	            category = topic_info["category"]

	            print(f" 🔍 Searching videos for: '{topic}' ({category})")

	            try:
	                youtube_videos = self.youtube_tool.run({
	                    "query": topic,
	                    "max_results": 5,
	                    "topic_category": category,
	                })
	                module = self._build_learning_module(topic_info, youtube_videos)
	                topic_recommendations.append(module)
	                print(f" ✅ Found {len(module['videos'])} high-quality videos")
	            except Exception as e:
	                print(f" ⚠️ Error fetching videos for '{topic}': {e}")
	                fallback_videos = self.youtube_tool._get_fallback_videos(topic, 5, category)
	                topic_recommendations.append(
	                    self._build_learning_module(topic_info, fallback_videos)
	                )

	        print(f" ✅ Generated {len(topic_recommendations)} comprehensive learning modules")
	        return topic_recommendations

	    @staticmethod
	    def _extract_cgpa_trend(student_profile: dict):
	        """Build chart data from ``academic_profile.semester_performance``.

	        Args:
	            student_profile: One student's record.

	        Returns:
	            ``{"labels": ["Sem <n>", ...], "values": [<sgpa>, ...]}`` or
	            ``None`` when there is no semester data.

	        Raises:
	            KeyError: If a semester entry lacks ``semester`` or ``sgpa``.
	        """
	        academic_profile = student_profile.get("academic_profile") or {}
	        semester_performance = academic_profile.get("semester_performance") or []
	        if not semester_performance:
	            return None
	        return {
	            "labels": [f"Sem {sem['semester']}" for sem in semester_performance],
	            "values": [sem['sgpa'] for sem in semester_performance],
	        }

	    def _build_error_report(self) -> dict:
	        """Return the placeholder report used when report generation fails."""
	        return {
	            "error": "Failed to generate report",
	            "overall_summary": "Report generation encountered an error. Please try again.",
	            "executive_summary": "Error generating analysis.",
	            "detailed_scores": [],
	            "analysis": {
	                "strengths": ["System error occurred"],
	                "weaknesses": ["Unable to analyze due to technical issue"],
	                "hidden_talents": [],
	            },
	            "actionable_advice": {
	                "recommendations": [{
	                    "title": "System Error",
	                    "description": "Please try generating the report again or contact support.",
	                    "priority": "HIGH",
	                    "estimated_time": "N/A",
	                    "expected_impact": "N/A",
	                    "mermaid_flowchart": "",
	                }],
	            },
	            "resume_analysis": {
	                "summary": "Analysis unavailable",
	                "key_skills": [],
	                "professional_links": [],
	                "missing_elements": [],
	                "ats_score": 0,
	                "improvement_suggestions": [],
	            },
	            "skills": [],
	            "learning_path": [],
	            "career_insights": {
	                "current_trajectory": "Analysis unavailable",
	                "potential_roles": [],
	                "salary_range": "N/A",
	                "competitive_advantage": "N/A",
	                "market_positioning": "N/A",
	            },
	            "youtube_recommendations": self._get_default_topic_recommendations(),
	        }

	    def generate_structured_report(self, enrollment_no: str) -> dict:
	        """Generate the full structured report for one student.

	        Args:
	            enrollment_no: Key of the student in the loaded dataset.

	        Returns:
	            The parsed LLM report extended with ``cgpa_trend`` and
	            ``youtube_recommendations``; ``{"error": ...}`` for an unknown
	            student; or a placeholder report (also carrying ``error``) when
	            generation fails.
	        """
	        print(f"\n{'='*80}")
	        print(f"🎓 GENERATING COMPREHENSIVE REPORT FOR: {enrollment_no}")
	        print(f"{'='*80}\n")

	        student_profile = self.student_data.get(enrollment_no)
	        if not student_profile:
	            return {"error": "No data found for this student."}

	        context = json.dumps(student_profile, indent=2)

	        # The structured (low-temperature) model feeds a JSON output parser.
	        parser = JsonOutputParser(pydantic_object=StudentReport)
	        prompt_with_format = REPORT_PROMPT.partial(
	            format_instructions=parser.get_format_instructions()
	        )

	        chain = prompt_with_format | self.structured_llm | parser

	        try:
	            print("🤖 AI analyzing student profile comprehensively...")
	            report_dict = chain.invoke({"context": context})

	            # The CGPA trend comes from the raw data, not from the LLM.
	            try:
	                cgpa_trend = self._extract_cgpa_trend(student_profile)
	                if cgpa_trend:
	                    print(f" 📊 Injected CGPA trend: {len(cgpa_trend['values'])} semesters")
	            except Exception as e:
	                print(f" ⚠️ CGPA trend extraction failed: {e}")
	                cgpa_trend = None
	            report_dict["cgpa_trend"] = cgpa_trend

	            # Video recommendations are optional extras; failure is not fatal.
	            try:
	                print("\n📹 Generating personalized learning resources...")
	                youtube_recommendations = self._get_youtube_recommendations(report_dict)
	                report_dict["youtube_recommendations"] = youtube_recommendations
	                print(f" ✅ Added {len(youtube_recommendations)} curated learning modules")
	            except Exception as e:
	                print(f" ⚠️ Video recommendations failed: {e}")
	                report_dict["youtube_recommendations"] = self._get_default_topic_recommendations()

	            print(f"\n{'='*80}")
	            print("✅ COMPREHENSIVE REPORT GENERATION COMPLETE!")
	            print(f"{'='*80}\n")

	            return report_dict

	        except Exception as e:
	            logger.error("Report generation error: %s", e, exc_info=True)
	            print(f"\n❌ ERROR: {e}\n")
	            return self._build_error_report()

	    def _get_default_topic_recommendations(self) -> list:
	        """Return learning modules for the default topics using fallback videos."""
	        topic_recommendations = []
	        for topic_info in self._get_default_topics():
	            fallback_videos = self.youtube_tool._get_fallback_videos(
	                topic_info["topic"], 5, topic_info["category"]
	            )
	            topic_recommendations.append(
	                self._build_learning_module(topic_info, fallback_videos)
	            )
	        return topic_recommendations

	    def analyze_job_application(self, job_application_link: str, enrollment_no: str) -> dict:
	        """Analyze a student's profile against a job posting.

	        Args:
	            job_application_link: Link handed to the job analyzer.
	            enrollment_no: Key of the student in the loaded dataset.

	        Returns:
	            Whatever ``self.job_analyzer.analyze`` returns, or an error-shaped
	            dict with empty sections when the student is unknown.
	        """
	        print(f"\n🎯 Starting comprehensive job analysis...")
	        print(f" Student: {enrollment_no}")
	        print(f" Job Link: {job_application_link}\n")

	        student_profile = self.student_data.get(enrollment_no)
	        if not student_profile:
	            logger.error("Student not found: %s", enrollment_no)
	            return {
	                "error": "Student data not found",
	                "strategic_overview": {
	                    "summary": "Error: Student data unavailable",
	                    "your_key_opportunity": "Please verify enrollment number",
	                },
	                "your_core_strengths_for_this_role": [],
	                "strategic_areas_for_growth": [],
	                "video_recommendations": [],
	            }

	        analysis_result = self.job_analyzer.analyze(job_application_link, student_profile)
	        print("✅ Job analysis complete!\n")

	        return analysis_result

	    def get_student_dashboard_metrics(self, enrollment_no: str) -> dict:
	        """Compute dashboard metrics for one student.

	        Args:
	            enrollment_no: Key of the student in the loaded dataset.

	        Returns:
	            The result of ``get_dashboard_metrics`` for the profile, or
	            ``{"error": "Student data not found"}``.
	        """
	        print(f"📊 Calculating comprehensive metrics for: {enrollment_no}")

	        student_profile = self.student_data.get(enrollment_no)
	        if not student_profile:
	            logger.error("Student not found: %s", enrollment_no)
	            return {"error": "Student data not found"}

	        metrics = get_dashboard_metrics(student_profile)
	        print("✅ Dashboard metrics calculated\n")

	        return metrics

	    def answer_question(self, query: str, enrollment_no: str) -> str:
	        """Answer a free-text question about one student.

	        Only the profile sections selected by
	        ``_determine_sources_from_query`` are sent to the LLM. Exceptions
	        raised by the LLM call are not caught here.

	        Args:
	            query: The user's question.
	            enrollment_no: Key of the student in the loaded dataset.

	        Returns:
	            The LLM's answer text, or a "❌ ..." message when the student or
	            relevant data cannot be found.
	        """
	        print(f"\n💬 Answering question for {enrollment_no}")
	        print(f" Query: {query}\n")

	        student_profile = self.student_data.get(enrollment_no)
	        if not student_profile:
	            return "❌ Could not find data for the selected student."

	        sources_to_use = self._determine_sources_from_query(query)
	        print(f" 📂 Using data sources: {', '.join(sources_to_use)}")

	        # Build targeted context
	        targeted_context = {}
	        if "academic_profile" in sources_to_use:
	            targeted_context["academic_profile"] = student_profile.get("academic_profile")

	        # `or {}` also covers a key that is present but null in the data.
	        all_coding_profiles = student_profile.get("coding_profiles") or {}
	        coding_profiles = {
	            platform: all_coding_profiles.get(platform)
	            for platform in ("leetcode", "github", "codeforces")
	            if platform in sources_to_use
	        }

	        if coding_profiles:
	            targeted_context["coding_profiles"] = coding_profiles
	        elif "coding_profiles" in sources_to_use:
	            # No specific platform requested: include the whole section.
	            targeted_context["coding_profiles"] = student_profile.get("coding_profiles")
	        if "resume" in sources_to_use:
	            targeted_context["resume"] = student_profile.get("resume")

	        if not targeted_context:
	            return "❌ Could not find relevant information in the student's profile to answer that question."

	        context_str = json.dumps(targeted_context, indent=2)
	        chain = QA_PROMPT | self.llm
	        result = chain.invoke({"context": context_str, "question": query})

	        print(" ✅ Response generated\n")
	        return result.content

	    def get_all_students_summary(self) -> list:
	        """Return a summary entry for every student in the dataset.

	        Returns:
	            One dict per student with ``enrollment_no``, ``name`` (default
	            ``"Unknown"``), ``cgpa`` (default ``"N/A"``) and ``key_skills``
	            (at most the first three technical skills).
	        """
	        summaries = []
	        for enrollment_no, data in self.student_data.items():
	            personal_info = data.get("personal_info") or {}
	            # NOTE(review): other methods read "academic_profile"; confirm
	            # that "academic_performance" is the intended key here.
	            academic = data.get("academic_performance") or {}
	            technical_skills = (data.get("skills") or {}).get("technical_skills") or []
	            summaries.append({
	                "enrollment_no": enrollment_no,
	                "name": personal_info.get("name", "Unknown"),
	                "cgpa": academic.get("current_cgpa", "N/A"),
	                "key_skills": technical_skills[:3],  # Top 3 skills
	            })
	        return summaries