Spaces:

akshit7093
/

Student_Analyzer

Sleeping

File size: 17,235 Bytes

# data_aggregator.py (Complete Version with Resume Parsing)

import json
import os
import re
import time
from datetime import datetime
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('data_aggregator')

# Import the scraper functions and classes
from github_scraper import get_github_profile
from codeforces_scraper import get_codeforces_profile
from leetcode_scraper import get_leetcode_profile
from ipu_scraper import StudentScraper
# Import our resume parser
from resume_parser import parse_resume

# --- Configuration ---
# Define the list of students with resume paths
STUDENTS_TO_FETCH = [
    {
        "enrollment_no": "35214811922",
        "leetcode_user": "akshitsharma7093",
        "github_user": "akshit7093",
        "codeforces_user": "akshit7093",
        "resume_path": "resume.pdf"  # REQUIRED FIELD
    },
    {
        "enrollment_no": "35314811922",
        "leetcode_user": "Nikita_06211",
        "github_user": "Nikita06211",
        "codeforces_user": "Nikita06211",
        "resume_path": "Nikita_Bansal.pdf"
    },
    {
        "enrollment_no": "05414811922",
        "leetcode_user": "Vineet_Goyal10",
        "github_user": "Vineetg2003",
        "codeforces_user": "Nikita06211",
        "resume_path": "Vineet_Goyal_Resume (4).pdf"
    }

    # Add more student dictionaries here
]


OUTPUT_FILE = 'final_cleaned_student_data.json'

# --- Advanced Cleaning and Filtering Functions ---

def clean_ipu_data(raw_data):
    """Transforms raw IPU academic data into a final, clean format."""
    if not raw_data or raw_data.get("status") != "success":
        logger.warning("IPU data is empty or failed")
        return None

    overall = raw_data["academic_summary"]["overall_performance"]
    programme = raw_data["programme_info"]

    cleaned = {
        "institute": programme.get("institute", {}).get("insti_name"),
        "degree": programme.get("course", {}).get("course_name"),
        "branch": programme.get("branch", {}).get("branch_name"),
        "overall_cgpa": round(overall.get("cgpa", 0), 2),
        "overall_percentage": round(overall.get("percentage", 0), 2),
        "semester_performance": []
    }

    for sem_result in raw_data["academic_summary"]["semester_results"]:
        if sem_result.get("sgpa", 0) > 0:
            sem_data = {
                "semester": sem_result.get("result_no"),
                "sgpa": round(sem_result.get("sgpa", 0), 2), # Rounding SGPA
                "percentage": round(sem_result.get("percentage", 0), 2),
                "subjects": [
                    {
                        "subject": sub.get("subject_name"),
                        "grade": sub.get("grade"),
                        "marks": sub.get("total_marks")
                    }
                    for sub in sem_result.get("subject_results", [])
                ]
            }
            cleaned["semester_performance"].append(sem_data)
    
    return cleaned

def clean_leetcode_data(raw_data):
    """Cleans and filters LeetCode data, summarizing top skills."""
    if not raw_data:
        logger.warning("LeetCode data is empty")
        return None
        
    # Flatten all skills into a single list to find the absolute top skills
    all_skills = []
    for category in ["skillsAdvanced", "skillsIntermediate", "skillsFundamental"]:
        if raw_data.get(category):
            all_skills.extend(raw_data[category])
            
    # Sort by problems solved and take the top 15
    top_skills_sorted = sorted(all_skills, key=lambda x: x.get("problemsSolved", 0), reverse=True)
    top_skills_summary = [{"skill": s.get("tagName"), "solved": s.get("problemsSolved")} for s in top_skills_sorted[:15]]

    return {
        "username": raw_data.get("username"),
        "ranking": raw_data.get("ranking"),
        "totalSolved": raw_data.get("totalSolved"),
        "acceptanceRate": raw_data.get("acceptanceRate"),
        "problemsByDifficulty": raw_data.get("problemsSolvedByDifficulty"),
        "primaryLanguage": raw_data.get("languageStats", [{}])[0],
        "topSkillsSummary": top_skills_summary, # New summarized field
        "activity": {
            "currentStreak": raw_data.get("currentStreak"),
            "totalActiveDays": raw_data.get("totalActiveDays"),
        },
        "recentSubmissions": [
            {
                "title": sub.get("title"),
                "timestamp": datetime.fromtimestamp(int(sub.get("timestamp"))).strftime('%Y-%m-%d')
            }
            for sub in raw_data.get("recentAcSubmissions", [])
        ]
    }

def clean_github_data(raw_data):
    """Summarizes GitHub data, cleans README, and fixes pinned repo logic."""
    if not raw_data:
        logger.warning("GitHub data is empty")
        return None
        
    def summarize_repo(repo):
        # Create a dictionary only with non-null values
        summary = {k: v for k, v in {
            "name": repo.get("name"),
            "description": repo.get("description"),
            "language": repo.get("language"),
            "stars": repo.get("stargazers_count"),
            "forks": repo.get("forks_count"),
            "last_pushed": repo.get("pushed_at", "")[:10] # Truncate to date
        }.items() if v is not None}
        return summary

    def summarize_pinned_repo(repo):
        # Pinned repo scraper uses a different key for the name ('repo')
        summary = {k: v for k, v in {
            "name": repo.get("repo", "").strip(), # Clean whitespace
            "description": repo.get("description"),
            "language": repo.get("language"),
            "stars": int(repo.get("stars", 0)),
            "forks": repo.get("forks")
        }.items() if v is not None and v != ''}
        return summary

    # Clean the README by removing HTML/Markdown tags, image links, etc.
    readme_text = raw_data.get("user_readme", "")
    # Remove HTML tags
    readme_text = re.sub(r'<[^>]+>', '', readme_text)
    # Remove Markdown images and badges
    readme_text = re.sub(r'!\[[^\]]*\]\([^\)]*\)', '', readme_text)
    # Remove standalone links but keep link text
    readme_text = re.sub(r'\[([^\]]+)\]\([^\)]*\)', r'\1', readme_text)
    # Clean up excessive newlines
    readme_text = re.sub(r'\n\s*\n', '\n', readme_text).strip()

    return {
        "username": raw_data.get("login"),
        "name": raw_data.get("name"),
        "bio": raw_data.get("bio", "").strip() if raw_data.get("bio") else None,
        "stats": {
            "public_repos": raw_data.get("public_repos"),
            "followers": raw_data.get("followers"),
            "following": raw_data.get("following")
        },
        "cleaned_profile_readme": readme_text,
        "pinned_repositories": [summarize_pinned_repo(r) for r in raw_data.get("pinned_repos", [])],
        "top_repositories": [summarize_repo(r) for r in raw_data.get("repos", [])]
    }

def clean_codeforces_data(raw_data):
    """Cleans Codeforces data, focusing on performance and simplifying contest history."""
    if not raw_data:
        logger.warning("Codeforces data is empty")
        return None
        
    profile = raw_data.get("profile", {})
    
    cleaned_contests = []
    for contest in raw_data.get("contests", []):
        cleaned_contests.append({
            "contestName": contest.get("contestName"),
            "rank": contest.get("rank"),
            "oldRating": contest.get("oldRating"),
            "newRating": contest.get("newRating"),
            "ratingChange": contest.get("newRating", 0) - contest.get("oldRating", 0)
        })

    return {
        "username": profile.get("handle"),
        "rating": profile.get("rating"),
        "maxRating": profile.get("maxRating"),
        "rank": profile.get("rank"),
        "maxRank": profile.get("maxRank"),
        "contest_history": cleaned_contests,
        "problem_solving_stats": raw_data.get("solved_stats"),
        "submissions": [
            {
                "problem_name": sub.get("problem", {}).get("name"),
                "problem_tags": sub.get("problem", {}).get("tags"),
                "problem_rating": sub.get("problem", {}).get("rating"),
                "language": sub.get("programmingLanguage"),
                "verdict": sub.get("verdict")
            }
            for sub in raw_data.get("submissions", [])
        ]
    }

def clean_resume_data(raw_resume_data):
    """Processes raw resume data into final structured format"""
    if not raw_resume_data:
        logger.warning("Resume data is empty")
        return None
    
    # Extract only professional hyperlinks (filter out common non-professional links)
    professional_links = [
        url for url in raw_resume_data["hyperlinks"]
        if not re.search(r'(facebook|instagram|twitter|linkedin\.com\/in\/[^\/]+\/(detail|overlay)|youtube)', url, re.I)
    ]
    
    # Extract skills from resume text (simplified approach)
    skills = []
    skill_keywords = ['python', 'java', 'javascript', 'react', 'node', 'angular', 'vue', 'sql', 
                     'mongodb', 'aws', 'docker', 'kubernetes', 'git', 'c++', 'c#', 'typescript',
                     'html', 'css', 'spring', 'django', 'flask', 'tensorflow', 'pytorch', 'dsa',
                     'data structures', 'algorithms', 'problem solving', 'full stack', 'backend',
                     'frontend', 'mobile', 'android', 'ios', 'flutter', 'react native']
    
    resume_text = raw_resume_data["full_text"].lower()
    for keyword in skill_keywords:
        if keyword in resume_text and keyword not in skills:
            skills.append(keyword.capitalize())
    
    # Identify missing elements (simplified approach)
    missing_elements = []
    if 'projects' not in resume_text and 'project' not in resume_text:
        missing_elements.append("Projects section")
    if 'internship' not in resume_text and 'experience' not in resume_text and 'work' not in resume_text:
        missing_elements.append("Work experience")
    if 'education' not in resume_text and 'degree' not in resume_text:
        missing_elements.append("Education details")
    if len(skills) < 3:
        missing_elements.append("Technical skills listing")
    
    # Clean summary text (remove excessive whitespace and special characters)
    cleaned_summary = re.sub(r'\s{2,}', ' ', raw_resume_data["summary"])
    cleaned_summary = re.sub(r'[^\w\s.,;:!?()\-]', '', cleaned_summary)
    
    return {
        "full_text": raw_resume_data["full_text"],
        "full_text_preview": raw_resume_data["full_text"][:500] + "..." if len(raw_resume_data["full_text"]) > 500 else raw_resume_data["full_text"],
        "professional_links": professional_links,
        "skills_summary": cleaned_summary,
        "key_skills": skills,
        "total_hyperlinks": len(raw_resume_data["hyperlinks"]),
        "professional_link_count": len(professional_links),
        "missing_elements": missing_elements
    }

# --- Main Execution Logic ---

def main():
    """Main function to fetch, clean, aggregate, and save student data."""
    ipu_scraper = StudentScraper(encryption_key="Qm9sRG9OYVphcmEK")
    all_student_data = {}

    # Load existing data if output file exists
    if os.path.exists(OUTPUT_FILE):
        try:
            with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
                all_student_data = json.load(f)
            logger.info(f"Loaded existing data for {len(all_student_data)} student(s) from '{OUTPUT_FILE}'.")
        except Exception as e:
            logger.warning(f"Could not load existing data: {e}. Starting fresh.")
            all_student_data = {}
    else:
        logger.info(f"No existing output file found. Starting fresh.")

    # Get set of already processed enrollment numbers
    existing_enrollments = set(all_student_data.keys())

    # Filter STUDENTS_TO_FETCH to only include unprocessed enrollments
    students_to_process = [
        student for student in STUDENTS_TO_FETCH
        if student.get("enrollment_no") not in existing_enrollments
    ]

    if not students_to_process:
        logger.info("✅ No new students to process. All enrollments already exist.")
        return

    logger.info(f"Starting data aggregation for {len(students_to_process)} new student(s)...")

    for student in students_to_process:
        enrollment_no = student.get("enrollment_no")
        if not enrollment_no:
            logger.warning("Skipping entry due to missing enrollment number.")
            continue

        logger.info(f"\nProcessing data for Enrollment No: {enrollment_no}")

        student_record = {
            "name": None,
            "enrollment_no": enrollment_no,
            "academic_profile": None,
            "coding_profiles": {
                "leetcode": None,
                "github": None,
                "codeforces": None,
            },
            "resume": None,
            "errors": {}
        }

        # Fetch, Clean, and Assign Data
        try:
            logger.info("  - Processing IPU data...")
            raw_ipu_data = ipu_scraper.get_student_data(enrollment_no)
            student_record["academic_profile"] = clean_ipu_data(raw_ipu_data)
            if student_record["academic_profile"]:
                student_record["name"] = raw_ipu_data.get("student_info", {}).get("name")
                logger.info("    > IPU data processed successfully.")
            else:
                raise Exception("Failed to process IPU data.")
        except Exception as e:
            student_record["errors"]["ipu"] = str(e)
            logger.error(f"    > IPU processing FAILED: {e}")

        if student.get("leetcode_user"):
            try:
                logger.info(f"  - Processing LeetCode data for '{student['leetcode_user']}'...")
                raw_leetcode_result = get_leetcode_profile(student["leetcode_user"])
                if raw_leetcode_result.get("success"):
                    student_record["coding_profiles"]["leetcode"] = clean_leetcode_data(raw_leetcode_result["data"])
                    logger.info("    > LeetCode data processed successfully.")
                else:
                    raise Exception(raw_leetcode_result.get("error", "Unknown error"))
            except Exception as e:
                student_record["errors"]["leetcode"] = str(e)
                logger.error(f"    > LeetCode processing FAILED: {e}")

        if student.get("github_user"):
            try:
                logger.info(f"  - Processing GitHub data for '{student['github_user']}'...")
                raw_github_result = get_github_profile(student["github_user"])
                if raw_github_result.get("success"):
                    student_record["coding_profiles"]["github"] = clean_github_data(raw_github_result["data"])
                    logger.info("    > GitHub data processed successfully.")
                else:
                    raise Exception(raw_github_result.get("error", "Unknown error"))
            except Exception as e:
                student_record["errors"]["github"] = str(e)
                logger.error(f"    > GitHub processing FAILED: {e}")

        if student.get("codeforces_user"):
            try:
                logger.info(f"  - Processing Codeforces data for '{student['codeforces_user']}'...")
                raw_codeforces_result = get_codeforces_profile(student["codeforces_user"])
                if raw_codeforces_result.get("success"):
                    student_record["coding_profiles"]["codeforces"] = clean_codeforces_data(raw_codeforces_result["data"])
                    logger.info("    > Codeforces data processed successfully.")
                else:
                    raise Exception(raw_codeforces_result.get("error", "Unknown error"))
            except Exception as e:
                student_record["errors"]["codeforces"] = str(e)
                logger.error(f"    > Codeforces processing FAILED: {e}")

        # Process resume data
        if student.get("resume_path"):
            try:
                logger.info(f"  - Processing resume from '{student['resume_path']}'...")

                if not os.path.exists(student["resume_path"]):
                    raise FileNotFoundError(f"Resume file not found at {student['resume_path']}")

                raw_resume_data = parse_resume(student["resume_path"])
                student_record["resume"] = clean_resume_data(raw_resume_data)
                logger.info("    > Resume data processed successfully.")
            except Exception as e:
                student_record["errors"]["resume"] = str(e)
                logger.error(f"    > Resume processing FAILED: {e}")

        all_student_data[enrollment_no] = student_record
        time.sleep(1)  # Respectful delay

    # Save merged data (existing + new)
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(all_student_data, f, indent=4, ensure_ascii=False)
        logger.info(f"\n✅ Final data saved to '{OUTPUT_FILE}' ({len(all_student_data)} total students).")
    except Exception as e:
        logger.error(f"\n❌ Error saving final JSON file: {e}")

if __name__ == "__main__":
    main()