# Team identity, primary contact details, and project links for this submission.
team:
  name: "Ctrl Coffee Repeat"
  # Primary point of contact; the same name appears first under `members`.
  primary_contact_name: "Pranjal H Dohare"
  primary_contact_email: "pranjaldohare8@gmail.com"
  # Kept quoted so the leading "+" and the digits load as a string, not an integer.
  primary_contact_phone: "+919320480095"
  # Source repository and the Streamlit-hosted demo for the project.
  github_repository_url: "https://github.com/Pranjal1342/Intelligent-Candidate-Discovery-Ranking-System"
  sandbox_demo_url: "https://ctrl-coffee-repeat.streamlit.app/"
  # Team roster: one entry per member, each with a name and a role.
  members:
    - name: "Pranjal H Dohare"
      role: "Lead Developer"
    - name: "Priyanka Tiwari"
      role: "Architecture and System Design"

# Submission metadata: version, date, and the name of the output file.
submission:
  version: "1.0.0"
  # Quoted on purpose: an unquoted ISO date may be loaded as a date object
  # rather than a string by some YAML parsers.
  timestamp: "2026-07-01"
  # NOTE(review): presumably the CSV holding the ranked results — confirm
  # against the submission schema.
  output_file: "CTRL_COFFEE_REPEAT.csv"

# Declared characteristics of the ranking system; the stage-by-stage
# description lives in `methodology_summary`.
system:
  pipeline_type: "Offline-Indexed Lexical Retrieval + LightGBM LambdaRank"
  hardware: "CPU-only, ≤16GB RAM"
  # NOTE(review): presumably the end-to-end ranking runtime — confirm exactly
  # what is being measured.
  runtime_seconds: 4
  # Declared number of network calls made while ranking.
  network_calls_during_ranking: 0

methodology_summary: |
  This system uses a deterministic, CPU-only pipeline optimized for NDCG@10 and P@5.

  Stage 1 (Retrieval): A precomputed NumPy CSR BM25 matrix (built offline, ~40 MB) is queried
  at runtime in under 0.1 seconds via a dual-pass search: Pass A expands job-description (JD)
  requirements using a skill alias taxonomy (skill_aliases.json), and Pass B targets
  production-context keywords (deployed, scale, serving, latency). A rare-term safety net
  retrieves candidates with niche skills (pinecone, lambdarank) that might otherwise be missed.
  This produces a Stage 1 pool of ~8,500 candidates in approximately 0.03 seconds.

  Stage 2 (Features): A 22-feature, schema-grounded matrix captures signals from every candidate
  record. It includes 5 adversarial detection functions: domain-category mismatch, synthetic template
  detection, production signal log-compression, LangChain dabbler detection, and CV/speech
  specialist detection. Stage 3 adds a consistency composite (c1×c2×c3×c4×c5) that zeros out
  scores for timeline impossibilities, signup anomalies, salary inversions, assessment
  contradictions, and engagement mismatches.

  Stage 4 (Ranking): LightGBM with objective=lambdarank trains on relevance labels generated
  via 2,500 pairwise LLM comparisons using Gemma3:4b-it-q4_K_M (running offline and locally
  via Ollama — zero external API calls). This explicitly breaks circularity: the LLM judges
  profiles organically without knowledge of the 22 features or BM25 scores; the pairwise outcomes
  are aggregated into Elo ratings, which are converted to 0–3 relevance labels by quartile
  thresholding. Candidates with data integrity violations are suppressed post-inference via a
  consistency multiplier (final_score = raw_score × consistency_score).

  Stage 5 (Reasoning): A deterministic grammar engine generates fact-grounded reasoning with
  numeric regex audit (all cited numbers must exist in the candidate JSON), n-gram collision
  avoidance (difflib.SequenceMatcher), and priority-ranked concern surfacing. Pre-submission
  blocking audits enforce diversity (max 25% archetype concentration, max 30% employer
  concentration) and honeypot detection (assert low_consistency_in_top100 < 10).

  Model comparison evidence: the heuristic-trained model required a hand-coded suppression list
  to keep non-technical profiles out of the top 100. The Gemma-trained model achieved 0 honeypot
  leakage with no suppression list, and the two models show a Spearman correlation of 0.001 on the
  top-100 ranking — confirming the LLM labels are genuinely independent of the engineered features.

# AI tools used on the project. Each entry records the tool, how it was used,
# and a `human_review` flag (presumably: the tool's output was reviewed by a
# person — confirm against the submission schema).
ai_tools_used:
  - tool: "Google DeepMind Antigravity"
    usage: "Code scaffolding, module structure, latency diagnostics, iterative debugging"
    human_review: true
  - tool: "Gemma3:4b-it-q4_K_M via Ollama (local, offline)"
    # Folded scalar (`>`): the wrapped lines below are joined with spaces into
    # a single paragraph when the file is loaded.
    usage: >
      Offline pairwise candidate annotation: 2,500 comparisons on a stratified sample of
      500 Stage 1 candidates to generate non-circular LightGBM training labels.
      No candidate data transmitted to any external service. Runs in
      experiments/pairwise_llm_check/annotate_and_retrain.py, entirely separate from
      the ranking pipeline. Exempt from the 5-minute/zero-network ranking budget.
    human_review: true

# NOTE(review): the consumer of this value is not visible in this file;
# presumably the fixed "as of" date for date-relative calculations — confirm.
# Quoted so it loads as a string rather than a date object.
reference_date: "2026-01-01"