Spaces:

Urvikava
/

Faculty-Finder-api

Sleeping

App Files Files Community

Urvikava committed on Feb 6

Commit

997f52c

verified ·

1 Parent(s): e06a84d

Upload 54 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
Dockerfile +28 -0
api/.env +4 -0
api/__pycache__/main.cpython-312.pyc +0 -0
api/__pycache__/main.cpython-313.pyc +0 -0
api/__pycache__/schema.cpython-312.pyc +0 -0
api/main.py +126 -0
api/schema.py +25 -0
cleaning/__init__.py +0 -0
cleaning/__pycache__/__init__.cpython-312.pyc +0 -0
cleaning/__pycache__/clean_faculty_records.cpython-312.pyc +0 -0
cleaning/clean_faculty_records.py +100 -0
data/processed/clean_faculty_data.csv +0 -0
data/raw/raw_faculty_data.csv +0 -0
ingestion/__pycache__/discover_urls.cpython-312.pyc +0 -0
ingestion/__pycache__/http_client.cpython-312.pyc +0 -0
ingestion/__pycache__/scrape_faculty.cpython-312.pyc +0 -0
ingestion/__pycache__/section_parser.cpython-312.pyc +0 -0
ingestion/__pycache__/utils.cpython-312.pyc +0 -0
ingestion/discover_urls.py +69 -0
ingestion/http_client.py +52 -0
ingestion/scrape_faculty.py +84 -0
rag/.env +4 -0
rag/__pycache__/step_2_authority_scoring.cpython-312.pyc +0 -0
rag/__pycache__/step_2_bm25_retrieval.cpython-312.pyc +0 -0
rag/__pycache__/step_4_semantic_retrieval.cpython-312.pyc +0 -0
rag/__pycache__/step_5_hybrid_retrieval.cpython-312.pyc +0 -0
rag/__pycache__/step_6_llm_explainability.cpython-312.pyc +0 -0
rag/__pycache__/utils.cpython-312.pyc +0 -0
rag/artifacts/bm25_index.pkl +3 -0
rag/artifacts/faculty_documents.json +0 -0
rag/artifacts/faculty_evidence_units.json +0 -0
rag/step_1_text_construction.py +54 -0
rag/step_2_bm25_retrieval.py +61 -0
rag/step_3_semantic_index.py +40 -0
rag/step_4_semantic_retrieval.py +55 -0
rag/step_5_hybrid_retrieval.py +96 -0
rag/step_6_llm_explainability.py +116 -0
rag/utils.py +11 -0
rag/vector_store/chroma_evidence/chroma.sqlite3 +3 -0
rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/data_level0.bin +3 -0
rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/header.bin +3 -0
rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/length.bin +3 -0
rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/link_lists.bin +3 -0
requirements.txt +11 -0
storage/__pycache__/db.cpython-312.pyc +0 -0
storage/__pycache__/db.cpython-313.pyc +0 -0
storage/__pycache__/fetch_faculty.cpython-312.pyc +0 -0
storage/__pycache__/insert_faculty.cpython-312.pyc +0 -0
storage/db.py +9 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+rag/vector_store/chroma_evidence/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+storage/faculty.db filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,28 @@

+FROM python:3.11-slim
+# Do not write .pyc files and do not buffer stdout/stderr (logs appear immediately)
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# Set working directory
+WORKDIR /app
+# Install system dependencies (important for chroma + sqlite).
+# --no-install-recommends keeps optional packages out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    sqlite3 \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first (better caching)
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy project files.
+# NOTE(review): this copies everything in the build context, including any
+# .env files and __pycache__ directories; add a .dockerignore to exclude them.
+COPY . .
+# Expose port (Hugging Face uses 7860)
+EXPOSE 7860
+# Start FastAPI with uvicorn
+CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]

api/.env ADDED Viewed

	@@ -0,0 +1,4 @@

+# SECURITY: real credentials must never be committed to the repository.
+# The keys that were published in this commit are compromised and must be
+# revoked and rotated with each provider.
+# Supply the values through the deployment's secret store, or through an
+# untracked local .env file, using the variable names listed below.
+# GOOGLE_API_KEY=
+# OPENROUTER_API_KEY=
+# OPENAI_API_KEY=
+# GROQ_API_KEY=

api/__pycache__/main.cpython-312.pyc ADDED Viewed

Binary file (4.64 kB). View file

api/__pycache__/main.cpython-313.pyc ADDED Viewed

Binary file (2.28 kB). View file

api/__pycache__/schema.cpython-312.pyc ADDED Viewed

Binary file (1.24 kB). View file

api/main.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import sys
+import os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from dotenv import load_dotenv
+load_dotenv()
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from api.schema import SearchRequest, SearchResponse
+from rag.step_5_hybrid_retrieval import hybrid_retrieve
+from rag.step_6_llm_explainability import explain_and_rerank
+from storage.fetch_faculty import fetch_faculty_by_id
+from storage.db import get_connection
+# Application object served by uvicorn as "api.main:app"
+app = FastAPI(
+    title="Faculty Finder API",
+    description="Student-centric faculty recommendation system using hybrid retrieval and LLM reasoning",
+    version="1.0"
+)
+# Any origin may call the API. Credentials are disabled because the CORS
+# specification forbids combining a wildcard origin with credentialed
+# requests, and none of the endpoints in this module read cookies or
+# authentication headers.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=False,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Liveness endpoint: lets a caller confirm the API process is answering
+@app.get("/health")
+def health_check():
+    """Return a constant payload indicating the service is reachable."""
+    return dict(status="ok")
+# Adding endpoint to get all faculty
+@app.get('/faculty')
+def get_all_faculty():
+    """Return every row of the ``faculty`` table as a list of dicts."""
+    conn = get_connection()
+    try:
+        rows = conn.execute("SELECT * FROM faculty").fetchall()
+    finally:
+        # Close even when the query raises so the connection is not leaked.
+        conn.close()
+    return [dict(row) for row in rows]
+# Adding endpoint to get faculty by id
+@app.get("/faculty/{faculty_id}")
+def get_faculty_by_id(faculty_id: int):
+    """Return the faculty row with the given id as a dict.
+    Raises HTTPException(404) when no row has that ``faculty_id``.
+    """
+    conn = get_connection()
+    try:
+        row = conn.execute(
+            "SELECT * FROM faculty WHERE faculty_id = ?",
+            (faculty_id,)
+        ).fetchone()
+    finally:
+        # Close even when the query raises so the connection is not leaked.
+        conn.close()
+    if row is None:
+        raise HTTPException(status_code=404, detail="Faculty not found")
+    return dict(row)
+# Adding endpoint of filter by category
+@app.get("/faculty/category/{category}")
+def get_faculty_by_category(category: str):
+    """Return all faculty rows whose ``faculty_category`` equals ``category``.
+    The result is a (possibly empty) list of dicts.
+    """
+    conn = get_connection()
+    try:
+        rows = conn.execute(
+            "SELECT * FROM faculty WHERE faculty_category = ?",
+            (category,)
+        ).fetchall()
+    finally:
+        # Close even when the query raises so the connection is not leaked.
+        conn.close()
+    return [dict(row) for row in rows]
+@app.post("/search", response_model=SearchResponse)
+def search_faculty(request: SearchRequest):
+    """Run the recommendation pipeline for one query.
+    Steps: hybrid retrieval, LLM reranking with explanations, then enrichment
+    of each ranked entry with its database record. Entries whose faculty_id
+    has no database record are skipped.
+    Raises HTTPException(500) carrying the error text when any step fails.
+    """
+    try:
+        # Step 1: Hybrid Retrieval
+        hybrid_results = hybrid_retrieve(
+            query=request.query,
+            top_k=request.top_k
+        )
+        # Step 2: LLM Reranking + Explainability
+        llm_results = explain_and_rerank(
+            request.query,
+            hybrid_results
+        )
+        # Step 3: Enrich from Database
+        final_results = []
+        for item in llm_results:
+            faculty = fetch_faculty_by_id(item["faculty_id"])
+            if faculty is None:
+                continue
+            final_results.append({
+                "rank": item["rank"],
+                "faculty_id": faculty["faculty_id"],
+                "name": faculty["name"],
+                "category": faculty["faculty_category"],
+                "reason": item["reason"],
+                "image_url": faculty["image_url"],
+                "education": faculty["education"],
+                "phone": faculty["phone"],
+                "email": faculty["email"],
+                "address": faculty["address"],
+            })
+        return {
+            "query": request.query,
+            "results": final_results
+        }
+    except HTTPException:
+        # A deliberate HTTP error keeps its own status code.
+        raise
+    except Exception as e:
+        # Chain the cause so the original traceback stays visible in logs.
+        raise HTTPException(status_code=500, detail=str(e)) from e

api/schema.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from pydantic import BaseModel
+from typing import List, Optional
+class SearchRequest(BaseModel):
+    """Request body accepted by the search endpoint."""
+    # Free-text search query
+    query: str
+    # Number of results requested; defaults to 5
+    top_k: int = 5
+class FacultyResult(BaseModel):
+    """One ranked faculty entry in a search response.
+    The contact/profile fields may be null; they default to None so that a
+    record lacking one of them still validates.
+    """
+    rank: int
+    faculty_id: int
+    name: str
+    category: str
+    # Explanation text produced by the reranking step
+    reason: str
+    image_url: Optional[str] = None
+    education: Optional[str] = None
+    phone: Optional[str] = None
+    email: Optional[str] = None
+    address: Optional[str] = None
+class SearchResponse(BaseModel):
+    """Response body of the search endpoint."""
+    # The query string the results were computed for
+    query: str
+    # Ranked faculty entries
+    results: List[FacultyResult]

cleaning/__init__.py ADDED Viewed

File without changes

cleaning/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (175 Bytes). View file

cleaning/__pycache__/clean_faculty_records.cpython-312.pyc ADDED Viewed

Binary file (3.36 kB). View file

cleaning/clean_faculty_records.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import re
+import html
+from bs4 import BeautifulSoup
+from storage.db import get_connection
+def clean_html_field(raw_html: str) -> str | None:
+    """
+    Convert an HTML-heavy field (biography, specialization, research,
+    publications, teaching) into plain single-spaced text.
+    Returns None for falsy input or when no text remains after cleaning.
+    """
+    if not raw_html:
+        return None
+    # Entities are decoded before parsing
+    decoded = html.unescape(raw_html)
+    soup = BeautifulSoup(decoded, "lxml")
+    # Discard tags treated as noisy / non-semantic, together with their content
+    for noisy in soup.find_all(["script", "style", "table", "sup"]):
+        noisy.decompose()
+    # Collapse every whitespace run to one space, then trim the ends
+    flattened = re.sub(r"\s+", " ", soup.get_text(separator=" ")).strip()
+    return flattened or None
+def clean_plain_text(text: str) -> str | None:
+    """
+    Normalize an already-plain text field (education, address): decode HTML
+    entities, turn non-breaking spaces into spaces and collapse whitespace.
+    Returns None for falsy input or when nothing but whitespace remains.
+    """
+    if not text:
+        return None
+    unescaped = html.unescape(text).replace("\u00a0", " ")
+    collapsed = re.sub(r"\s+", " ", unescaped).strip()
+    return collapsed or None
+def clean_all_faculty_fields():
+    """
+    Clean ALL faculty fields IN-PLACE.
+    No schema change. No new columns.
+    All updates are committed together; if any step raises, nothing is
+    committed and the connection is still closed.
+    """
+    conn = get_connection()
+    try:
+        cur = conn.cursor()
+        cur.execute("""
+        SELECT
+            faculty_id,
+            biography,
+            specialization,
+            research,
+            publications,
+            teaching,
+            education,
+            address
+        FROM faculty
+    """)
+        # Materialize the rows first: the same cursor is reused for the updates.
+        rows = cur.fetchall()
+        for row in rows:
+            cur.execute("""
+            UPDATE faculty
+            SET
+                biography = ?,
+                specialization = ?,
+                research = ?,
+                publications = ?,
+                teaching = ?,
+                education = ?,
+                address = ?
+            WHERE faculty_id = ?
+        """, (
+                clean_html_field(row["biography"]),
+                clean_html_field(row["specialization"]),
+                clean_html_field(row["research"]),
+                clean_html_field(row["publications"]),
+                clean_html_field(row["teaching"]),
+                clean_plain_text(row["education"]),
+                clean_plain_text(row["address"]),
+                row["faculty_id"]
+            ))
+        conn.commit()
+    finally:
+        # Always release the connection, including on failure.
+        conn.close()
+    print("[CLEANING] All faculty fields cleaned successfully.")
+# Script entry point: run the in-place cleaning when executed directly
+if __name__ == "__main__":
+    clean_all_faculty_fields()

data/processed/clean_faculty_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/raw/raw_faculty_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

ingestion/__pycache__/discover_urls.cpython-312.pyc ADDED Viewed

Binary file (2.61 kB). View file

ingestion/__pycache__/http_client.cpython-312.pyc ADDED Viewed

Binary file (1.89 kB). View file

ingestion/__pycache__/scrape_faculty.cpython-312.pyc ADDED Viewed

Binary file (3.37 kB). View file

ingestion/__pycache__/section_parser.cpython-312.pyc ADDED Viewed

Binary file (1.58 kB). View file

ingestion/__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (382 Bytes). View file

ingestion/discover_urls.py ADDED Viewed

	@@ -0,0 +1,69 @@

+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
+from ingestion.http_client import get_session
+# Site root; seed paths and scraped hrefs are resolved against it
+BASE_URL = "https://www.daiict.ac.in"
+# Listing pages to crawl, keyed by the category assigned to profiles found there
+SEED_URLS = {
+    "regular_faculty": "/faculty",
+    "adjunct_faculty": "/adjunct-faculty",
+    "adjunct_faculty_international": "/adjunct-faculty-international",
+    "distinguished_professor": "/distinguished-professor",
+    "professor_of_practice": "/professor-practice"
+}
+# A link counts as a faculty profile when its path starts with one of these
+PROFILE_PREFIXES = [
+    "/faculty/",
+    "/adjunct-faculty/",
+    "/adjunct-faculty-international/",
+    "/distinguished-professor/",
+    "/professor-practice/"
+]
+# Shared HTTP session, created once when the module is imported
+session = get_session()
+def discover_faculty_urls():
+    """Crawl every seed listing page and collect unique faculty profile URLs.
+    Returns a list of dicts with ``profile_url`` and ``faculty_category``.
+    The category is that of the first listing page on which the profile was
+    seen. Seed pages that do not answer with HTTP 200 are skipped with a
+    warning.
+    """
+    discovered = {}
+    for category, path in SEED_URLS.items():
+        seed_url = urljoin(BASE_URL, path)
+        print(f"[INFO] Crawling {seed_url}")
+        resp = session.get(seed_url)
+        if resp.status_code != 200:
+            print(f"[WARN] Failed to fetch {seed_url}: {resp.status_code}")
+            continue
+        soup = BeautifulSoup(resp.text, "lxml")
+        for a in soup.find_all("a", href=True):
+            # Normalize URL (handles relative + absolute)
+            parsed = urlparse(urljoin(BASE_URL, a["href"]))
+            # Only accept DAIICT internal links
+            if parsed.netloc != "www.daiict.ac.in":
+                continue
+            # Check faculty profile path
+            if not any(parsed.path.startswith(prefix) for prefix in PROFILE_PREFIXES):
+                continue
+            # Drop the #fragment: in-page anchors of one profile must not be
+            # recorded as separate profiles.
+            full_url = parsed._replace(fragment="").geturl()
+            if full_url not in discovered:
+                discovered[full_url] = {
+                    "profile_url": full_url,
+                    # CATEGORY COMES FROM SOURCE PAGE
+                    "faculty_category": category
+                }
+    return list(discovered.values())
+# Script entry point: crawl the seed pages and print every discovered profile
+if __name__ == "__main__":
+    urls = discover_faculty_urls()
+    print(f"\nDiscovered {len(urls)} faculty profiles\n")
+    for u in urls:
+        print(u)

ingestion/http_client.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+# Headers applied to every session built by get_session()
+DEFAULT_HEADERS = {
+    "User-Agent": "FacultyFinderBot/1.0"
+}
+def get_session(
+    total_retries: int = 3,
+    backoff_factor: float = 1.0,
+    timeout: int = 15
+):
+    """
+    Build a requests.Session that sends DEFAULT_HEADERS, retries GET requests
+    and applies a default timeout.
+    Retries cover connection errors, read errors and HTTP 500/502/503/504,
+    up to ``total_retries`` times with backoff scaled by ``backoff_factor``.
+    ``timeout`` (seconds) is used for any request that does not pass its own.
+    """
+    retry_policy = Retry(
+        total=total_retries,
+        connect=total_retries,
+        read=total_retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=[500, 502, 503, 504],
+        allowed_methods=["GET"],
+        raise_on_status=False
+    )
+    retrying_adapter = HTTPAdapter(max_retries=retry_policy)
+    client = requests.Session()
+    client.headers.update(DEFAULT_HEADERS)
+    # One adapter instance serves both URL schemes
+    for scheme in ("http://", "https://"):
+        client.mount(scheme, retrying_adapter)
+    # Wrap the session's request method so a default timeout is always supplied
+    client.request = _inject_timeout(client.request, timeout)
+    return client
+def _inject_timeout(request_func, timeout):
+    """Return a wrapper around ``request_func`` that supplies ``timeout``
+    whenever the caller did not pass a ``timeout`` keyword itself."""
+    def wrapper(*args, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = timeout
+        return request_func(*args, **kwargs)
+    return wrapper

ingestion/scrape_faculty.py ADDED Viewed

	@@ -0,0 +1,84 @@

+from ingestion.http_client import get_session
+from bs4 import BeautifulSoup
+# Site root, used to build the absolute image URL of a profile
+BASE_URL = "https://www.daiict.ac.in"
+# Shared HTTP session, created once when the module is imported
+session = get_session()
+def clean_text(text):
+    """Collapse every whitespace run in ``text`` to a single space and trim
+    the ends; falsy input (None, empty string) yields None."""
+    return " ".join(text.split()) if text else None
+def _select_text(soup, selector):
+    """Return the whitespace-normalized text of the first match, or None."""
+    node = soup.select_one(selector)
+    return clean_text(node.get_text()) if node else None
+def _select_html(soup, selector):
+    """Return the inner HTML (tags kept) of the first match, or None."""
+    node = soup.select_one(selector)
+    return node.decode_contents() if node else None
+def scrape_faculty_profile(profile_url, faculty_category):
+    """Fetch one faculty profile page and extract its fields.
+    Returns a dict with name, profile_url, faculty_category, image_url,
+    education, phone, address and email as plain text, plus biography,
+    specialization, teaching, research and publications as raw inner HTML.
+    A field whose element is absent from the page is None.
+    Raises RuntimeError when the page does not answer with HTTP 200.
+    """
+    from urllib.parse import urljoin
+    resp = session.get(profile_url)
+    if resp.status_code != 200:
+        raise RuntimeError(
+            f"Failed to fetch profile ({resp.status_code}): {profile_url}"
+        )
+    soup = BeautifulSoup(resp.text, "lxml")
+    # ---------- BASIC INFO ----------
+    img_tag = soup.select_one("div.field--name-field-faculty-image img")
+    img_src = img_tag.get("src") if img_tag else None
+    # urljoin handles both site-relative and already-absolute src values
+    image_url = urljoin(BASE_URL, img_src) if img_src else None
+    return {
+        "name": _select_text(soup, "div.field--name-field-faculty-names"),
+        "profile_url": profile_url,
+        "faculty_category": faculty_category,
+        "image_url": image_url,
+        # NOTE(review): education is read from "...faculty-name" (singular)
+        # while name uses "...faculty-names" - confirm against the page markup.
+        "education": _select_text(soup, "div.field--name-field-faculty-name"),
+        "phone": _select_text(soup, "div.field--name-field-contact-no"),
+        "address": _select_text(soup, "div.field--name-field-address"),
+        "email": _select_text(soup, "div.field--name-field-email div.field__item"),
+        # ---------- RAW HTML SECTIONS (WITH TAGS) ----------
+        "biography": _select_html(soup, "div.field--name-field-biography"),
+        "specialization": _select_html(soup, "div.specializationIcon + div.work-exp"),
+        "teaching": _select_html(soup, "div.field--name-field-teaching"),
+        "research": _select_html(soup, "div.work-exp1 div.field--type-text-with-summary"),
+        "publications": _select_html(soup, "div.education.overflowContent")
+    }

rag/.env ADDED Viewed

	@@ -0,0 +1,4 @@

+# SECURITY: real credentials must never be committed to the repository.
+# The keys that were published in this commit are compromised and must be
+# revoked and rotated with each provider.
+# Supply the values through the deployment's secret store, or through an
+# untracked local .env file, using the variable names listed below.
+# GOOGLE_API_KEY=
+# OPENROUTER_API_KEY=
+# OPENAI_API_KEY=
+# GROQ_API_KEY=

rag/__pycache__/step_2_authority_scoring.cpython-312.pyc ADDED Viewed

Binary file (1.78 kB). View file

rag/__pycache__/step_2_bm25_retrieval.cpython-312.pyc ADDED Viewed

Binary file (2.47 kB). View file

rag/__pycache__/step_4_semantic_retrieval.cpython-312.pyc ADDED Viewed

Binary file (2.24 kB). View file

rag/__pycache__/step_5_hybrid_retrieval.cpython-312.pyc ADDED Viewed

Binary file (4.11 kB). View file

rag/__pycache__/step_6_llm_explainability.cpython-312.pyc ADDED Viewed

Binary file (3.58 kB). View file

rag/__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (894 Bytes). View file

rag/artifacts/bm25_index.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c2dbb4bf10c5e4b23566c2a67a1e3f1ce5488b7561b31eac43b0df9e944038ca
+size 711889

rag/artifacts/faculty_documents.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rag/artifacts/faculty_evidence_units.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rag/step_1_text_construction.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import json
+import sqlite3
+from pathlib import Path
+# Two directory levels above this file
+BASE_DIR = Path(__file__).resolve().parents[1]
+DB_PATH = BASE_DIR / "storage" / "faculty.db"
+OUTPUT_PATH = BASE_DIR / "rag" / "artifacts" / "faculty_evidence_units.json"
+# (field label written to the output, faculty table column read) pairs
+FIELDS = [
+    ("research", "research"),
+    ("publications", "publications"),
+    ("teaching", "teaching"),
+    ("biography", "biography"),
+    ("education", "education"),
+    ("specialization", "specialization")
+]
+def fetch_faculty_data():
+    """Return every row of the faculty table as sqlite3.Row objects."""
+    conn = sqlite3.connect(DB_PATH)
+    try:
+        conn.row_factory = sqlite3.Row
+        # fetchall() materializes the rows before the connection is closed
+        return conn.execute("SELECT * FROM faculty").fetchall()
+    finally:
+        # Close even when the query raises so the connection is not leaked.
+        conn.close()
+def build_evidence_units():
+    """Turn each faculty row into one evidence unit per sufficiently long field.
+    A field contributes a unit only when it is non-empty and its stripped
+    text is longer than 30 characters. Returns the list of unit dicts.
+    """
+    units = []
+    for record in fetch_faculty_data():
+        for field_name, column in FIELDS:
+            raw = record[column]
+            if not raw:
+                continue
+            body = raw.strip()
+            # Very short snippets are not kept as evidence
+            if len(body) <= 30:
+                continue
+            units.append({
+                "faculty_id": record["faculty_id"],
+                "name": record["name"],
+                "faculty_category": record["faculty_category"],
+                "field": field_name,
+                "text": body
+            })
+    return units
+# Script entry point: build the evidence units and write them to OUTPUT_PATH
+if __name__ == "__main__":
+    evidence_units = build_evidence_units()
+    # Create the artifacts directory on first run
+    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
+        json.dump(evidence_units, f, indent=2)
+    print(f"[STEP 1 COMPLETE] Generated {len(evidence_units)} evidence units")
+    print(f"[OUTPUT] {OUTPUT_PATH}")

rag/step_2_bm25_retrieval.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import json
+from pathlib import Path
+from rank_bm25 import BM25Okapi
+import re
+# Config & Path
+# BASE_DIR is two directory levels above this file
+BASE_DIR = Path(__file__).resolve().parents[1]
+# JSON file holding the per-faculty documents indexed by BM25
+DOCS_PATH = BASE_DIR / "rag" / "artifacts" / "faculty_documents.json"
+# Utility Functions
+def tokenize(text: str):
+    """Lowercase ``text``, replace every character other than a-z, 0-9 and
+    whitespace with a space, and return the whitespace-separated tokens."""
+    return re.sub(r"[^a-z0-9\s]", " ", text.lower()).split()
+# Load Faculty Documents
+# NOTE: this runs at import time, so importing the module reads the JSON file
+# and builds the BM25 index.
+with open(DOCS_PATH, "r", encoding="utf-8") as f:
+    faculty_docs = json.load(f)
+# One token list per document, in the same order as faculty_docs
+corpus = [tokenize(doc["text"]) for doc in faculty_docs]
+bm25 = BM25Okapi(corpus)
+# Retrieving Function - BM25 Retriever
+def bm25_retrieve(query: str, top_k: int = 10):
+    """Score every faculty document against ``query`` with BM25 and return
+    the ``top_k`` best as dicts with faculty_id, name, faculty_category and
+    bm25_score (rounded to 4 decimals), highest score first."""
+    scores = bm25.get_scores(tokenize(query))
+    best_first = sorted(
+        zip(faculty_docs, scores),
+        key=lambda pair: pair[1],
+        reverse=True
+    )
+    return [
+        {
+            "faculty_id": doc["faculty_id"],
+            "name": doc["name"],
+            "faculty_category": doc["faculty_category"],
+            "bm25_score": round(float(score), 4)
+        }
+        for doc, score in best_first[:top_k]
+    ]
+# Main Function
+# Manual smoke test: run one sample query and print the ranked results
+if __name__ == "__main__":
+    query = "Natural Language Processing"
+    results = bm25_retrieve(query)
+    print(f"\nBM25 Results for query: '{query}'\n")
+    for r in results:
+        print(
+            f"{r['name']} | "
+            f"Category: {r['faculty_category']} | "
+            f"Score: {r['bm25_score']}"
+        )

rag/step_3_semantic_index.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import json
+from pathlib import Path
+from langchain_community.vectorstores import Chroma
+from langchain_huggingface import HuggingFaceEmbeddings
+# Two directory levels above this file
+BASE_DIR = Path(__file__).resolve().parents[1]
+# Input: evidence units JSON
+EVIDENCE_PATH = BASE_DIR / "rag" / "artifacts" / "faculty_evidence_units.json"
+# Output: directory in which the Chroma vector store is persisted
+VECTOR_DIR = BASE_DIR / "rag" / "vector_store" / "chroma_evidence"
+def load_evidence_units():
+    """Read EVIDENCE_PATH and return the parsed JSON content."""
+    with EVIDENCE_PATH.open("r", encoding="utf-8") as handle:
+        return json.load(handle)
+# Script entry point: embed every evidence unit and persist the vector store
+# NOTE(review): presumably re-running this against an existing VECTOR_DIR adds
+# the same texts again - confirm, and clear the directory first if so.
+if __name__ == "__main__":
+    evidence_units = load_evidence_units()
+    texts = [e["text"] for e in evidence_units]
+    # Metadata stored next to each embedding, in the same order as texts
+    metadatas = [
+        {
+            "faculty_id": e["faculty_id"],
+            "field": e["field"],
+            "name": e["name"],
+            "faculty_category": e["faculty_category"]
+        }
+        for e in evidence_units
+    ]
+    embeddings = HuggingFaceEmbeddings(
+        model_name="sentence-transformers/all-MiniLM-L6-v2"
+    )
+    vectorstore = Chroma.from_texts(
+        texts=texts,
+        metadatas=metadatas,
+        embedding=embeddings,
+        persist_directory=str(VECTOR_DIR)
+    )
+    print(f"[STEP 3 COMPLETE] Indexed {len(texts)} evidence units")
+    print(f"[VECTOR STORE] {VECTOR_DIR}")

rag/step_4_semantic_retrieval.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from pathlib import Path
+from collections import defaultdict
+from langchain_community.vectorstores import Chroma
+from langchain_huggingface import HuggingFaceEmbeddings
+# Two directory levels above this file
+BASE_DIR = Path(__file__).resolve().parents[1]
+# Directory holding the persisted Chroma vector store
+VECTOR_DIR = BASE_DIR / "rag" / "vector_store" / "chroma_evidence"
+# Weight applied to a matching evidence unit according to the field it came
+# from. Fields not listed here (e.g. "specialization") get 0.1 at lookup time.
+FIELD_WEIGHTS = {
+    "research": 0.4,
+    "publications": 0.3,
+    "teaching": 0.2,
+    "biography": 0.1,
+    "education": 0.1
+}
+# Lazily created vector store, shared by every call in this process
+_VECTORSTORE = None
+def _get_vectorstore():
+    """Create the embedding model and Chroma store on first use, then reuse them."""
+    global _VECTORSTORE
+    if _VECTORSTORE is None:
+        embeddings = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-MiniLM-L6-v2"
+        )
+        _VECTORSTORE = Chroma(
+            persist_directory=str(VECTOR_DIR),
+            embedding_function=embeddings
+        )
+    return _VECTORSTORE
+def semantic_retrieve(query, top_k=20):
+    """Rank faculty by embedding similarity between ``query`` and their evidence units.
+    The ``top_k`` nearest evidence units are fetched; each adds
+    ``(1 - score) * field weight`` to its faculty member's total. Returns one
+    dict per faculty member seen (faculty_id, name, faculty_category,
+    semantic_score rounded to 4 decimals), highest total first. A
+    non-positive ``top_k`` yields an empty list.
+    """
+    if top_k <= 0:
+        return []
+    results = _get_vectorstore().similarity_search_with_score(query, k=top_k)
+    faculty_scores = defaultdict(float)
+    faculty_meta = {}
+    for doc, score in results:
+        meta = doc.metadata
+        weight = FIELD_WEIGHTS.get(meta["field"], 0.1)
+        faculty_id = meta["faculty_id"]
+        # NOTE(review): assumes score is a distance in [0, 1]; larger values
+        # would make this contribution negative - confirm the store's metric.
+        faculty_scores[faculty_id] += (1 - score) * weight
+        faculty_meta[faculty_id] = meta
+    ranked = sorted(
+        faculty_scores.items(),
+        key=lambda x: x[1],
+        reverse=True
+    )
+    return [
+        {
+            "faculty_id": fid,
+            "name": faculty_meta[fid]["name"],
+            "faculty_category": faculty_meta[fid]["faculty_category"],
+            "semantic_score": round(score, 4)
+        }
+        for fid, score in ranked
+    ]

rag/step_5_hybrid_retrieval.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import sys
+import os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from rag.step_2_bm25_retrieval import bm25_retrieve
+from rag.step_4_semantic_retrieval import semantic_retrieve
+from rag.step_6_llm_explainability import explain_and_rerank
+from rag.utils import load_faculty_documents
+# -------------------------
+# NORMALIZATION FUNCTION
+# -------------------------
+def normalize(scores):
+    """Min-max scale ``scores`` into [0, 1].
+    Returns an empty list for empty input, and all 1.0 when every score is
+    equal (the range is zero, so no scaling is possible).
+    """
+    if not scores:
+        return []
+    min_s = min(scores)
+    max_s = max(scores)
+    if max_s == min_s:
+        return [1.0] * len(scores)
+    span = max_s - min_s
+    return [(s - min_s) / span for s in scores]
+# -------------------------
+# HYBRID RETRIEVAL
+# -------------------------
+def hybrid_retrieve(query, top_k=10, alpha=0.6):
+    """
+    Fuse BM25 and semantic retrieval into one ranked list.
+    alpha = weight for semantic score
+    (1 - alpha) = weight for BM25 score
+    Each retriever is asked for ``top_k * 2`` candidates. Their scores are
+    min-max normalized over the union of candidates (a faculty member missing
+    from one retriever scores 0.0 there) and combined linearly. Returns at
+    most ``top_k`` dicts with faculty_id, name, faculty_category and
+    final_score, best first. A non-positive ``top_k`` yields an empty list.
+    """
+    if top_k <= 0:
+        return []
+    bm25_results = bm25_retrieve(query, top_k=top_k * 2)
+    semantic_results = semantic_retrieve(query, top_k=top_k * 2)
+    bm25_dict = {r["faculty_id"]: r for r in bm25_results}
+    semantic_dict = {r["faculty_id"]: r for r in semantic_results}
+    # A list fixes one iteration order shared by both score vectors
+    faculty_ids = list(set(bm25_dict) | set(semantic_dict))
+    if not faculty_ids:
+        return []
+    bm25_scores = [bm25_dict.get(fid, {}).get("bm25_score", 0.0) for fid in faculty_ids]
+    semantic_scores = [semantic_dict.get(fid, {}).get("semantic_score", 0.0) for fid in faculty_ids]
+    bm25_norm = normalize(bm25_scores)
+    semantic_norm = normalize(semantic_scores)
+    fused_results = []
+    for fid, b_score, s_score in zip(faculty_ids, bm25_norm, semantic_norm):
+        fused_score = (1 - alpha) * b_score + alpha * s_score
+        # Name and category come from whichever retriever returned this id
+        source = bm25_dict.get(fid) or semantic_dict.get(fid)
+        fused_results.append({
+            "faculty_id": fid,
+            "name": source["name"],
+            "faculty_category": source["faculty_category"],
+            "final_score": round(fused_score, 4)
+        })
+    fused_results.sort(key=lambda x: x["final_score"], reverse=True)
+    return fused_results[:top_k]
+# -------------------------
+# MAIN PIPELINE
+# -------------------------
+# Manual end-to-end run: hybrid retrieval followed by LLM reranking
+if __name__ == "__main__":
+    query = "Natural Language Processing"
+    # Step 5: Hybrid Retrieval
+    hybrid_results = hybrid_retrieve(query, top_k=5)
+    print("\n--- Hybrid Retrieval Results ---\n")
+    for r in hybrid_results:
+        print(
+            f"{r['name']} | "
+            f"{r['faculty_category']} | "
+            f"Score: {r['final_score']}"
+        )
+    # Load faculty documents for context
+    faculty_docs = load_faculty_documents()
+    enriched_results = []
+    for r in hybrid_results:
+        # Merge the stored document into the result; document keys win on clashes
+        doc = faculty_docs.get(r["faculty_id"], {})
+        enriched_results.append({**r, **doc})
+    # Step 6: LLM Reranking + Explainability
+    print("\n--- LLM Reranked & Explained Results ---\n")
+    llm_results = explain_and_rerank(query, enriched_results)
+    for r in llm_results:
+        print(f"Rank {r['rank']}: {r['name']} ({r['category']})")
+        print(f"Reason: {r['reason']}")
+        print("-" * 60)

rag/step_6_llm_explainability.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import json
+import os
+import re
+from langchain_core.prompts import PromptTemplate
+from langchain_google_genai import ChatGoogleGenerativeAI
+# Shared chat model client, created once at import time.
+# SECURITY: the key is read from the GOOGLE_API_KEY environment variable and
+# must never be hard-coded. The key previously committed on this line is
+# compromised and must be revoked.
+llm = ChatGoogleGenerativeAI(
+    api_key=os.environ.get("GOOGLE_API_KEY"),
+    model="gemini-2.5-flash-lite",
+    temperature=0.2
+)
+# Prompt sent to the LLM. {query} and {candidates} are filled in by the caller;
+# the doubled braces in the JSON example are literal braces in the output.
+PROMPT = PromptTemplate(
+    input_variables=["query", "candidates"],
+    template="""
+You are an academic mentor advising a university student.
+Student Query:
+"{query}"
+Below is a list of faculty candidates.
+Each candidate has a UNIQUE faculty_id.
+Candidates:
+{candidates}
+Your task:
+1. Rank ALL faculty from best to worst.
+2. For EACH faculty, explain:
+   - Alignment with the student's interest
+   - What the student gains
+   - Any limitations (adjunct role, availability, etc.)
+3. Use a student-friendly advisory tone.
+4. DO NOT mention scores.
+5. RETURN STRICT JSON ONLY.
+6. DO NOT invent or change faculty_id.
+Required JSON format:
+[
+  {{
+    "rank": 1,
+    "faculty_id": 48,
+    "reason": "Student-focused explanation (3–4 lines)"
+  }}
+]
+"""
+)
+# ---------- JSON SAFE PARSER ----------
+def _extract_json(text: str):
+    """Extract the outermost JSON array from an LLM reply.
+    Markdown code fences are removed, then the span from the first "[" to the
+    last "]" is parsed. Returns the parsed list, or None when the input is
+    not a string, no such span exists, or the span is not valid JSON.
+    """
+    if not isinstance(text, str):
+        return None
+    text = text.replace("```json", "").replace("```", "").strip()
+    start = text.find("[")
+    end = text.rfind("]")
+    # find/rfind return -1 when absent; the closing bracket must follow the opening one
+    if start == -1 or end < start:
+        return None
+    try:
+        parsed = json.loads(text[start:end + 1])
+    except (ValueError, RecursionError):
+        # Malformed (or pathologically nested) JSON is treated as "no result"
+        return None
+    return parsed if isinstance(parsed, list) else None
+# ---------- MAIN ENTRY ----------
+def _fallback_ranking(hybrid_results):
+    """Rank the candidates in their incoming order with a stock explanation."""
+    return [
+        {
+            "rank": idx + 1,
+            "faculty_id": r["faculty_id"],
+            "name": r["name"],
+            "category": r["faculty_category"],
+            "reason": "AI explanation unavailable. Ranked based on hybrid relevance score."
+        }
+        for idx, r in enumerate(hybrid_results)
+    ]
+def explain_and_rerank(query, hybrid_results):
+    """Ask the LLM to rerank ``hybrid_results`` and explain each choice.
+    ``hybrid_results`` is a list of dicts with faculty_id, name and
+    faculty_category. Returns a list of dicts with rank, faculty_id, name,
+    category and reason. Name and category always come from
+    ``hybrid_results``, never from the LLM. When the LLM call fails, its reply
+    cannot be parsed, or no usable entry remains, the candidates are returned
+    in their incoming order with a stock reason. Empty input yields an empty
+    list without calling the LLM.
+    """
+    if not hybrid_results:
+        return []
+    # Build ID-anchored candidate list
+    candidates_text = "\n".join(
+        f"- faculty_id:{r['faculty_id']} | {r['name']} ({r['faculty_category']})"
+        for r in hybrid_results
+    )
+    prompt = PROMPT.format(
+        query=query,
+        candidates=candidates_text
+    )
+    try:
+        raw_output = llm.invoke(prompt).content.strip()
+    except Exception:
+        # Boundary with an external service: any failure (network, quota,
+        # unexpected reply type) degrades to the fallback instead of a 500.
+        return _fallback_ranking(hybrid_results)
+    parsed = _extract_json(raw_output)
+    # HARD FALLBACK (never break API)
+    if parsed is None:
+        return _fallback_ranking(hybrid_results)
+    # Build lookup tables
+    faculty_map = {
+        r["faculty_id"]: r
+        for r in hybrid_results
+    }
+    final = []
+    seen = set()
+    for item in parsed:
+        if not isinstance(item, dict):
+            continue
+        faculty_id = item.get("faculty_id")
+        if faculty_id not in faculty_map:
+            # The model sometimes quotes numbers; retry the lookup as an int
+            try:
+                faculty_id = int(faculty_id)
+            except (TypeError, ValueError):
+                continue
+            if faculty_id not in faculty_map:
+                continue
+        # Skip repeated ids and entries missing a required key
+        if faculty_id in seen or "rank" not in item or "reason" not in item:
+            continue
+        seen.add(faculty_id)
+        faculty = faculty_map[faculty_id]
+        final.append({
+            "rank": item["rank"],
+            "faculty_id": faculty_id,
+            "name": faculty["name"],
+            "category": faculty["faculty_category"],
+            "reason": item["reason"]
+        })
+    return final or _fallback_ranking(hybrid_results)

rag/utils.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import json
+from pathlib import Path
+# Two directory levels above this file
+BASE_DIR = Path(__file__).resolve().parents[1]
+# JSON file holding the per-faculty documents
+DOCS_PATH = BASE_DIR / "rag" / "artifacts" / "faculty_documents.json"
+def load_faculty_documents():
+    """Read DOCS_PATH and return its documents keyed by their faculty_id."""
+    with DOCS_PATH.open("r", encoding="utf-8") as handle:
+        records = json.load(handle)
+    by_id = {}
+    for record in records:
+        by_id[record["faculty_id"]] = record
+    return by_id

rag/vector_store/chroma_evidence/chroma.sqlite3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ed0f499b5545c1322806c433823e52d2c96f01af67e2c37e7cc2026796818d3e
+size 7766016

rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8de47b1f77fc53b981dc21ddc4bac47721fad56a124537ac8bf6090a80ed778f
+size 167600

rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
+size 100

rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc151afd6f6ddb991c87835b97ff475e8e67b77ededde812d976ed2ad93e848
+size 400

rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+size 0

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+fastapi
+uvicorn
+python-dotenv
+pydantic
+rank-bm25
+langchain
+langchain-community
+langchain-huggingface
+langchain-google-genai
+chromadb
+sentence-transformers

storage/__pycache__/db.cpython-312.pyc ADDED Viewed

Binary file (672 Bytes). View file

storage/__pycache__/db.cpython-313.pyc ADDED Viewed

Binary file (674 Bytes). View file

storage/__pycache__/fetch_faculty.cpython-312.pyc ADDED Viewed

Binary file (1.22 kB). View file

storage/__pycache__/insert_faculty.cpython-312.pyc ADDED Viewed

Binary file (1.57 kB). View file

storage/db.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import sqlite3
+from pathlib import Path
+# SQLite database file under the storage/ directory
+DB_PATH = Path(__file__).resolve().parents[1] / "storage" / "faculty.db"
+def get_connection():
+    """Open a SQLite connection to DB_PATH whose rows are sqlite3.Row objects.
+    The caller is responsible for closing the returned connection.
+    """
+    connection = sqlite3.connect(DB_PATH)
+    # sqlite3.Row lets callers read columns by name and convert rows to dicts
+    connection.row_factory = sqlite3.Row
+    return connection