Urvikava committed on
Commit
997f52c
·
verified ·
1 Parent(s): e06a84d

Upload 54 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. Dockerfile +28 -0
  3. api/.env +4 -0
  4. api/__pycache__/main.cpython-312.pyc +0 -0
  5. api/__pycache__/main.cpython-313.pyc +0 -0
  6. api/__pycache__/schema.cpython-312.pyc +0 -0
  7. api/main.py +126 -0
  8. api/schema.py +25 -0
  9. cleaning/__init__.py +0 -0
  10. cleaning/__pycache__/__init__.cpython-312.pyc +0 -0
  11. cleaning/__pycache__/clean_faculty_records.cpython-312.pyc +0 -0
  12. cleaning/clean_faculty_records.py +100 -0
  13. data/processed/clean_faculty_data.csv +0 -0
  14. data/raw/raw_faculty_data.csv +0 -0
  15. ingestion/__pycache__/discover_urls.cpython-312.pyc +0 -0
  16. ingestion/__pycache__/http_client.cpython-312.pyc +0 -0
  17. ingestion/__pycache__/scrape_faculty.cpython-312.pyc +0 -0
  18. ingestion/__pycache__/section_parser.cpython-312.pyc +0 -0
  19. ingestion/__pycache__/utils.cpython-312.pyc +0 -0
  20. ingestion/discover_urls.py +69 -0
  21. ingestion/http_client.py +52 -0
  22. ingestion/scrape_faculty.py +84 -0
  23. rag/.env +4 -0
  24. rag/__pycache__/step_2_authority_scoring.cpython-312.pyc +0 -0
  25. rag/__pycache__/step_2_bm25_retrieval.cpython-312.pyc +0 -0
  26. rag/__pycache__/step_4_semantic_retrieval.cpython-312.pyc +0 -0
  27. rag/__pycache__/step_5_hybrid_retrieval.cpython-312.pyc +0 -0
  28. rag/__pycache__/step_6_llm_explainability.cpython-312.pyc +0 -0
  29. rag/__pycache__/utils.cpython-312.pyc +0 -0
  30. rag/artifacts/bm25_index.pkl +3 -0
  31. rag/artifacts/faculty_documents.json +0 -0
  32. rag/artifacts/faculty_evidence_units.json +0 -0
  33. rag/step_1_text_construction.py +54 -0
  34. rag/step_2_bm25_retrieval.py +61 -0
  35. rag/step_3_semantic_index.py +40 -0
  36. rag/step_4_semantic_retrieval.py +55 -0
  37. rag/step_5_hybrid_retrieval.py +96 -0
  38. rag/step_6_llm_explainability.py +116 -0
  39. rag/utils.py +11 -0
  40. rag/vector_store/chroma_evidence/chroma.sqlite3 +3 -0
  41. rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/data_level0.bin +3 -0
  42. rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/header.bin +3 -0
  43. rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/length.bin +3 -0
  44. rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/link_lists.bin +3 -0
  45. requirements.txt +11 -0
  46. storage/__pycache__/db.cpython-312.pyc +0 -0
  47. storage/__pycache__/db.cpython-313.pyc +0 -0
  48. storage/__pycache__/fetch_faculty.cpython-312.pyc +0 -0
  49. storage/__pycache__/insert_faculty.cpython-312.pyc +0 -0
  50. storage/db.py +9 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ rag/vector_store/chroma_evidence/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
37
+ storage/faculty.db filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # Prevent Python from writing pyc files
4
+ ENV PYTHONDONTWRITEBYTECODE=1
5
+ ENV PYTHONUNBUFFERED=1
6
+
7
+ # Set working directory
8
+ WORKDIR /app
9
+
10
+ # Install system dependencies (important for chroma + sqlite)
11
+ RUN apt-get update && apt-get install -y \
12
+ build-essential \
13
+ sqlite3 \
14
+ && rm -rf /var/lib/apt/lists/*
15
+
16
+ # Copy requirements first (better caching)
17
+ COPY requirements.txt .
18
+
19
+ RUN pip install --no-cache-dir -r requirements.txt
20
+
21
+ # Copy project files
22
+ COPY . .
23
+
24
+ # Expose port (Hugging Face uses 7860)
25
+ EXPOSE 7860
26
+
27
+ # Start FastAPI with uvicorn
28
+ CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]
api/.env ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ GOOGLE_API_KEY = "AIzaSyDVL9AgS863gz5C78-Hy9PgFUImpSB3VTE"
2
+ OPENROUTER_API_KEY = "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"
3
+ OPENAI_API_KEY = "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"
4
+ GROQ_API_KEY = "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"
api/__pycache__/main.cpython-312.pyc ADDED
Binary file (4.64 kB). View file
 
api/__pycache__/main.cpython-313.pyc ADDED
Binary file (2.28 kB). View file
 
api/__pycache__/schema.cpython-312.pyc ADDED
Binary file (1.24 kB). View file
 
api/main.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ from dotenv import load_dotenv
5
+ load_dotenv()
6
+ from fastapi import FastAPI, HTTPException
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from api.schema import SearchRequest, SearchResponse
9
+ from rag.step_5_hybrid_retrieval import hybrid_retrieve
10
+ from rag.step_6_llm_explainability import explain_and_rerank
11
+ from storage.fetch_faculty import fetch_faculty_by_id
12
+ from storage.db import get_connection
13
+
14
# FastAPI application object; the route handlers below register themselves on it.
app = FastAPI(
    title="Faculty Finder API",
    description="Student-centric faculty recommendation system using hybrid retrieval and LLM reasoning",
    version="1.0",
)

# CORS: the API is readable from any origin. Credentials are NOT allowed:
# combining a wildcard origin with allow_credentials=True would let any
# website issue credentialed cross-origin requests, and none of the handlers
# in this module read cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
27
+
28
+
29
+ # Adding endpoint to check health of the API
30
@app.get("/health")
def health_check():
    """Liveness probe: always answers with a constant ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return payload
33
+
34
+
35
+ # Adding endpoint to get all faculty
36
@app.get('/faculty')
def get_all_faculty():
    """Return every row of the ``faculty`` table as a list of dicts.

    The connection is closed in a ``finally`` clause so it is released even
    when the query raises.
    """
    conn = get_connection()
    try:
        cur = conn.cursor()
        cur.execute("SELECT * FROM faculty")
        rows = cur.fetchall()
    finally:
        conn.close()

    # dict(row) needs mapping-like rows; presumably get_connection() installs
    # a row factory such as sqlite3.Row — confirm in storage/db.py.
    return [dict(row) for row in rows]
46
+
47
+
48
+ # Adding endpoint to get faculty by id
49
@app.get("/faculty/{faculty_id}")
def get_faculty_by_id(faculty_id: int):
    """Return the single faculty row whose ``faculty_id`` matches.

    Raises:
        HTTPException: 404 when no row has the requested id.
    """
    conn = get_connection()
    try:
        cur = conn.cursor()
        # Parameterized query: the id never gets interpolated into the SQL text.
        cur.execute(
            "SELECT * FROM faculty WHERE faculty_id = ?",
            (faculty_id,)
        )
        row = cur.fetchone()
    finally:
        # Release the connection even if the query fails.
        conn.close()

    if row is None:
        raise HTTPException(status_code=404, detail="Faculty not found")

    return dict(row)
65
+
66
+
67
+ # Adding endpoint of filter by category
68
@app.get("/faculty/category/{category}")
def get_faculty_by_category(category: str):
    """Return all faculty rows whose ``faculty_category`` equals ``category``.

    An unknown category yields an empty list rather than an error.
    """
    conn = get_connection()
    try:
        cur = conn.cursor()
        # Parameterized query: the category is user input taken from the URL.
        cur.execute(
            "SELECT * FROM faculty WHERE faculty_category = ?",
            (category,)
        )
        rows = cur.fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()

    return [dict(row) for row in rows]
81
+
82
@app.post("/search", response_model=SearchResponse)
def search_faculty(request: SearchRequest):
    """Answer a free-text faculty search.

    Pipeline: hybrid retrieval -> LLM rerank/explanation -> enrichment of each
    ranked entry with contact details fetched by id. Entries whose id cannot
    be fetched are dropped. Any exception is reported as an HTTP 500 whose
    detail is the exception text.
    """
    try:
        # Step 1: Hybrid Retrieval
        candidates = hybrid_retrieve(query=request.query, top_k=request.top_k)

        # Step 2: LLM Reranking + Explainability
        reranked = explain_and_rerank(request.query, candidates)

        # Step 3: Enrich from Database
        enriched = []
        for entry in reranked:
            record = fetch_faculty_by_id(entry["faculty_id"])
            if record is not None:
                enriched.append({
                    "rank": entry["rank"],
                    "faculty_id": record["faculty_id"],
                    "name": record["name"],
                    "category": record["faculty_category"],
                    "reason": entry["reason"],
                    "image_url": record["image_url"],
                    "education": record["education"],
                    "phone": record["phone"],
                    "email": record["email"],
                    "address": record["address"],
                })

        return {"query": request.query, "results": enriched}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
api/schema.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import List, Optional
3
+
4
class SearchRequest(BaseModel):
    # Request body of POST /search.
    # query: the student's free-text interest, passed to retrieval and the LLM.
    query: str
    # top_k: number of faculty to return; defaults to 5 when omitted.
    top_k: int = 5
7
+
8
+
9
class FacultyResult(BaseModel):
    # One ranked recommendation in a search response.

    # --- ranking / identity ---
    rank: int
    faculty_id: int
    name: str
    category: str
    reason: str  # explanation text produced by the rerank step

    # --- profile details; each may be null but the key must be present ---
    image_url: Optional[str]
    education: Optional[str]
    phone: Optional[str]
    email: Optional[str]
    address: Optional[str]
21
+
22
+
23
class SearchResponse(BaseModel):
    # Response body of POST /search: echoes the query next to the ranked results.
    query: str
    results: List[FacultyResult]
cleaning/__init__.py ADDED
File without changes
cleaning/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (175 Bytes). View file
 
cleaning/__pycache__/clean_faculty_records.cpython-312.pyc ADDED
Binary file (3.36 kB). View file
 
cleaning/clean_faculty_records.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import html
3
+ from bs4 import BeautifulSoup
4
+ from storage.db import get_connection
5
+
6
+
7
+ def clean_html_field(raw_html: str) -> str | None:
8
+ """
9
+ Clean HTML-heavy fields:
10
+ biography, specialization, research, publications, teaching
11
+ """
12
+ if not raw_html:
13
+ return None
14
+
15
+ # Decode HTML entities
16
+ raw_html = html.unescape(raw_html)
17
+
18
+ soup = BeautifulSoup(raw_html, "lxml")
19
+
20
+ # Remove noisy / non-semantic tags
21
+ for tag in soup(["script", "style", "table", "sup"]):
22
+ tag.decompose()
23
+
24
+ text = soup.get_text(separator=" ")
25
+
26
+ # Normalize whitespace
27
+ text = re.sub(r"\s+", " ", text)
28
+
29
+ return text.strip() if text.strip() else None
30
+
31
+
32
+ def clean_plain_text(text: str) -> str | None:
33
+ """
34
+ Clean already-plain text fields:
35
+ education, address
36
+ """
37
+ if not text:
38
+ return None
39
+
40
+ text = html.unescape(text)
41
+ text = text.replace("\u00a0", " ")
42
+ text = re.sub(r"\s+", " ", text)
43
+
44
+ return text.strip() if text.strip() else None
45
+
46
+
47
def clean_all_faculty_fields():
    """Clean the text columns of every ``faculty`` row in place.

    HTML columns go through ``clean_html_field`` and plain-text columns
    through ``clean_plain_text``. No schema change, no new columns.

    All updates are sent as one batch and committed together; if anything
    fails the connection is closed without committing, so the table is never
    left half-cleaned.
    """
    html_columns = ("biography", "specialization", "research", "publications", "teaching")
    text_columns = ("education", "address")

    conn = get_connection()
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT
                faculty_id,
                biography,
                specialization,
                research,
                publications,
                teaching,
                education,
                address
            FROM faculty
        """)
        rows = cur.fetchall()

        # Parameter order must match the SET/WHERE placeholders below.
        updates = [
            (
                *(clean_html_field(row[col]) for col in html_columns),
                *(clean_plain_text(row[col]) for col in text_columns),
                row["faculty_id"],
            )
            for row in rows
        ]

        cur.executemany("""
            UPDATE faculty
            SET
                biography = ?,
                specialization = ?,
                research = ?,
                publications = ?,
                teaching = ?,
                education = ?,
                address = ?
            WHERE faculty_id = ?
        """, updates)

        conn.commit()
    finally:
        conn.close()

    print("[CLEANING] All faculty fields cleaned successfully.")
97
+
98
+
99
+ if __name__ == "__main__":
100
+ clean_all_faculty_fields()
data/processed/clean_faculty_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/raw/raw_faculty_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
ingestion/__pycache__/discover_urls.cpython-312.pyc ADDED
Binary file (2.61 kB). View file
 
ingestion/__pycache__/http_client.cpython-312.pyc ADDED
Binary file (1.89 kB). View file
 
ingestion/__pycache__/scrape_faculty.cpython-312.pyc ADDED
Binary file (3.37 kB). View file
 
ingestion/__pycache__/section_parser.cpython-312.pyc ADDED
Binary file (1.58 kB). View file
 
ingestion/__pycache__/utils.cpython-312.pyc ADDED
Binary file (382 Bytes). View file
 
ingestion/discover_urls.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ from urllib.parse import urljoin, urlparse
3
+ from ingestion.http_client import get_session
4
+
5
+ BASE_URL = "https://www.daiict.ac.in"
6
+
7
+ SEED_URLS = {
8
+ "regular_faculty": "/faculty",
9
+ "adjunct_faculty": "/adjunct-faculty",
10
+ "adjunct_faculty_international": "/adjunct-faculty-international",
11
+ "distinguished_professor": "/distinguished-professor",
12
+ "professor_of_practice": "/professor-practice"
13
+ }
14
+
15
+ PROFILE_PREFIXES = [
16
+ "/faculty/",
17
+ "/adjunct-faculty/",
18
+ "/adjunct-faculty-international/",
19
+ "/distinguished-professor/",
20
+ "/professor-practice/"
21
+ ]
22
+
23
+ session = get_session()
24
+
25
+
26
def discover_faculty_urls():
    """Crawl each seed listing page and collect faculty profile links.

    Returns:
        A list of ``{"profile_url", "faculty_category"}`` dicts, one per
        distinct profile URL. The category is that of the first seed page on
        which the link was seen. Seed pages that do not answer 200 are
        skipped with a warning.
    """
    # Derive the accepted host from BASE_URL so the two cannot drift apart.
    allowed_host = urlparse(BASE_URL).netloc
    prefixes = tuple(PROFILE_PREFIXES)
    discovered = {}

    for category, path in SEED_URLS.items():
        seed_url = urljoin(BASE_URL, path)
        print(f"[INFO] Crawling {seed_url}")

        resp = session.get(seed_url)
        if resp.status_code != 200:
            print(f"[WARN] Failed to fetch {seed_url}: {resp.status_code}")
            continue

        soup = BeautifulSoup(resp.text, "lxml")

        for a in soup.find_all("a", href=True):
            # Normalize URL (handles relative + absolute) and drop any
            # "#fragment" so in-page anchors of one profile are not counted
            # as separate profiles.
            parsed = urlparse(urljoin(BASE_URL, a["href"]))._replace(fragment="")

            # Only accept internal links
            if parsed.netloc != allowed_host:
                continue

            # Must live under a profile prefix ...
            if not parsed.path.startswith(prefixes):
                continue
            # ... and name an actual profile: a bare "/faculty/" is the
            # listing page itself, not a profile.
            if parsed.path in prefixes:
                continue

            full_url = parsed.geturl()
            if full_url not in discovered:
                discovered[full_url] = {
                    "profile_url": full_url,
                    # CATEGORY COMES FROM SOURCE PAGE
                    "faculty_category": category
                }

    return list(discovered.values())
63
+
64
+
65
if __name__ == "__main__":
    # Manual run: print how many profiles were found, then each entry.
    urls = discover_faculty_urls()
    print(f"\nDiscovered {len(urls)} faculty profiles\n")
    for entry in urls:
        print(entry)
ingestion/http_client.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from requests.adapters import HTTPAdapter
3
+ from urllib3.util.retry import Retry
4
+
5
+ DEFAULT_HEADERS = {
6
+ "User-Agent": "FacultyFinderBot/1.0"
7
+ }
8
+
9
def get_session(
    total_retries: int = 3,
    backoff_factor: float = 1.0,
    timeout: int = 15
):
    """Build a ``requests.Session`` with retries and a default timeout.

    GET requests are retried up to ``total_retries`` times on connection/read
    errors and on HTTP 500, 502, 503 and 504, with exponential backoff scaled
    by ``backoff_factor`` (the exact delays depend on the installed urllib3
    version). After the last retry the final response is returned rather
    than raised (``raise_on_status=False``).

    Every request made through the session gets ``timeout`` seconds unless
    the caller passes its own ``timeout``.
    """
    retry_policy = Retry(
        total=total_retries,
        connect=total_retries,
        read=total_retries,
        backoff_factor=backoff_factor,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET"],
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    session.headers.update(DEFAULT_HEADERS)
    for scheme in ("http://", "https://"):
        session.mount(scheme, adapter)

    # requests has no session-wide timeout setting, so the instance's
    # ``request`` method is wrapped to supply one by default.
    session.request = _inject_timeout(session.request, timeout)

    return session
46
+
47
+
48
+ def _inject_timeout(request_func, timeout):
49
+ def wrapper(*args, **kwargs):
50
+ kwargs.setdefault("timeout", timeout)
51
+ return request_func(*args, **kwargs)
52
+ return wrapper
ingestion/scrape_faculty.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ingestion.http_client import get_session
2
+ from bs4 import BeautifulSoup
3
+
4
+ BASE_URL = "https://www.daiict.ac.in"
5
+ session = get_session()
6
+
7
def clean_text(text):
    """Collapse all whitespace runs in ``text`` to single spaces.

    Returns:
        The normalised string, or ``None`` when ``text`` is empty, ``None``
        or consists only of whitespace, so a blank field is reported the
        same way as a missing one instead of as ``""``.
    """
    if not text:
        return None
    return " ".join(text.split()) or None
11
+
12
def scrape_faculty_profile(profile_url, faculty_category):
    """Fetch one faculty profile page and extract its fields.

    Args:
        profile_url: Absolute URL of the profile page.
        faculty_category: Category label to attach to the record unchanged.

    Returns:
        A dict with plain-text basics (name, education, phone, address,
        email), the absolute ``image_url`` and the raw inner HTML of the
        long-form sections (biography, specialization, teaching, research,
        publications). Any element missing from the page yields ``None``.

    Raises:
        RuntimeError: if the page does not answer with HTTP 200.
    """
    # Local import so this block does not depend on a file-level import change.
    from urllib.parse import urljoin

    resp = session.get(profile_url)

    if resp.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch profile ({resp.status_code}): {profile_url}"
        )

    soup = BeautifulSoup(resp.text, "lxml")

    def text_of(selector):
        """Whitespace-normalised text of the first match, or None."""
        node = soup.select_one(selector)
        return clean_text(node.get_text()) if node else None

    def html_of(selector):
        """Inner HTML (tags kept) of the first match, or None."""
        node = soup.select_one(selector)
        return node.decode_contents() if node else None

    # ---------- BASIC INFO ----------
    img_tag = soup.select_one("div.field--name-field-faculty-image img")
    src = img_tag.get("src") if img_tag else None
    # urljoin leaves an already-absolute src alone and resolves relative ones,
    # which plain string concatenation with BASE_URL got wrong.
    image_url = urljoin(BASE_URL, src) if src else None

    return {
        "name": text_of("div.field--name-field-faculty-names"),
        "profile_url": profile_url,
        "faculty_category": faculty_category,
        "image_url": image_url,
        # NOTE(review): education is read from "...field-faculty-name"
        # (singular) as in the original; confirm that selector is intended.
        "education": text_of("div.field--name-field-faculty-name"),
        "phone": text_of("div.field--name-field-contact-no"),
        "address": text_of("div.field--name-field-address"),
        "email": text_of("div.field--name-field-email div.field__item"),
        # ---------- RAW HTML SECTIONS (WITH TAGS) ----------
        "biography": html_of("div.field--name-field-biography"),
        "specialization": html_of("div.specializationIcon + div.work-exp"),
        "teaching": html_of("div.field--name-field-teaching"),
        "research": html_of("div.work-exp1 div.field--type-text-with-summary"),
        "publications": html_of("div.education.overflowContent"),
    }
rag/.env ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ GOOGLE_API_KEY = "AIzaSyDVL9AgS863gz5C78-Hy9PgFUImpSB3VTE"
2
+ OPENROUTER_API_KEY = "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"
3
+ OPENAI_API_KEY = "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"
4
+ GROQ_API_KEY = "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"
rag/__pycache__/step_2_authority_scoring.cpython-312.pyc ADDED
Binary file (1.78 kB). View file
 
rag/__pycache__/step_2_bm25_retrieval.cpython-312.pyc ADDED
Binary file (2.47 kB). View file
 
rag/__pycache__/step_4_semantic_retrieval.cpython-312.pyc ADDED
Binary file (2.24 kB). View file
 
rag/__pycache__/step_5_hybrid_retrieval.cpython-312.pyc ADDED
Binary file (4.11 kB). View file
 
rag/__pycache__/step_6_llm_explainability.cpython-312.pyc ADDED
Binary file (3.58 kB). View file
 
rag/__pycache__/utils.cpython-312.pyc ADDED
Binary file (894 Bytes). View file
 
rag/artifacts/bm25_index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2dbb4bf10c5e4b23566c2a67a1e3f1ce5488b7561b31eac43b0df9e944038ca
3
+ size 711889
rag/artifacts/faculty_documents.json ADDED
The diff for this file is too large to render. See raw diff
 
rag/artifacts/faculty_evidence_units.json ADDED
The diff for this file is too large to render. See raw diff
 
rag/step_1_text_construction.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sqlite3
3
+ from pathlib import Path
4
+
5
+ BASE_DIR = Path(__file__).resolve().parents[1]
6
+ DB_PATH = BASE_DIR / "storage" / "faculty.db"
7
+ OUTPUT_PATH = BASE_DIR / "rag" / "artifacts" / "faculty_evidence_units.json"
8
+
9
+ FIELDS = [
10
+ ("research", "research"),
11
+ ("publications", "publications"),
12
+ ("teaching", "teaching"),
13
+ ("biography", "biography"),
14
+ ("education", "education"),
15
+ ("specialization", "specialization")
16
+ ]
17
+
18
def fetch_faculty_data():
    """Read every row of the ``faculty`` table from the SQLite file at DB_PATH.

    Returns:
        A list of ``sqlite3.Row`` objects (columns addressable by name).
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row
        cur = conn.cursor()
        cur.execute("SELECT * FROM faculty")
        return cur.fetchall()
    finally:
        # Close even when the query raises (e.g. the table is missing).
        conn.close()
26
+
27
def build_evidence_units():
    """Split each faculty row into one evidence unit per informative field.

    For every (field name, column) pair in FIELDS, the column's text is kept
    only when it is non-empty and longer than 30 characters after stripping.

    Returns:
        A list of dicts with keys ``faculty_id``, ``name``,
        ``faculty_category``, ``field`` and ``text`` (the stripped text).
    """
    units = []

    for record in fetch_faculty_data():
        for field_name, column in FIELDS:
            raw = record[column]
            if not raw:
                continue

            body = raw.strip()
            if len(body) <= 30:
                # Too short to be useful evidence.
                continue

            units.append({
                "faculty_id": record["faculty_id"],
                "name": record["name"],
                "faculty_category": record["faculty_category"],
                "field": field_name,
                "text": body
            })

    return units
45
+
46
if __name__ == "__main__":
    # Build the evidence units and write them to OUTPUT_PATH as indented JSON.
    evidence_units = build_evidence_units()

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_PATH.open("w", encoding="utf-8") as out:
        json.dump(evidence_units, out, indent=2)

    print(f"[STEP 1 COMPLETE] Generated {len(evidence_units)} evidence units")
    print(f"[OUTPUT] {OUTPUT_PATH}")
rag/step_2_bm25_retrieval.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from rank_bm25 import BM25Okapi
4
+ import re
5
+
6
+ # Config & Path
7
+ BASE_DIR = Path(__file__).resolve().parents[1]
8
+ DOCS_PATH = BASE_DIR / "rag" / "artifacts" / "faculty_documents.json"
9
+
10
+
11
+ # Utility Functions
12
def tokenize(text: str):
    """Lower-case ``text`` and split it into ASCII alphanumeric tokens.

    Every character other than ``a-z``, ``0-9`` or whitespace acts as a
    separator.
    """
    lowered = text.lower()
    return re.sub(r"[^a-z0-9\s]", " ", lowered).split()
16
+
17
+
18
+ # Load Faculty Documents
19
# Load the faculty documents once at import time and index their tokenised text.
faculty_docs = json.loads(DOCS_PATH.read_text(encoding="utf-8"))

corpus = [tokenize(entry["text"]) for entry in faculty_docs]

# NOTE(review): the index is rebuilt on every import; the repository also
# ships rag/artifacts/bm25_index.pkl, which is not used here — confirm intent.
bm25 = BM25Okapi(corpus)
25
+
26
+
27
+ # Retrieving Function - BM25 Retriever
28
def bm25_retrieve(query: str, top_k: int = 10):
    """Rank the faculty documents against ``query`` with BM25.

    Returns:
        The ``top_k`` best documents, best first, each as a dict with
        ``faculty_id``, ``name``, ``faculty_category`` and ``bm25_score``
        (rounded to 4 decimals).
    """
    scores = bm25.get_scores(tokenize(query))

    scored = list(zip(faculty_docs, scores))
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return [
        {
            "faculty_id": doc["faculty_id"],
            "name": doc["name"],
            "faculty_category": doc["faculty_category"],
            "bm25_score": round(float(score), 4)
        }
        for doc, score in scored[:top_k]
    ]
48
+
49
+
50
+ # Main Function
51
if __name__ == "__main__":
    # Manual smoke test with a fixed query.
    query = "Natural Language Processing"
    results = bm25_retrieve(query)

    print(f"\nBM25 Results for query: '{query}'\n")
    for hit in results:
        name = hit["name"]
        category = hit["faculty_category"]
        score = hit["bm25_score"]
        print(f"{name} | Category: {category} | Score: {score}")
rag/step_3_semantic_index.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from langchain_community.vectorstores import Chroma
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+
6
+ BASE_DIR = Path(__file__).resolve().parents[1]
7
+ EVIDENCE_PATH = BASE_DIR / "rag" / "artifacts" / "faculty_evidence_units.json"
8
+ VECTOR_DIR = BASE_DIR / "rag" / "vector_store" / "chroma_evidence"
9
+
10
def load_evidence_units():
    """Parse and return the JSON content of EVIDENCE_PATH (read as UTF-8)."""
    return json.loads(EVIDENCE_PATH.read_text(encoding="utf-8"))
13
+
14
if __name__ == "__main__":
    # Embed every evidence unit and persist the vectors under VECTOR_DIR.
    evidence_units = load_evidence_units()

    metadata_keys = ("faculty_id", "field", "name", "faculty_category")
    texts = []
    metadatas = []
    for unit in evidence_units:
        texts.append(unit["text"])
        metadatas.append({key: unit[key] for key in metadata_keys})

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # NOTE(review): presumably re-running this against an existing VECTOR_DIR
    # adds to the stored collection rather than replacing it — confirm.
    vectorstore = Chroma.from_texts(
        texts=texts,
        metadatas=metadatas,
        embedding=embeddings,
        persist_directory=str(VECTOR_DIR)
    )

    print(f"[STEP 3 COMPLETE] Indexed {len(texts)} evidence units")
    print(f"[VECTOR STORE] {VECTOR_DIR}")
rag/step_4_semantic_retrieval.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from collections import defaultdict
3
+ from langchain_community.vectorstores import Chroma
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+
6
+ BASE_DIR = Path(__file__).resolve().parents[1]
7
+ VECTOR_DIR = BASE_DIR / "rag" / "vector_store" / "chroma_evidence"
8
+
9
+ FIELD_WEIGHTS = {
10
+ "research": 0.4,
11
+ "publications": 0.3,
12
+ "teaching": 0.2,
13
+ "biography": 0.1,
14
+ "education": 0.1
15
+ }
16
+
17
# Lazily-built Chroma handle, shared by all calls in this process.
_VECTORSTORE = None


def _get_vectorstore():
    """Return the shared Chroma store, creating it on first use."""
    global _VECTORSTORE
    if _VECTORSTORE is None:
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        _VECTORSTORE = Chroma(
            persist_directory=str(VECTOR_DIR),
            embedding_function=embeddings
        )
    return _VECTORSTORE


def semantic_retrieve(query, top_k=20):
    """Rank faculty by embedding similarity of their evidence units.

    The ``top_k`` nearest evidence units are fetched, each contributes
    ``(1 - score) * field weight`` to its faculty member, and faculty are
    returned sorted by that accumulated score (highest first). Fields absent
    from FIELD_WEIGHTS weigh 0.1.

    The embedding model and vector store are created once and reused, instead
    of being reloaded on every query.

    Returns:
        A list of dicts with ``faculty_id``, ``name``, ``faculty_category``
        and ``semantic_score`` (rounded to 4 decimals). It may hold fewer
        than ``top_k`` entries, because several units can belong to one
        faculty member.
    """
    results = _get_vectorstore().similarity_search_with_score(query, k=top_k)

    faculty_scores = defaultdict(float)
    faculty_meta = {}

    for doc, score in results:
        meta = doc.metadata
        weight = FIELD_WEIGHTS.get(meta["field"], 0.1)

        faculty_id = meta["faculty_id"]
        # NOTE(review): assumes `score` is a distance where smaller is better
        # and normally below 1; larger distances make this term negative.
        faculty_scores[faculty_id] += (1 - score) * weight
        faculty_meta[faculty_id] = meta

    ranked = sorted(
        faculty_scores.items(),
        key=lambda x: x[1],
        reverse=True
    )

    return [
        {
            "faculty_id": fid,
            "name": faculty_meta[fid]["name"],
            "faculty_category": faculty_meta[fid]["faculty_category"],
            "semantic_score": round(score, 4)
        }
        for fid, score in ranked
    ]
rag/step_5_hybrid_retrieval.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from rag.step_2_bm25_retrieval import bm25_retrieve
6
+ from rag.step_4_semantic_retrieval import semantic_retrieve
7
+ from rag.step_6_llm_explainability import explain_and_rerank
8
+ from rag.utils import load_faculty_documents
9
+
10
+
11
+ # -------------------------
12
+ # NORMALIZATION FUNCTION
13
+ # -------------------------
14
def normalize(scores):
    """Min-max scale ``scores`` into the range [0, 1].

    Returns:
        A new list of the same length. An empty input gives an empty list
        (instead of raising from ``min()``); when all scores are equal every
        entry is 1.0.
    """
    if not scores:
        return []

    min_s = min(scores)
    max_s = max(scores)

    # Equal scores would divide by zero; treat them all as top-ranked.
    if max_s == min_s:
        return [1.0] * len(scores)

    span = max_s - min_s
    return [(s - min_s) / span for s in scores]
22
+
23
+
24
+ # -------------------------
25
+ # HYBRID RETRIEVAL
26
+ # -------------------------
27
def hybrid_retrieve(query, top_k=10, alpha=0.6):
    """Fuse BM25 and semantic rankings into one list.

    Both retrievers are asked for ``top_k * 2`` candidates. Each retriever's
    scores are min-max normalised over the union of candidates (a candidate
    missing from one retriever counts as 0.0 there before normalisation) and
    combined as ``alpha * semantic + (1 - alpha) * bm25``.

    Returns:
        The ``top_k`` best candidates, best first, each a dict with
        ``faculty_id``, ``name``, ``faculty_category`` and ``final_score``
        (rounded to 4 decimals).
    """
    lexical = {hit["faculty_id"]: hit for hit in bm25_retrieve(query, top_k=top_k * 2)}
    semantic = {hit["faculty_id"]: hit for hit in semantic_retrieve(query, top_k=top_k * 2)}

    candidate_ids = list(set(lexical) | set(semantic))

    lexical_norm = normalize(
        [lexical.get(fid, {}).get("bm25_score", 0.0) for fid in candidate_ids]
    )
    semantic_norm = normalize(
        [semantic.get(fid, {}).get("semantic_score", 0.0) for fid in candidate_ids]
    )

    fused = []
    for fid, lex_score, sem_score in zip(candidate_ids, lexical_norm, semantic_norm):
        # Display fields come from whichever retriever knows the candidate.
        origin = lexical.get(fid) or semantic.get(fid)
        combined = (1 - alpha) * lex_score + alpha * sem_score
        fused.append({
            "faculty_id": fid,
            "name": origin["name"],
            "faculty_category": origin["faculty_category"],
            "final_score": round(combined, 4)
        })

    fused.sort(key=lambda entry: entry["final_score"], reverse=True)
    return fused[:top_k]
62
+
63
+
64
+ # -------------------------
65
+ # MAIN PIPELINE
66
+ # -------------------------
67
if __name__ == "__main__":
    # Manual end-to-end run with a fixed query.
    query = "Natural Language Processing"

    # Step 5: Hybrid Retrieval
    hybrid_results = hybrid_retrieve(query, top_k=5)

    print("\n--- Hybrid Retrieval Results ---\n")
    for hit in hybrid_results:
        print(f"{hit['name']} | {hit['faculty_category']} | Score: {hit['final_score']}")

    # Merge each hit with its stored document (document keys win on clashes).
    faculty_docs = load_faculty_documents()
    enriched_results = [
        {**hit, **faculty_docs.get(hit["faculty_id"], {})}
        for hit in hybrid_results
    ]

    # Step 6: LLM Reranking + Explainability
    print("\n--- LLM Reranked & Explained Results ---\n")
    llm_results = explain_and_rerank(query, enriched_results)

    for ranked in llm_results:
        print(f"Rank {ranked['rank']}: {ranked['name']} ({ranked['category']})")
        print(f"Reason: {ranked['reason']}")
        print("-" * 60)
rag/step_6_llm_explainability.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_google_genai import ChatGoogleGenerativeAI
2
+ from langchain_core.prompts import PromptTemplate
3
+ import json
4
+ import re
5
+
6
# SECURITY: the Gemini API key must not live in source control. No key is
# passed here, so ChatGoogleGenerativeAI reads it from the GOOGLE_API_KEY
# environment variable (api/main.py calls load_dotenv() before importing this
# module). The key that was previously hard-coded on this line has been
# published and should be revoked.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0.2,
)
11
+
12
+ PROMPT = PromptTemplate(
13
+ input_variables=["query", "candidates"],
14
+ template="""
15
+ You are an academic mentor advising a university student.
16
+
17
+ Student Query:
18
+ "{query}"
19
+
20
+ Below is a list of faculty candidates.
21
+ Each candidate has a UNIQUE faculty_id.
22
+
23
+ Candidates:
24
+ {candidates}
25
+
26
+ Your task:
27
+ 1. Rank ALL faculty from best to worst.
28
+ 2. For EACH faculty, explain:
29
+ - Alignment with the student's interest
30
+ - What the student gains
31
+ - Any limitations (adjunct role, availability, etc.)
32
+ 3. Use a student-friendly advisory tone.
33
+ 4. DO NOT mention scores.
34
+ 5. RETURN STRICT JSON ONLY.
35
+ 6. DO NOT invent or change faculty_id.
36
+
37
+ Required JSON format:
38
+ [
39
+ {{
40
+ "rank": 1,
41
+ "faculty_id": 48,
42
+ "reason": "Student-focused explanation (3–4 lines)"
43
+ }}
44
+ ]
45
+ """
46
+ )
47
+
48
+ # ---------- JSON SAFE PARSER ----------
49
+ def _extract_json(text: str):
50
+ try:
51
+ text = text.replace("```json", "").replace("```", "").strip()
52
+ start = text.find("[")
53
+ end = text.rfind("]") + 1
54
+ if start == -1 or end == -1:
55
+ return None
56
+ return json.loads(text[start:end])
57
+ except Exception:
58
+ return None
59
+
60
+
61
+ # ---------- MAIN ENTRY ----------
62
def explain_and_rerank(query, hybrid_results):
    """Ask the LLM to rank ``hybrid_results`` and explain each choice.

    Args:
        query: The student's free-text query.
        hybrid_results: Candidate dicts carrying at least ``faculty_id``,
            ``name`` and ``faculty_category``.

    Returns:
        A list of dicts with ``rank``, ``faculty_id``, ``name``,
        ``category`` and ``reason``, in the order the LLM returned them.
        Items that are malformed, duplicated, or name an id that was not
        among the candidates are skipped. When the reply yields no usable
        item, the candidates are returned in their incoming order with a
        generic reason.
    """
    def fallback():
        """Candidates in hybrid order, used when the LLM output is unusable."""
        return [
            {
                "rank": idx + 1,
                "faculty_id": r["faculty_id"],
                "name": r["name"],
                "category": r["faculty_category"],
                "reason": "AI explanation unavailable. Ranked based on hybrid relevance score."
            }
            for idx, r in enumerate(hybrid_results)
        ]

    # Build ID-anchored candidate list
    candidates_text = "\n".join(
        f"- faculty_id:{r['faculty_id']} | {r['name']} ({r['faculty_category']})"
        for r in hybrid_results
    )

    prompt = PROMPT.format(
        query=query,
        candidates=candidates_text
    )

    response = llm.invoke(prompt)
    content = response.content
    # A non-string content cannot be parsed; handle it as an empty reply.
    raw_output = content.strip() if isinstance(content, str) else ""

    parsed = _extract_json(raw_output)

    # HARD FALLBACK (never break API)
    if not isinstance(parsed, list):
        return fallback()

    faculty_map = {r["faculty_id"]: r for r in hybrid_results}

    final = []
    seen = set()
    for item in parsed:
        if not isinstance(item, dict):
            continue

        faculty_id = item.get("faculty_id")
        if not isinstance(faculty_id, (int, str)):
            continue

        # The model sometimes quotes numeric ids ("48" instead of 48).
        if faculty_id not in faculty_map and isinstance(faculty_id, str):
            try:
                faculty_id = int(faculty_id)
            except ValueError:
                continue

        if faculty_id not in faculty_map or faculty_id in seen:
            continue
        seen.add(faculty_id)

        faculty = faculty_map[faculty_id]
        final.append({
            # A missing rank falls back to the item's position in the reply.
            "rank": item.get("rank", len(final) + 1),
            "faculty_id": faculty_id,
            "name": faculty["name"],
            "category": faculty["faculty_category"],
            "reason": item.get("reason") or "No explanation was returned for this recommendation."
        })

    # Parsable JSON that names none of the candidates is as useless as no
    # JSON at all.
    if not final:
        return fallback()

    return final
rag/utils.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+
4
# Directory two levels above this file's resolved location.
BASE_DIR = Path(__file__).resolve().parents[1]
# Location of the faculty documents JSON artifact under BASE_DIR.
DOCS_PATH = BASE_DIR.joinpath("rag", "artifacts", "faculty_documents.json")
6
+
7
def load_faculty_documents():
    """Load the faculty documents JSON and index the entries by ``faculty_id``.

    Reads ``DOCS_PATH`` as UTF-8 JSON and iterates over the decoded
    entries, each of which must provide a ``faculty_id`` key.

    Returns:
        dict: Mapping of ``faculty_id`` to the corresponding entry. If an
        id occurs more than once, the last entry wins.
    """
    with DOCS_PATH.open("r", encoding="utf-8") as handle:
        records = json.load(handle)

    by_id = {}
    for record in records:
        by_id[record["faculty_id"]] = record
    return by_id
rag/vector_store/chroma_evidence/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed0f499b5545c1322806c433823e52d2c96f01af67e2c37e7cc2026796818d3e
3
+ size 7766016
rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8de47b1f77fc53b981dc21ddc4bac47721fad56a124537ac8bf6090a80ed778f
3
+ size 167600
rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
3
+ size 100
rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc151afd6f6ddb991c87835b97ff475e8e67b77ededde812d976ed2ad93e848
3
+ size 400
rag/vector_store/chroma_evidence/d0af11f5-f41b-495e-90e8-e60f3fa9bd34/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3
+ size 0
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ python-dotenv
4
+ pydantic
5
+ rank-bm25
6
+ langchain
7
+ langchain-community
8
+ langchain-huggingface
9
+ langchain-google-genai
10
+ chromadb
11
+ sentence-transformers
storage/__pycache__/db.cpython-312.pyc ADDED
Binary file (672 Bytes). View file
 
storage/__pycache__/db.cpython-313.pyc ADDED
Binary file (674 Bytes). View file
 
storage/__pycache__/fetch_faculty.cpython-312.pyc ADDED
Binary file (1.22 kB). View file
 
storage/__pycache__/insert_faculty.cpython-312.pyc ADDED
Binary file (1.57 kB). View file
 
storage/db.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ from pathlib import Path
3
+
4
# SQLite database file: <two levels above this file>/storage/faculty.db
DB_PATH = Path(__file__).resolve().parents[1].joinpath("storage", "faculty.db")
5
+
6
def get_connection():
    """Open a new SQLite connection to the database at ``DB_PATH``.

    Returns:
        sqlite3.Connection: An open connection whose ``row_factory`` is
        ``sqlite3.Row``, so fetched rows can be read by column name as
        well as by index. The connection is returned open; closing it is
        left to the caller.
    """
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection