hsg_eaa_overhaul / src /rag /programme_facts.py
Pygmales
updated repo status
1c8f647
import logging
import re
from dataclasses import dataclass, field
from typing import Callable
@dataclass
class ProgrammeFacts:
    """Container for the facts extracted for a single programme.

    Only ``programme`` is required. Every list field defaults to its own
    fresh empty list, so instances never share mutable state.
    """

    # Programme key these facts belong to.
    programme: str
    # True when the retrieved context yielded at least one usable sentence.
    source_available: bool = False
    # Sentences selected by the focus keyword list.
    focus_points: list[str] = field(default_factory=list)
    # Sentences selected by the fit / admission keyword list.
    fit_points: list[str] = field(default_factory=list)
    # Sentences selected by the timing / cost / structure keyword list.
    timing_points: list[str] = field(default_factory=list)
    # Sentences selected by the application-document keyword list.
    document_points: list[str] = field(default_factory=list)
    # The unprocessed context string the sentences were taken from.
    raw_context: str = ""
class ProgrammeFactsProvider:
    """Extract lightweight programme facts from retrieved RAG context.

    This keeps volatile programme data in the scraped/imported knowledge base
    instead of hardcoding it in the conversation-routing layer.

    The provider calls the injected ``retrieve_context`` callable with a fixed
    keyword query, a programme filter and a language code, splits the returned
    text into sentence-like chunks, and buckets those chunks by keyword.
    Results are cached per ``(programme, language)`` once the context yielded
    at least one usable sentence.
    """

    _logger = logging.getLogger(__name__)

    # Normalised programme key -> filter value passed to ``retrieve_context``.
    _PROGRAM_FILTERS = {
        "emba": "emba",
        "iemba": "iemba",
        "emba_x": "emba x",
    }
    # Accepted spellings (after normalisation) -> canonical programme key.
    _PROGRAMME_ALIASES = {
        "emba_x": "emba_x",
        "embax": "emba_x",
        "iemba": "iemba",
        "iemba_hsg": "iemba",
        "international_emba": "iemba",
        "emba": "emba",
        "emba_hsg": "emba",
    }
    _DEFAULT_LANGUAGE = "en"
    _QUERY_BY_LANGUAGE = {
        "de": (
            "Bewerbung Zulassung Voraussetzungen Studiengebühr Start Datum Dauer "
            "Bewerbungsfrist Unterlagen Dokumente CV Zeugnisse Führungserfahrung "
            "Berufserfahrung Sprache Module Präsenzwochen Wahlkurse Capstone"
        ),
        "en": (
            "application admissions requirements tuition start date duration deadline "
            "documents CV certificates transcripts leadership experience professional "
            "experience language modules campus weeks electives capstone"
        ),
    }
    _FOCUS_TERMS = (
        "focus",
        "ziel",
        "ziele",
        "fokus",
        "management",
        "leadership",
        "transformation",
        "innovation",
        "international",
        "dach",
    )
    _FIT_TERMS = (
        "requirement",
        "requirements",
        "admission",
        "admissions",
        "zulassung",
        "voraussetzung",
        "degree",
        "abschluss",
        "experience",
        "erfahrung",
        "leadership",
        "führung",
        "fuehrung",
        "english",
        "englisch",
        "german",
        "deutsch",
    )
    _TIMING_TERMS = (
        "tuition",
        "fee",
        "fees",
        "studiengebühr",
        "studiengebuehr",
        "chf",
        "start",
        "duration",
        "dauer",
        "months",
        "monate",
        "deadline",
        "bewerbungsfrist",
        "core course",
        "kernkurs",
        "elective",
        "wahlkurs",
        "campus week",
        "präsenzwoche",
        "praesenzwoche",
        "abroad",
        "auslandsmodul",
        "capstone",
    )
    _DOCUMENT_TERMS = (
        "document",
        "documents",
        "unterlagen",
        "dokument",
        "dokumente",
        "cv",
        "resume",
        "zeugnis",
        "zeugnisse",
        "certificate",
        "certificates",
        "transcript",
        "online-bewerbung",
        "online application",
        "online-assessment",
        "online assessment",
        "application file",
        "bewerbungsakte",
    )
    # Chunks containing any of these substrings are discarded as boilerplate.
    _NOISE_TERMS = (
        "vielen dank für ihr interesse",
        "vielen dank fuer ihr interesse",
        "senior recruitment",
        "admissions manager",
        "bei allgemeinen anfragen",
        "allgemeinen anfragen",
        "kontakt cyra",
        "kontakt kristin",
        "kontakt teyuna",
        "impact story",
        "alumnus",
        "alumni",
        "wir sprachen mit",
        "du warst teilnehmer",
        "für mich war die",
        "fuer mich war die",
        "unterlagen und werkzeuge",
        "jeder kurswoche",
        "lernerfahrungen",
        "diplomarbeit",
        "preis ausgezeichnet",
        "ich bin mir ganz sicher",
        "beruflichen fortschritt",
        "tools, dem netzwerk",
        "hsg mitnehmen",
    )

    # Chunk-size limits applied by ``_split_sentences``.
    _MIN_SENTENCE_LENGTH = 20
    _MAX_SENTENCE_LENGTH = 320

    # Patterns are compiled once instead of on every call.
    _HEADING_MARKER_RE = re.compile(r"#{1,6}\s*")
    # Split on newlines, on whitespace after sentence punctuation, or on an
    # inline bullet. The negative lookbehinds keep "15. September" (1-2 digit
    # ordinals) and "St. Gallen" together; a four-digit year followed by a
    # period still ends a sentence.
    _SENTENCE_BOUNDARY_RE = re.compile(
        r"\n+"
        r"|(?<=[.!?])(?<!\b\d\.)(?<!\b\d\d\.)(?<!\b[Ss]t\.)\s+"
        r"|(?:\s+•\s+)"
    )
    _WHITESPACE_RUN_RE = re.compile(r"\s+")
    # Short chunks are still kept when they carry a numeric date or a CHF amount.
    _COMPACT_FACT_RE = re.compile(r"\b\d{1,2}[./]\d{1,2}[./]\d{2,4}\b|chf\s*[\d']")
    _SEPARATOR_RUN_RE = re.compile(r"[\s_-]+")

    def __init__(self, retrieve_context: Callable[[str, str, str], str]) -> None:
        """Store the retrieval callable and start with an empty cache.

        Args:
            retrieve_context: Called as ``retrieve_context(query,
                program_filter, language)``; its result is treated as the
                context text (a falsy result counts as "no context").
        """
        self._retrieve_context = retrieve_context
        self._cache: dict[tuple[str, str], ProgrammeFacts] = {}

    def get_facts(self, programme: str, language: str) -> ProgrammeFacts:
        """Return the facts for ``programme`` in ``language``.

        Args:
            programme: Programme name in any accepted spelling.
            language: Language tag; anything whose primary subtag is not a
                supported language falls back to English.

        Returns:
            The extracted facts. The same cached object is returned on later
            calls for the same key, so callers should not mutate it.
        """
        normalized_programme = self._normalize_programme(programme)
        normalized_language = self._normalize_language(language)
        cache_key = (normalized_programme, normalized_language)

        cached = self._cache.get(cache_key)
        if cached is not None:
            return cached

        query = self._QUERY_BY_LANGUAGE[normalized_language]
        program_filter = self._PROGRAM_FILTERS.get(normalized_programme, normalized_programme)
        try:
            context = self._retrieve_context(query, program_filter, normalized_language) or ""
        except Exception:
            # Retrieval is best-effort: degrade to "no facts" but leave a trace
            # so a broken retriever does not go unnoticed.
            self._logger.warning(
                "Programme context retrieval failed for programme=%r language=%r",
                normalized_programme,
                normalized_language,
                exc_info=True,
            )
            context = ""

        facts = self._extract_facts(normalized_programme, context)
        # Empty results are not cached, so a later call retries the retrieval.
        if facts.source_available:
            self._cache[cache_key] = facts
        return facts

    def _extract_facts(self, programme: str, context: str) -> ProgrammeFacts:
        """Split ``context`` into sentences and bucket them by keyword list."""
        sentences = self._split_sentences(context)
        return ProgrammeFacts(
            programme=programme,
            source_available=bool(sentences),
            focus_points=self._select_sentences(sentences, self._FOCUS_TERMS, limit=2),
            fit_points=self._select_sentences(sentences, self._FIT_TERMS, limit=3),
            timing_points=self._select_sentences(sentences, self._TIMING_TERMS, limit=4),
            document_points=self._select_sentences(sentences, self._DOCUMENT_TERMS, limit=3),
            raw_context=context,
        )

    @staticmethod
    def _normalize_language(language: str) -> str:
        """Reduce a language tag to a supported two-letter code.

        ``"DE"``, ``"de-CH"`` and ``"de_CH"`` all map to ``"de"``. Unsupported
        or non-string values map to the English default.
        """
        provider = ProgrammeFactsProvider
        if not isinstance(language, str):
            return provider._DEFAULT_LANGUAGE
        primary = language.strip().lower().replace("_", "-").split("-", 1)[0]
        if primary in provider._QUERY_BY_LANGUAGE:
            return primary
        return provider._DEFAULT_LANGUAGE

    @staticmethod
    def _normalize_programme(programme: str) -> str:
        """Map a programme spelling to its canonical key.

        The input is lower-cased, and runs of whitespace, hyphens and
        underscores collapse to a single underscore (leading and trailing
        separators are dropped). Unknown programmes are returned in that
        normalised form.
        """
        provider = ProgrammeFactsProvider
        lowered = (programme or "").strip().lower()
        normalized = provider._SEPARATOR_RUN_RE.sub("_", lowered).strip("_")
        return provider._PROGRAMME_ALIASES.get(normalized, normalized)

    @staticmethod
    def _split_sentences(text: str) -> list[str]:
        """Break retrieved context into cleaned, sentence-like chunks.

        Markdown heading markers and table pipes are treated as line breaks.
        Chunks shorter than the minimum length are dropped unless they contain
        a numeric date or a CHF amount. Chunks containing a noise phrase are
        dropped. Over-long chunks are truncated and suffixed with ``...``.
        """
        provider = ProgrammeFactsProvider
        raw_text = (text or "").strip()
        if not raw_text:
            return []

        raw_text = provider._HEADING_MARKER_RE.sub("\n", raw_text)
        raw_text = raw_text.replace("|", "\n")

        sentences: list[str] = []
        for chunk in provider._SENTENCE_BOUNDARY_RE.split(raw_text):
            normalized = provider._WHITESPACE_RUN_RE.sub(" ", chunk).strip(" -•\t\n")
            normalized_lower = normalized.lower()

            is_short = len(normalized) < provider._MIN_SENTENCE_LENGTH
            if is_short and not provider._COMPACT_FACT_RE.search(normalized_lower):
                continue
            if any(term in normalized_lower for term in provider._NOISE_TERMS):
                continue

            if len(normalized) > provider._MAX_SENTENCE_LENGTH:
                # Reserve three characters for the ellipsis.
                cut = provider._MAX_SENTENCE_LENGTH - 3
                normalized = normalized[:cut].rstrip() + "..."
            sentences.append(normalized)
        return sentences

    @staticmethod
    def _select_sentences(sentences: list[str], terms: tuple[str, ...], limit: int) -> list[str]:
        """Return up to ``limit`` distinct sentences containing any of ``terms``.

        Matching is a case-insensitive substring test, which also catches
        German compounds. Input order is preserved, and duplicates that differ
        only in case are returned once. A non-positive ``limit`` yields an
        empty list.
        """
        if limit <= 0:
            return []

        selected: list[str] = []
        seen: set[str] = set()
        for sentence in sentences:
            key = sentence.lower()
            if key in seen or not any(term in key for term in terms):
                continue
            selected.append(sentence)
            seen.add(key)
            if len(selected) >= limit:
                break
        return selected