Spaces:

kundan621
/

assignment

Sleeping

App Files Community

kundan621 committed on Aug 24, 2025

Commit

41f1bec

1 Parent(s): 03b3405

working guardrail

Browse files

Files changed (1) hide show

src/search_final.py +13 -27

src/search_final.py CHANGED Viewed

@@ -113,20 +113,6 @@ FAISS_PATH = os.path.join(OUT_DIR, "faiss_merged.index")
 BM25_PATH  = os.path.join(OUT_DIR, "bm25_merged.pkl")
 META_PATH  = os.path.join(OUT_DIR, "meta_merged.pkl")
-BLOCKED_TERMS = ["weather","cricket","movie","song","football","holiday",
-                 "travel","recipe","music","game","sports","politics","election"]
-FINANCE_DOMAINS = [
-    "financial reporting","balance sheet","income statement","assets and liabilities",
-    "equity","revenue","profit and loss","goodwill impairment","cash flow","dividends",
-    "taxation","investment","valuation","capital structure","ownership interests",
-    "subsidiaries","shareholders equity","expenses","earnings","debt","amortization","depreciation"
-]
-ALLOWED_COMPANY = ["make my trip","mmt"]
-# crude regex to detect "company-like" words (any capitalized word(s) followed by Ltd, Inc, Company, etc.)
-COMPANY_PATTERN = re.compile(r"\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s+(?:Ltd|Limited|Inc|Corporation|Corp|LLC|Group|Company|Bank))\b", re.IGNORECASE)
 # ---------------- Load Indexes ----------------
 logger.info("Loading FAISS, BM25, metadata, and models...")
@@ -184,29 +170,28 @@ def get_mistral_answer(query: str, context: str) -> str:
         return f"Error fetching answer from LLM: {e}"
 # ---------------- Guardrails ----------------
 finance_embeds = embed_model.encode(FINANCE_DOMAINS, convert_to_tensor=True)
 def validate_query(query: str, threshold: float = 0.5) -> bool:
     q_lower = query.lower()
-    # Blocklist check
     if any(bad in q_lower for bad in BLOCKED_TERMS):
         print("[Guardrail] Rejected by blocklist.")
         return False
-    # Check for company mentions
-    companies_found = COMPANY_PATTERN.findall(query)
-    if companies_found:
-        # If any company is mentioned, only allow MakeMyTrip
-        if not any(ALLOWED_COMPANY in c.lower() for c in companies_found):
-            print(f"[Guardrail] Rejected: company mention {companies_found}, not {ALLOWED_COMPANY}.")
-            return False
-    # Semantic similarity check with financial domain
     q_emb = embed_model.encode(query, convert_to_tensor=True)
     sim_scores = util.cos_sim(q_emb, finance_embeds)
     max_score = float(sim_scores.max())
     if max_score > threshold:
         print(f"[Guardrail] Accepted (semantic match {max_score:.2f})")
         return True
@@ -214,6 +199,7 @@ def validate_query(query: str, threshold: float = 0.5) -> bool:
         print(f"[Guardrail] Rejected (low semantic score {max_score:.2f})")
         return False
 #-------------------Output Guardrail------------------
 def validate_output(answer: str, context_docs: List[Dict]) -> str:
     combined_context = " ".join([doc["content"].lower() for doc in context_docs])

 BM25_PATH  = os.path.join(OUT_DIR, "bm25_merged.pkl")
 META_PATH  = os.path.join(OUT_DIR, "meta_merged.pkl")
 # ---------------- Load Indexes ----------------
 logger.info("Loading FAISS, BM25, metadata, and models...")
         return f"Error fetching answer from LLM: {e}"
 # ---------------- Guardrails ----------------
+# ---------------- Guardrails ----------------
+BLOCKED_TERMS = ["weather", "cricket", "movie", "song", "football", "holiday",
+                 "travel", "recipe", "music", "game", "sports", "politics", "election"]
+FINANCE_DOMAINS = [
+    "financial reporting", "balance sheet", "income statement",
+    "assets and liabilities", "equity", "revenue", "profit and loss",
+    "goodwill impairment", "cash flow", "dividends", "taxation",
+    "investment", "valuation", "capital structure", "ownership interests",
+    "subsidiaries", "shareholders equity", "expenses", "earnings",
+    "debt", "amortization", "depreciation"
+]
 finance_embeds = embed_model.encode(FINANCE_DOMAINS, convert_to_tensor=True)
 def validate_query(query: str, threshold: float = 0.5) -> bool:
     q_lower = query.lower()
     if any(bad in q_lower for bad in BLOCKED_TERMS):
         print("[Guardrail] Rejected by blocklist.")
         return False
     q_emb = embed_model.encode(query, convert_to_tensor=True)
     sim_scores = util.cos_sim(q_emb, finance_embeds)
     max_score = float(sim_scores.max())
     if max_score > threshold:
         print(f"[Guardrail] Accepted (semantic match {max_score:.2f})")
         return True
         print(f"[Guardrail] Rejected (low semantic score {max_score:.2f})")
         return False
 #-------------------Output Guardrail------------------
 def validate_output(answer: str, context_docs: List[Dict]) -> str:
     combined_context = " ".join([doc["content"].lower() for doc in context_docs])