Spaces:

tejashsr
/

titanium-logic-engine

Sleeping

App Files Files Community

tejashsr committed on Jan 12

Commit

f63de94

verified ·

1 Parent(s): e39f0e1

Update model.py

Browse files

Files changed (1) hide show

model.py +55 -77

model.py CHANGED Viewed

@@ -1,4 +1,5 @@
-import os, re, sys, subprocess
 import pandas as pd
 import numpy as np
 import spacy
@@ -7,41 +8,34 @@ from flashrank import Ranker, RerankRequest
 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
 from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.ensemble import RandomForestClassifier
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-# ============================
-# 1. LAZY LOADING GLOBALS
-# ============================
 nlp = None
 retriever = None
 ranker = None
 tokenizer = None
 nli_model = None
-kb = None
-clf = None
-def load_engines_if_needed():
-    global nlp, retriever, ranker, tokenizer, nli_model, kb, clf
-    if nlp is None:
-        print("⏳ Lazy Loading: Starting Engines...")
-        try: nlp = spacy.load("en_core_web_sm", disable=["parser"])
-        except:
-            subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
-            nlp = spacy.load("en_core_web_sm", disable=["parser"])
-        retriever = SentenceTransformer('all-MiniLM-L6-v2')
-        ranker = Ranker(model_name="ms-marco-TinyBERT-L-2-v2", cache_dir="/opt")
-        tokenizer = AutoTokenizer.from_pretrained("cross-encoder/nli-deberta-v3-base")
-        nli_model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/nli-deberta-v3-base")
-        kb = UniversalGraphKB()
-        print("✅ Engines Ready")
-# ============================
-# 2. UNIVERSAL KNOWLEDGE GRAPH
-# ============================
 class UniversalGraphKB:
     def __init__(self):
         self.indices = {}
@@ -50,14 +44,16 @@ class UniversalGraphKB:
     def get_chunks(self, text):
         words = re.findall(r'\S+', text)
         chunks = []
-        step = 400
         for i in range(0, len(words), step):
             chunk = " ".join(words[i:i + 500])
             if len(chunk) > 50: chunks.append(chunk)
         return chunks
-    def ingest_book(self, text, key="session_book"):
         chunks = self.get_chunks(text)
         doc = nlp(text[:100000])
         names = [ent.text.lower() for ent in doc.ents if ent.label_ == "PERSON"]
         main_char = pd.Series(names).value_counts().index[0] if names else "Unknown"
@@ -70,11 +66,11 @@ class UniversalGraphKB:
         }
         return main_char.title()
-# ============================
-# 3. TITANIUM LOGIC (With Word-to-Number Patch)
-# ============================
 def normalize_dates(text):
-    """Converts text centuries to approximate years for the Regex to catch."""
     text = text.lower()
     mapping = {
         "eighteenth": "1750", "18th": "1750",
@@ -83,8 +79,7 @@ def normalize_dates(text):
         "twenty-first": "2050", "21st": "2050"
     }
     for word, year in mapping.items():
-        if word in text:
-            text += f" ({year}) " # Append the digit so Regex sees it
     return text
 def get_nli_score(premise, hypothesis):
@@ -92,78 +87,61 @@ def get_nli_score(premise, hypothesis):
     with torch.no_grad():
         outputs = nli_model(**inputs)
         probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]
-    # DeBERTa-v3-base NLI Output: [Contradiction, Entailment, Neutral] (usually)
-    # We return the Entailment score (Index 1) minus Contradiction (Index 0)
-    # Higher = More Consistent
-    return float(probs[1])
-def extract_features(backstory, book_key="session_book"):
-    if kb is None or book_key not in kb.indices: return [0,0], "", None, "Book not ingested."
-    idx = kb.indices[book_key]
-    protagonist = kb.protagonists.get(book_key, "")
-    # Pre-process backstory for dates
     backstory_norm = normalize_dates(backstory)
     aug_query = f"{backstory} (Context: {protagonist})"
     q_vec = retriever.encode(aug_query)
     v_scores = cosine_similarity([q_vec], idx['vectors'])[0]
     candidates = list(v_scores.argsort()[-30:][::-1])
     passages = [{"id": i, "text": idx['text'][i]} for i in candidates]
-    rerank_req = RerankRequest(query=backstory, passages=passages)
-    results = ranker.rerank(rerank_req)
     best_chunk = results[0]['text']
-    # NORMALIZE CHUNK FOR DATES TOO
     best_chunk_norm = normalize_dates(best_chunk)
-    # --- LOGIC GUARDRAILS ---
     if backstory.strip() in best_chunk:
         return [1.0, 0], best_chunk, 1, "VERIFIED: Exact Text Match"
     YEAR_PATTERN = r'\b([1-2][0-9]{3})\b'
     q_years = [int(y) for y in re.findall(YEAR_PATTERN, backstory_norm)]
     e_years = [int(y) for y in re.findall(YEAR_PATTERN, best_chunk_norm)]
     if q_years and e_years:
-        # If gap > 5 years -> Contradiction
         if not any(abs(by - ey) < 5 for by in q_years for ey in e_years):
             return [0.0, 1], best_chunk, 0, f"CRITICAL: Timeline Mismatch ({q_years[0]} vs {e_years[0]})"
     score = get_nli_score(aug_query, best_chunk)
     return [score, 0], best_chunk, None, ""
-# ============================
-# 4. PUBLIC WRAPPER
-# ============================
-def predict_for_website(backstory, book_text=None):
-    load_engines_if_needed()
-    if book_text:
-        kb.ingest_book(book_text, "session_book")
-    feats, ev, verdict, rat = extract_features(backstory, "session_book")
-    # 1. Guardrail Verdict (Math/Exact)
     if verdict is not None:
-        return {
-            "prediction": "Consistent" if verdict==1 else "Contradiction",
-            "confidence": 1.0,
-            "rationale": rat,
-            "evidence": ev[:300] + "...",
-            "protagonist": kb.protagonists.get("session_book", "Unknown")
-        }
-    # 2. Neural Verdict (NLI Score)
-    # Threshold 0.5: If Entailment > 0.5, it's consistent.
-    pred = 1 if feats[0] > 0.2 else 0
     return {
-        "prediction": "Consistent" if pred==1 else "Contradiction",
-        "confidence": round(feats[0], 2),
-        "rationale": f"Semantic Consistency Score: {feats[0]:.2f}",
-        "evidence": ev[:300] + "...",
-        "protagonist": kb.protagonists.get("session_book", "Unknown")
     }

+import re
+import sys
 import pandas as pd
 import numpy as np
 import spacy
 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
+# --- GLOBAL ENGINES (LAZY LOAD) ---
 nlp = None
 retriever = None
 ranker = None
 tokenizer = None
 nli_model = None
def load_engines():
    """Load every model the pipeline needs into the module-level globals.

    Populates ``nlp``, ``retriever``, ``ranker``, ``tokenizer`` and
    ``nli_model``. Once all five are set, further calls return immediately.
    """
    global nlp, retriever, ranker, tokenizer, nli_model
    engines = (nlp, retriever, ranker, tokenizer, nli_model)
    if all(engine is not None for engine in engines):
        return
    print("⚡ TITANIUM: Waking up Neural Engines...")

    # Build everything into locals first. If any load raises, no global is
    # touched, so the next call retries from scratch instead of returning
    # early with some engines still None.
    loaded_nlp = spacy.load("en_core_web_sm", disable=["parser"])
    # 1. Retrieval Engine
    loaded_retriever = SentenceTransformer('all-MiniLM-L6-v2')
    # 2. Rerank Engine
    loaded_ranker = Ranker(model_name="ms-marco-TinyBERT-L-2-v2", cache_dir="/app/cache")
    # 3. Logic Engine (DeBERTa-v3)
    nli_checkpoint = "cross-encoder/nli-deberta-v3-base"
    loaded_tokenizer = AutoTokenizer.from_pretrained(nli_checkpoint)
    loaded_nli = AutoModelForSequenceClassification.from_pretrained(nli_checkpoint)

    # Publish all engines together.
    nlp = loaded_nlp
    retriever = loaded_retriever
    ranker = loaded_ranker
    tokenizer = loaded_tokenizer
    nli_model = loaded_nli
    print("✅ TITANIUM: Ready.")
+# --- UNIVERSAL KNOWLEDGE GRAPH ---
 class UniversalGraphKB:
     def __init__(self):
         self.indices = {}
     def get_chunks(self, text, window=500, step=400):
         """Split *text* into overlapping chunks of whitespace-separated words.

         Args:
             text: Raw text to split.
             window: Maximum number of words per chunk.
             step: Number of words to advance between chunk starts; with the
                 defaults, consecutive chunks share 100 words.

         Returns:
             A list of chunk strings, words joined by single spaces. Chunks
             of 50 characters or fewer are dropped.

         Raises:
             ValueError: If ``window`` or ``step`` is less than 1.
         """
         if window < 1 or step < 1:
             raise ValueError("window and step must be positive integers")
         words = re.findall(r'\S+', text)
         candidates = (
             " ".join(words[start:start + window])
             for start in range(0, len(words), step)
         )
         # Very short fragments carry too little context to be useful evidence.
         return [chunk for chunk in candidates if len(chunk) > 50]
+    def ingest_book(self, text, key="session"):
         chunks = self.get_chunks(text)
+        # Auto-Protagonist Detection
         doc = nlp(text[:100000])
         names = [ent.text.lower() for ent in doc.ents if ent.label_ == "PERSON"]
         main_char = pd.Series(names).value_counts().index[0] if names else "Unknown"
         }
         return main_char.title()
+kb = UniversalGraphKB()
+# --- TITANIUM LOGIC GUARDRAILS ---
 def normalize_dates(text):
+    """Visual Confirmation: Turns words to numbers for the Logic Engine."""
     text = text.lower()
     mapping = {
         "eighteenth": "1750", "18th": "1750",
         "twenty-first": "2050", "21st": "2050"
     }
     for word, year in mapping.items():
+        if word in text: text += f" ({year}) "
     return text
 def get_nli_score(premise, hypothesis):
     with torch.no_grad():
         outputs = nli_model(**inputs)
         probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]
+    return float(probs[1]) # Entailment
def extract_features(backstory, key="session"):
    """Find the best evidence chunk for *backstory* and score its consistency.

    Args:
        backstory: Claim text to check against the ingested book.
        key: Name of the index in ``kb.indices`` to search.

    Returns:
        A 4-tuple ``(features, evidence, verdict, rationale)``.
        ``features`` is ``[score, timeline_flag]``. ``evidence`` is the top
        reranked chunk, or ``""`` when nothing was retrieved. ``verdict`` is
        ``1`` or ``0`` when a guardrail decided the outcome, and ``None`` when
        the caller should threshold ``score``. ``rationale`` explains a
        guardrail verdict or why no evidence was retrieved.
    """
    if key not in kb.indices:
        return [0, 0], "", None, "Book not uploaded"
    idx = kb.indices[key]
    # An index with no chunks cannot be searched; bail out instead of letting
    # the similarity search or results[0] raise.
    if len(idx['text']) == 0:
        return [0, 0], "", None, "Book has no searchable text"

    claim = backstory.strip()
    # A blank claim is a substring of every chunk, so without this guard the
    # exact-match check below would report it as verified.
    if not claim:
        return [0, 0], "", None, "Empty backstory"

    protagonist = kb.protagonists.get(key, "")

    # 1. Normalize
    backstory_norm = normalize_dates(backstory)
    aug_query = f"{backstory} (Context: {protagonist})"

    # 2. Search & Rerank: the 30 highest cosine scores, best first.
    q_vec = retriever.encode(aug_query)
    v_scores = cosine_similarity([q_vec], idx['vectors'])[0]
    candidates = list(v_scores.argsort()[-30:][::-1])
    passages = [{"id": i, "text": idx['text'][i]} for i in candidates]
    results = ranker.rerank(RerankRequest(query=backstory, passages=passages))
    if not results:
        return [0, 0], "", None, "No evidence retrieved"
    best_chunk = results[0]['text']
    best_chunk_norm = normalize_dates(best_chunk)

    # --- GUARDRAILS ---
    # A. Exact Match: the claim appears verbatim in the evidence.
    if claim in best_chunk:
        return [1.0, 0], best_chunk, 1, "VERIFIED: Exact Text Match"

    # B. Timeline Guardrail: compare four-digit years (1000-2999) found in the
    # normalized claim and evidence. If both mention years and no pair is
    # fewer than 5 years apart, treat it as a contradiction.
    YEAR_PATTERN = r'\b([1-2][0-9]{3})\b'
    q_years = [int(y) for y in re.findall(YEAR_PATTERN, backstory_norm)]
    e_years = [int(y) for y in re.findall(YEAR_PATTERN, best_chunk_norm)]
    if q_years and e_years:
        if not any(abs(by - ey) < 5 for by in q_years for ey in e_years):
            return [0.0, 1], best_chunk, 0, f"CRITICAL: Timeline Mismatch ({q_years[0]} vs {e_years[0]})"

    # C. Neural Semantic Check
    # NOTE(review): the claim is passed as the premise and the evidence as the
    # hypothesis; confirm this ordering is intended for the NLI model.
    score = get_nli_score(aug_query, best_chunk)
    return [score, 0], best_chunk, None, ""
+# --- API WRAPPER ---
def predict_logic(book_text, backstory):
    """Check *backstory* against a book and return a verdict dictionary.

    Args:
        book_text: Full text of the book to ingest under the ``"session"``
            key. When empty or ``None``, ingestion is skipped and whatever is
            already stored under ``"session"`` is used.
        backstory: Claim text to verify.

    Returns:
        A dict with keys ``"prediction"`` (``"Consistent"`` or
        ``"Contradiction"``), ``"rationale"``, ``"evidence"`` (first 350
        characters of the evidence chunk followed by ``"..."``) and
        ``"score"``.
    """
    load_engines()
    # Passing empty or None text to ingest_book would fail, so only ingest
    # when text was actually supplied.
    if book_text:
        kb.ingest_book(book_text, "session")
    feats, ev, verdict, rat = extract_features(backstory, "session")
    evidence = ev[:350] + "..."

    # Guardrail Triggered
    if verdict is not None:
        consistent = verdict == 1
        return {
            "prediction": "Consistent" if consistent else "Contradiction",
            "rationale": rat,
            "evidence": evidence,
            "score": 1.0 if consistent else 0.0,
        }

    # Neural Decision (Threshold 0.15)
    consistent = feats[0] > 0.15
    return {
        "prediction": "Consistent" if consistent else "Contradiction",
        # Prefer a rationale supplied by extract_features (e.g. "Book not
        # uploaded") so a failure is not reported as a bare 0.00 score.
        "rationale": rat or f"Semantic Consistency Score: {feats[0]:.2f}",
        "evidence": evidence,
        "score": round(feats[0], 2),
    }