Spaces:
Sleeping
Sleeping
File size: 5,136 Bytes
3387562 f63de94 e2aa18d d27a216 e2aa18d b3c41c2 e2aa18d b3c41c2 a594ee2 3387562 a594ee2 b3c41c2 a594ee2 3387562 b3c41c2 a594ee2 b3c41c2 a594ee2 b3c41c2 a594ee2 b3c41c2 a594ee2 b3c41c2 a594ee2 b3c41c2 e2aa18d b3c41c2 e2aa18d 3387562 e2aa18d 3387562 e2aa18d f63de94 e2aa18d 3387562 b3c41c2 e2aa18d 3387562 b3c41c2 3387562 e2aa18d 3387562 b3c41c2 e2aa18d a594ee2 e2aa18d 3387562 e2aa18d f63de94 b3c41c2 4ccc715 b3c41c2 4ccc715 3387562 4ccc715 3387562 4ccc715 f63de94 b3c41c2 f63de94 3387562 e2aa18d b3c41c2 3387562 a594ee2 e2aa18d b3c41c2 3387562 4ccc715 b3c41c2 3387562 e2aa18d 3387562 b3c41c2 3387562 b3c41c2 3387562 d27a216 b3c41c2 3387562 a594ee2 b3c41c2 a594ee2 3387562 b3c41c2 e2aa18d b3c41c2 3387562 b3c41c2 d27a216 a594ee2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import os
import re
import sys
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from flashrank import Ranker, RerankRequest
# --- GLOBAL ENGINES ---
# Lazily-initialized module-level singletons. All stay None until
# load_engines() runs, so the Space process can boot without loading models.
nlp = None        # spaCy pipeline (en_core_web_sm, parser disabled)
retriever = None  # SentenceTransformer bi-encoder (all-MiniLM-L6-v2)
ranker = None     # FlashRank reranker (ms-marco-TinyBERT-L-2-v2)
nli_model = None  # CrossEncoder for NLI entailment (nli-deberta-v3-base)
def load_engines():
    """Lazy-load all models so the Space starts without timing out.

    Populates the module-level singletons (nlp, retriever, ranker,
    nli_model) exactly once; subsequent calls return immediately.
    """
    global nlp, retriever, ranker, nli_model
    if nlp is not None:
        return
    print("⚡ Awakening Titanium Brain...")
    # 1. NLP Core (Spacy) — download the model on first run if missing.
    import spacy
    try:
        nlp = spacy.load("en_core_web_sm", disable=["parser"])
    except OSError:
        # spacy.load raises OSError when the model package is not installed;
        # a bare except here would also swallow KeyboardInterrupt/SystemExit.
        import subprocess
        subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm", "--quiet"])
        nlp = spacy.load("en_core_web_sm", disable=["parser"])
    # 2. Vector Search (MiniLM)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    retriever = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    # 3. Precision Reranker (FlashRank)
    ranker = Ranker(model_name="ms-marco-TinyBERT-L-2-v2", cache_dir="/app/cache")
    # 4. Logic Core (DeBERTa Cross-Encoder)
    # The CrossEncoder handles (Premise, Hypothesis) logic with high accuracy.
    nli_model = CrossEncoder('cross-encoder/nli-deberta-v3-base', device=device)
    print(f"✅ Engines Ready on {device.upper()}")
# --- UNIVERSAL GRAPH KNOWLEDGE BASE ---
class UniversalGraphKB:
    """In-memory hybrid (vector + BM25) knowledge base with a tiny entity graph.

    One entry per session key: the chunked text, MiniLM embeddings, a BM25
    index, and the book's dominant person/location as a context string.
    """

    def __init__(self):
        self.indices = {}        # key -> {"text", "vectors", "bm25"}
        self.context_graph = {}  # key -> "Main Person | Main Location"

    def get_chunks(self, text):
        """Split `text` into ~400-word chunks with a 50-word overlap.

        Chunks of 50 characters or fewer are dropped as noise.
        """
        words = re.findall(r'\S+', text)
        chunks = []
        for i in range(0, len(words), 350):
            # Join once per window (the original comprehension joined twice:
            # once for the length filter and once for the result).
            chunk = " ".join(words[i:i + 400])
            if len(chunk) > 50:
                chunks.append(chunk)
        return chunks

    def ingest_book(self, text, key="session"):
        """Index `text` under `key` and return the extracted context string."""
        chunks = self.get_chunks(text)
        # Entity graph: most frequent PERSON and GPE/LOC entity. The 100k-char
        # cap keeps spaCy NER fast on full-length books.
        doc = nlp(text[:100000])
        people = [ent.text.lower() for ent in doc.ents if ent.label_ == "PERSON"]
        locs = [ent.text.lower() for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]
        main_person = pd.Series(people).value_counts().index[0] if people else "Unknown"
        main_loc = pd.Series(locs).value_counts().index[0] if locs else "Unknown"
        self.context_graph[key] = f"{main_person.title()} | {main_loc.title()}"
        # Vector + BM25 hybrid indexing
        self.indices[key] = {
            "text": chunks,
            "vectors": retriever.encode(chunks, show_progress_bar=False),
            "bm25": BM25Okapi([re.findall(r'\w+', c.lower()) for c in chunks])
        }
        return self.context_graph[key]

kb = UniversalGraphKB()
# --- TITANIUM LOGIC ENGINE ---
def normalize_dates(text):
    """Append numeric century markers so years can be compared arithmetically.

    Lowercases `text` and, for every century word it contains (e.g.
    "nineteenth", "19th"), appends a representative mid-century year such
    as " (1850) " to the end of the string.
    """
    century_years = (
        ("eighteenth", "1750"), ("18th", "1750"),
        ("nineteenth", "1850"), ("19th", "1850"),
        ("twentieth", "1950"), ("20th", "1950"),
        ("twenty-first", "2050"), ("21st", "2050"),
    )
    result = text.lower()
    for word, year in century_years:
        if word in result:
            result = f"{result} ({year}) "
    return result
def extract_features(backstory, key="session"):
    """Score how well the indexed book supports `backstory`.

    Returns (entailment_probability, best_evidence_chunk, rationale).
    Pipeline: hybrid vector retrieval -> FlashRank rerank -> symbolic
    timeline guardrail -> cross-encoder entailment.
    """
    if key not in kb.indices:
        return 0.0, "No Data", "Ingestion Failed"
    index = kb.indices[key]
    graph_hint = kb.context_graph.get(key, "")
    # 1. Hybrid retrieval: augment the claim with the entity-graph hint.
    augmented = f"{backstory} (Context: {graph_hint})"
    query_vec = retriever.encode(augmented)
    sims = cosine_similarity([query_vec], index['vectors'])[0]
    # 2. Rerank the top-15 vector hits with FlashRank for precision.
    top_ids = sims.argsort()[-15:][::-1]
    passages = [{"id": i, "text": index['text'][i]} for i in top_ids]
    ranked = ranker.rerank(RerankRequest(query=backstory, passages=passages))
    evidence = ranked[0]['text']
    # 3. TITANIUM GUARDRAIL: if both claim and evidence mention years and no
    # pair is within 5 years, declare a timeline contradiction outright.
    year_pattern = r'\b([1-2][0-9]{3})\b'
    claim_years = [int(y) for y in re.findall(year_pattern, normalize_dates(backstory))]
    evid_years = [int(y) for y in re.findall(year_pattern, normalize_dates(evidence))]
    if claim_years and evid_years:
        timelines_align = any(abs(cy - ey) < 5 for cy in claim_years for ey in evid_years)
        if not timelines_align:
            return 0.0, evidence, f"TIMELINE DISCREPANCY: {claim_years[0]} vs {evid_years[0]}"
    # 4. NEURAL BRAIN: (Premise=evidence, Hypothesis=claim) entailment.
    logits = nli_model.predict([(evidence, augmented)])[0]
    # Softmax over the raw cross-encoder logits (numerically stabilized).
    shifted = np.exp(logits - np.max(logits))
    probs = shifted / shifted.sum()
    # Index 1 is treated as the entailment class — presumably matches the
    # nli-deberta-v3-base label order; verify against the model card.
    return float(probs[1]), evidence, "Neural Semantic Verification"
def predict_logic(book_text, backstory):
    """Main entry point for the API.

    Ingests `book_text`, scores `backstory` against it, and returns a
    JSON-serializable verdict dict with prediction, rationale, evidence
    snippet, and rounded score.
    """
    load_engines()
    kb.ingest_book(book_text, "session")
    score, ev, reason = extract_features(backstory, "session")
    # Final Verdict Threshold
    prediction = "Consistent" if score > 0.5 else "Contradiction"
    # Only append an ellipsis when the evidence was actually truncated
    # (the original unconditionally added "..." to short snippets too).
    evidence = ev[:400] + "..." if len(ev) > 400 else ev
    return {
        "prediction": prediction,
        "rationale": reason,
        "evidence": evidence,
        "score": round(score, 2)
    }