Spaces:

Syamchand
/

red_ml-models

Running

App Files Community

Syamchand committed 7 days ago

Commit

cb0602a

verified ·

1 Parent(s): fa0788d

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -73

app.py CHANGED Viewed

@@ -1,23 +1,97 @@
 import torch
-from contextlib import asynccontextmanager
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-import torch
 from contextlib import asynccontextmanager
 from fastapi import FastAPI
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
 from sentence_transformers import SentenceTransformer
 from setfit import SetFitModel
-import numpy as np
-from typing import List
-from sentence_transformers import SentenceTransformer
-from setfit import SetFitModel
-import numpy as np
 from typing import List
 models = {}
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     print("Loading models...")
@@ -29,7 +103,7 @@ async def lifespan(app: FastAPI):
     )
     print("✓ contracts_clauses loaded")
-    # 2. Contract NLI (BERT-base — AutoTokenizer works fine)
     print("Loading contract NLI model...")
     models["nli_tokenizer"] = AutoTokenizer.from_pretrained("Syamchand/contract-nli-bert")
     models["nli_model"] = AutoModelForSequenceClassification.from_pretrained(
@@ -41,16 +115,7 @@ async def lifespan(app: FastAPI):
     # 3. Clause risk classifier
     print("Loading clause risk classifier...")
-    try:
-        # After editing tokenizer_config.json this will work directly
-        models["risk_tokenizer"] = AutoTokenizer.from_pretrained(
-            "Syamchand/clause_risk_classifier"
-        )
-    except ValueError:
-        # Fallback: direct class if config still has old tokenizer_class
-        models["risk_tokenizer"] = ModernBertTokenizerFast.from_pretrained(
-            "Syamchand/clause_risk_classifier"
-        )
     models["risk_model"] = AutoModelForSequenceClassification.from_pretrained(
         "Syamchand/clause_risk_classifier"
     )
@@ -61,29 +126,24 @@ async def lifespan(app: FastAPI):
     # 4. Legal BERT embeddings
     print("Loading legal BERT embeddings model...")
     models["emb_tokenizer"] = AutoTokenizer.from_pretrained("nlpaueb/bert-base-uncased-contracts")
-    models["emb_model"] = AutoModel.from_pretrained(
-        "nlpaueb/bert-base-uncased-contracts"
-    )
     models["emb_model"].eval()
     print("✓ legal BERT loaded")
-    # 5. Semantic chunker
     print("Loading semantic chunker model...")
-    models["chunker"] = SentenceTransformer(
-        "Raubachm/sentence-transformers-semantic-chunker",
-        device="cpu"
-    )
     print("✓ semantic chunker loaded")
     print("All models ready!")
     yield
     models.clear()
-    torch.cuda.empty_cache()
 app = FastAPI(lifespan=lifespan)
 # ---------- Schemas ----------
 class TextRequest(BaseModel):
     text: str
@@ -97,8 +157,9 @@ class EmbeddingRequest(BaseModel):
 class ChunkRequest(BaseModel):
     text: str
-    threshold: float = 0.7
-    max_chunk_tokens: int = 256
 class ClassificationResult(BaseModel):
     label: str
@@ -111,6 +172,7 @@ class ChunkResult(BaseModel):
     chunks: List[str]
 @app.get("/health")
 def health():
     return {"status": "ok"}
@@ -120,20 +182,18 @@ def health():
 def predict_contracts_clauses(req: TextRequest):
     preds = models["contracts_clauses"]([req.text])
     label_id = int(preds[0])
-    if hasattr(models["contracts_clauses"], "labels"):
-        label = models["contracts_clauses"].labels[label_id]
-    else:
-        label = f"class_{label_id}"
     return ClassificationResult(label=label, score=1.0)
 @app.post("/predict/nli", response_model=ClassificationResult)
 def predict_nli(req: PairRequest):
-    tok = models["nli_tokenizer"]
-    model = models["nli_model"]
-    inputs = tok(req.premise, req.hypothesis, return_tensors="pt", truncation=True)
     with torch.no_grad():
-        logits = model(**inputs).logits
         probs = torch.nn.functional.softmax(logits, dim=-1)
         class_id = torch.argmax(probs, dim=-1).item()
     return ClassificationResult(
@@ -144,11 +204,11 @@ def predict_nli(req: PairRequest):
 @app.post("/predict/risk", response_model=ClassificationResult)
 def predict_risk(req: TextRequest):
-    tok = models["risk_tokenizer"]
-    model = models["risk_model"]
-    inputs = tok(req.text, return_tensors="pt", truncation=True, max_length=512)
     with torch.no_grad():
-        logits = model(**inputs).logits
         probs = torch.nn.functional.softmax(logits, dim=-1)
         class_id = torch.argmax(probs, dim=-1).item()
     return ClassificationResult(
@@ -159,43 +219,27 @@ def predict_risk(req: TextRequest):
 def mean_pooling(model_output, attention_mask):
     token_embeddings = model_output.last_hidden_state
-    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 @app.post("/predict/embeddings", response_model=EmbeddingResult)
 def get_embeddings(req: EmbeddingRequest):
-    tok = models["emb_tokenizer"]
-    model = models["emb_model"]
-    encoded = tok(req.texts, padding=True, truncation=True, return_tensors="pt")
     with torch.no_grad():
-        outputs = model(**encoded)
     embeddings = mean_pooling(outputs, encoded["attention_mask"])
     return EmbeddingResult(embeddings=embeddings.tolist())
 @app.post("/predict/semantic_chunks", response_model=ChunkResult)
 def semantic_chunking(req: ChunkRequest):
-    model = models["chunker"]
-    sentences = [s.strip() for s in req.text.replace('\n', ' ').split('.') if s.strip()]
-    if not sentences:
-        return ChunkResult(chunks=[req.text])
-    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
-    chunks = []
-    current_chunk = [sentences[0]]
-    current_emb = sentence_embeddings[0]
-    for i in range(1, len(sentences)):
-        sim = torch.nn.functional.cosine_similarity(current_emb, sentence_embeddings[i], dim=0).item()
-        if sim >= req.threshold:
-            current_chunk.append(sentences[i])
-            chunk_embs = torch.stack([sentence_embeddings[j] for j in range(i - len(current_chunk) + 1, i + 1)])
-            current_emb = torch.mean(chunk_embs, dim=0)
-        else:
-            chunks.append('. '.join(current_chunk) + '.')
-            current_chunk = [sentences[i]]
-            current_emb = sentence_embeddings[i]
-    if current_chunk:
-        chunks.append('. '.join(current_chunk) + '.')
     return ChunkResult(chunks=chunks)

 import torch
+import numpy as np
 from contextlib import asynccontextmanager
 from fastapi import FastAPI
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
 from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
 from setfit import SetFitModel
 from typing import List
 models = {}
+# ---------- TextChunker (from Raubachm/sentence-transformers-semantic-chunker) ----------
class TextChunker:
    """Split text into semantically coherent chunks using sentence embeddings.

    The pipeline in ``chunk`` is: sentence-tokenize with NLTK, embed every
    sentence together with its neighbours, measure the cosine distance between
    consecutive embeddings, break where that distance exceeds a percentile
    threshold, and finally fold undersized interior chunks into whichever
    neighbour they resemble more.
    """

    # Flipped to True once the NLTK tokenizer data has been fetched, so the
    # download check is not repeated on every call to ``chunk``.
    _tokenizer_ready = False

    def __init__(self, st_model: "SentenceTransformer"):
        """Store the embedding model; only its ``encode`` method is used here."""
        self.model = st_model

    @classmethod
    def _ensure_tokenizer_data(cls) -> None:
        """Fetch the NLTK sentence-tokenizer data once per process."""
        if cls._tokenizer_ready:
            return
        import nltk
        punkt_ok = nltk.download("punkt", quiet=True)
        punkt_tab_ok = nltk.download("punkt_tab", quiet=True)
        # Only remember success when both fetches reported success; otherwise
        # the next call tries again, exactly as the per-call download used to.
        cls._tokenizer_ready = bool(punkt_ok and punkt_tab_ok)

    @staticmethod
    def _cosine(a, b) -> float:
        """Return the cosine similarity of two vectors (0.0 if either is all-zero)."""
        a = np.asarray(a, dtype=np.float64)
        b = np.asarray(b, dtype=np.float64)
        denom = float(np.linalg.norm(a) * np.linalg.norm(b))
        if denom == 0.0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def chunk(self, text: str, context_window: int = 1,
              percentile_threshold: float = 95, min_chunk_size: int = 3) -> List[str]:
        """Split ``text`` into a list of chunk strings.

        Args:
            text: The text to split.
            context_window: How many neighbouring sentences on each side are
                joined to a sentence before it is embedded.
            percentile_threshold: Percentile of the consecutive-sentence
                distances above which a chunk boundary is placed. It is passed
                straight to ``np.percentile``.
            min_chunk_size: Interior chunks with fewer ``". "``-separated
                pieces than this are merged into a neighbour.

        Returns:
            The chunk strings, or ``[text]`` when the tokenizer finds fewer
            than two sentences.
        """
        from nltk.tokenize import sent_tokenize

        self._ensure_tokenizer_data()
        sentences = sent_tokenize(text)
        if not sentences:
            return [text]
        contextualized = self._add_context(sentences, context_window)
        embeddings = self.model.encode(contextualized)
        distances = self._calculate_distances(embeddings)
        # A single sentence yields no consecutive pair, hence no distances.
        if not distances:
            return [text]
        breakpoints = self._identify_breakpoints(distances, percentile_threshold)
        initial_chunks = self._create_chunks(sentences, breakpoints)
        chunk_embeddings = self.model.encode(initial_chunks)
        return self._merge_small_chunks(initial_chunks, chunk_embeddings, min_chunk_size)

    def _add_context(self, sentences, window_size):
        """Return each sentence joined with up to ``window_size`` neighbours per side."""
        total = len(sentences)
        return [
            " ".join(sentences[max(0, i - window_size):min(total, i + window_size + 1)])
            for i in range(total)
        ]

    def _calculate_distances(self, embeddings):
        """Return ``1 - cosine similarity`` for every consecutive embedding pair.

        The result is a plain list of floats, one shorter than ``embeddings``
        (empty when there are fewer than two embeddings).
        """
        vectors = np.asarray(embeddings, dtype=np.float64)
        if len(vectors) < 2:
            return []
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Leave all-zero rows as zeros so their similarity comes out as 0.
        norms[norms == 0.0] = 1.0
        unit = vectors / norms
        similarities = (unit[:-1] * unit[1:]).sum(axis=1)
        return (1.0 - similarities).tolist()

    def _identify_breakpoints(self, distances, threshold_percentile):
        """Return the indices whose distance is strictly above the given percentile."""
        threshold = np.percentile(distances, threshold_percentile)
        return [i for i, d in enumerate(distances) if d > threshold]

    def _create_chunks(self, sentences, breakpoints):
        """Join sentences into chunks, ending a chunk after each breakpoint index."""
        chunks, start = [], 0
        for bp in breakpoints:
            chunks.append(" ".join(sentences[start:bp + 1]))
            start = bp + 1
        # Whatever follows the last breakpoint forms the final chunk.
        chunks.append(" ".join(sentences[start:]))
        return chunks

    def _merge_small_chunks(self, chunks, embeddings, min_size):
        """Fold undersized interior chunks into their more similar neighbour.

        A chunk's size is approximated by the number of pieces produced by
        splitting it on ``". "``. Only interior chunks are examined; the first
        and last chunk are never merged away themselves. The inputs are not
        modified: merging happens on private copies.
        """
        if len(chunks) <= 1:
            return list(chunks)
        pending = list(chunks)
        pending_emb = [np.asarray(e, dtype=np.float64) for e in embeddings]
        final_chunks = [pending[0]]
        merged_embeddings = [pending_emb[0]]
        for i in range(1, len(pending) - 1):
            if len(pending[i].split(". ")) < min_size:
                prev_sim = self._cosine(pending_emb[i], merged_embeddings[-1])
                next_sim = self._cosine(pending_emb[i], pending_emb[i + 1])
                if prev_sim > next_sim:
                    # Closer to what came before: append to the last kept chunk.
                    final_chunks[-1] = f"{final_chunks[-1]} {pending[i]}"
                    merged_embeddings[-1] = (merged_embeddings[-1] + pending_emb[i]) / 2
                else:
                    # Otherwise prepend to the following chunk, which is
                    # examined (or appended as the last chunk) afterwards.
                    pending[i + 1] = f"{pending[i]} {pending[i + 1]}"
                    pending_emb[i + 1] = (pending_emb[i] + pending_emb[i + 1]) / 2
            else:
                final_chunks.append(pending[i])
                merged_embeddings.append(pending_emb[i])
        final_chunks.append(pending[-1])
        return final_chunks
+# ---------- Lifespan ----------
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     print("Loading models...")
     )
     print("✓ contracts_clauses loaded")
+    # 2. Contract NLI
     print("Loading contract NLI model...")
     models["nli_tokenizer"] = AutoTokenizer.from_pretrained("Syamchand/contract-nli-bert")
     models["nli_model"] = AutoModelForSequenceClassification.from_pretrained(
     # 3. Clause risk classifier
     print("Loading clause risk classifier...")
+    models["risk_tokenizer"] = AutoTokenizer.from_pretrained("Syamchand/clause_risk_classifier")
     models["risk_model"] = AutoModelForSequenceClassification.from_pretrained(
         "Syamchand/clause_risk_classifier"
     )
     # 4. Legal BERT embeddings
     print("Loading legal BERT embeddings model...")
     models["emb_tokenizer"] = AutoTokenizer.from_pretrained("nlpaueb/bert-base-uncased-contracts")
+    models["emb_model"] = AutoModel.from_pretrained("nlpaueb/bert-base-uncased-contracts")
     models["emb_model"].eval()
     print("✓ legal BERT loaded")
+    # 5. Semantic chunker — load the backbone model specified in the Raubachm model card
     print("Loading semantic chunker model...")
+    st_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v1", device="cpu")
+    models["chunker"] = TextChunker(st_model)
     print("✓ semantic chunker loaded")
     print("All models ready!")
     yield
     models.clear()
 app = FastAPI(lifespan=lifespan)
 # ---------- Schemas ----------
 class TextRequest(BaseModel):
     text: str
 class ChunkRequest(BaseModel):
     # Request body for the semantic-chunking endpoint. The three tuning
     # fields are forwarded unchanged to the chunker's ``chunk`` method.

     # Text to split into chunks.
     text: str
     # Percentile of sentence-to-sentence distances above which a break is placed.
     percentile_threshold: float = 95.0
     # Number of neighbouring sentences per side embedded with each sentence.
     context_window: int = 1
     # Interior chunks smaller than this are merged into a neighbour.
     min_chunk_size: int = 3
 class ClassificationResult(BaseModel):
     label: str
     chunks: List[str]
+# ---------- Endpoints ----------
 @app.get("/health")
 def health():
     """Liveness probe: unconditionally report an ``ok`` status."""
     payload = {"status": "ok"}
     return payload
 def predict_contracts_clauses(req: TextRequest):
     """Classify a single clause text with the ``contracts_clauses`` model.

     Returns a ``ClassificationResult`` whose ``label`` is the model's label
     name when one can be resolved and ``class_<id>`` otherwise. ``score`` is
     the constant ``1.0``; no probability is read from the model here.
     """
     classifier = models["contracts_clauses"]
     prediction = classifier([req.text])[0]

     # NOTE(review): when a SetFit model carries a label list, prediction may
     # already return the label string rather than a class id - confirm against
     # the installed SetFit version. Pass such strings through untouched.
     if isinstance(prediction, str):
         return ClassificationResult(label=prediction, score=1.0)

     label_id = int(prediction)
     # ``labels`` may exist but be None or empty, so test its value, not merely
     # its presence, and guard the index before using it.
     labels = getattr(classifier, "labels", None)
     if labels and 0 <= label_id < len(labels):
         label = labels[label_id]
     else:
         label = f"class_{label_id}"
     return ClassificationResult(label=label, score=1.0)
 @app.post("/predict/nli", response_model=ClassificationResult)
 def predict_nli(req: PairRequest):
+    inputs = models["nli_tokenizer"](
+        req.premise, req.hypothesis, return_tensors="pt", truncation=True
+    )
     with torch.no_grad():
+        logits = models["nli_model"](**inputs).logits
         probs = torch.nn.functional.softmax(logits, dim=-1)
         class_id = torch.argmax(probs, dim=-1).item()
     return ClassificationResult(
 @app.post("/predict/risk", response_model=ClassificationResult)
 def predict_risk(req: TextRequest):
+    inputs = models["risk_tokenizer"](
+        req.text, return_tensors="pt", truncation=True, max_length=512
+    )
     with torch.no_grad():
+        logits = models["risk_model"](**inputs).logits
         probs = torch.nn.functional.softmax(logits, dim=-1)
         class_id = torch.argmax(probs, dim=-1).item()
     return ClassificationResult(
 def mean_pooling(model_output, attention_mask):
     """Average the token embeddings of each sequence, ignoring masked positions.

     ``model_output.last_hidden_state`` supplies the token embeddings. The
     attention mask is broadcast over the embedding dimension and used as
     weights; the divisor is clamped to ``1e-9`` so a fully masked sequence
     does not divide by zero.
     """
     hidden = model_output.last_hidden_state
     weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
     weighted_sum = (hidden * weights).sum(dim=1)
     token_counts = weights.sum(dim=1).clamp(min=1e-9)
     return weighted_sum / token_counts
 @app.post("/predict/embeddings", response_model=EmbeddingResult)
 def get_embeddings(req: EmbeddingRequest):
     """Embed ``req.texts`` and return one mean-pooled vector per text.

     The texts are tokenized as one padded, truncated batch, run through the
     embedding model with gradient tracking disabled, and pooled with
     ``mean_pooling`` using the batch's attention mask.
     """
     tokenize = models["emb_tokenizer"]
     batch = tokenize(req.texts, padding=True, truncation=True, return_tensors="pt")
     with torch.no_grad():
         encoder = models["emb_model"]
         hidden = encoder(**batch)
     pooled = mean_pooling(hidden, batch["attention_mask"])
     return EmbeddingResult(embeddings=pooled.tolist())
 @app.post("/predict/semantic_chunks", response_model=ChunkResult)
 def semantic_chunking(req: ChunkRequest):
     """Split ``req.text`` into semantic chunks with the loaded chunker.

     The request's ``context_window``, ``percentile_threshold`` and
     ``min_chunk_size`` are forwarded to the chunker's ``chunk`` method.

     Raises:
         HTTPException: 422 when ``percentile_threshold`` is not within
             0-100 (this also rejects NaN).
     """
     # Imported here because the module-level import only brings in FastAPI.
     from fastapi import HTTPException

     # The threshold ends up in np.percentile, which rejects values outside
     # 0-100; answer with a client error rather than an unhandled exception.
     if not 0 <= req.percentile_threshold <= 100:
         raise HTTPException(
             status_code=422,
             detail="percentile_threshold must be between 0 and 100",
         )

     chunks = models["chunker"].chunk(
         text=req.text,
         context_window=req.context_window,
         percentile_threshold=req.percentile_threshold,
         min_chunk_size=req.min_chunk_size,
     )
     return ChunkResult(chunks=chunks)