santanche committed on
Commit
29dff86
·
1 Parent(s): 2d5dc60

feat (comparison): three models comparison

Browse files
app/clinical_embedding.py CHANGED
@@ -1,67 +1,282 @@
1
  import numpy as np
2
- from transformers import pipeline
3
- from typing import List
 
 
 
 
4
 
5
- class ClinicalBERT:
 
 
 
 
 
 
 
6
  """
7
- A wrapper class for Bio_ClinicalBERT model to generate sentence embeddings.
8
  """
9
-
10
- def __init__(self, model_name: str = "emilyalsentzer/Bio_ClinicalBERT", device: int = -1):
 
 
 
 
 
 
 
 
11
  """
12
- Initialize the ClinicalBERT model using pipeline.
13
-
14
- Args:
15
- model_name: The Hugging Face model identifier
16
- device: Device to run the model on (-1 for CPU, 0 for first GPU, etc.)
17
  """
18
- self.model_name = model_name
 
19
 
20
- # Create feature extraction pipeline
21
- print(f"Loading {model_name}...")
22
- self.pipe = pipeline(
23
- "feature-extraction",
24
- model=model_name,
25
- device=device
26
- )
27
- print(f"Model loaded successfully on device {device}")
28
-
29
- def get_embeddings(self, sentences: List[str], pooling: str = 'cls') -> np.ndarray:
30
- """
31
- Generate embeddings for a list of sentences.
32
 
33
- Args:
34
- sentences: List of input sentences
35
- pooling: Pooling strategy ('mean', 'cls', or 'max')
36
 
37
- Returns:
38
- numpy array of shape (num_sentences, embedding_dim)
39
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  if not sentences:
41
  return np.array([])
 
 
42
 
43
- # Get embeddings from pipeline
44
- # The pipeline returns a list with shape (1, num_tokens, embedding_dim) per sentence
45
- outputs = self.pipe(sentences)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- # Apply pooling strategy to each sentence
48
- embeddings = []
49
- for sentence_output in outputs:
50
- # Convert to numpy array and squeeze the first dimension
51
- # Shape: (1, num_tokens, embedding_dim) -> (num_tokens, embedding_dim)
52
- tokens_array = np.array(sentence_output).squeeze(0)
53
-
54
- if pooling == 'cls':
55
- # Use [CLS] token (first token)
56
- embedding = tokens_array[0]
57
- elif pooling == 'max':
58
- # Max pooling across tokens (dim 0)
59
- embedding = np.max(tokens_array, axis=0)
60
- else: # mean pooling (default)
61
- # Average across all tokens (dim 0)
62
- embedding = np.mean(tokens_array, axis=0)
63
-
64
- embeddings.append(embedding)
 
 
 
 
65
 
66
- # Stack embeddings into a 2D array: (num_sentences, embedding_dim)
67
- return np.vstack(embeddings)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import numpy as np
2
+ from transformers import AutoTokenizer, AutoModel
3
+ import torch
4
+ import re
5
+ from typing import List, Tuple, Union, Optional
6
+ import gensim.downloader as api
7
+ from abc import ABC, abstractmethod
8
 
9
class BaseEmbedder(ABC):
    """Common interface implemented by every embedding backend."""

    @abstractmethod
    def get_embeddings(self, sentences: List[str], pooling: str = 'cls') -> np.ndarray:
        """Return one embedding vector per input sentence as a 2-D array."""
        pass
15
+
16
class BertEmbedder(BaseEmbedder):
    """
    Wrapper for BERT-based models (ClinicalBERT, BERT, etc.)

    Sentences may mark a span of interest with square brackets, e.g.
    "patient shows [elevated troponin] levels"; pooling is then applied
    only to the tokens inside the brackets, while the full sentence still
    provides context to the encoder.
    """

    def __init__(self, model_name: str, device: int = -1):
        """
        Load the tokenizer and model.

        Args:
            model_name: The Hugging Face model identifier.
            device: -1 for CPU, otherwise the CUDA device index.
        """
        self.output_hidden_states = True
        # Fix: accept any non-negative GPU index, not just index 0 — the
        # previous check (`device == 0`) silently fell back to CPU for
        # device=1, 2, ... despite the documented convention.
        self.device = f"cuda:{device}" if device >= 0 and torch.cuda.is_available() else "cpu"
        print(f"Loading {model_name} on {self.device}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.model.eval()
        print(f"Model {model_name} loaded successfully.")

    def _extract_bracketed_content(self, text: str) -> Tuple[str, List[Tuple[int, int]]]:
        """
        Strip [brackets] from *text* and record where their contents land.

        Returns:
            (clean_text, spans) where spans is a list of (start, end)
            character ranges in clean_text covering each bracketed target.
            If no brackets are present, a single span covers the whole text.
        """
        clean_text = ""
        target_spans = []  # (start_char, end_char) ranges in clean_text
        cursor = 0
        i = 0
        while i < len(text):
            if text[i] == '[':
                end_bracket = text.find(']', i)
                if end_bracket != -1:
                    # Copy text preceding the bracket, then the bare content.
                    clean_text += text[cursor:i]
                    content = text[i + 1:end_bracket]
                    start_span = len(clean_text)
                    clean_text += content
                    target_spans.append((start_span, len(clean_text)))
                    cursor = end_bracket + 1
                    i = end_bracket + 1
                    continue
            i += 1

        # Append any text after the last bracket (or all of it if none).
        clean_text += text[cursor:]

        if not target_spans:
            # No brackets: treat the entire sentence as the target.
            return text, [(0, len(text))]
        return clean_text, target_spans

    def get_embeddings(self, sentences: List[str], pooling: str = 'cls') -> np.ndarray:
        """
        Embed each sentence, honoring bracketed target spans.

        Args:
            sentences: Input sentences, optionally with [bracketed] targets.
            pooling: 'cls', 'mean' or 'max'. When a sentence has a bracketed
                span, 'cls' falls back to mean pooling over the span's
                tokens (a sub-span has no CLS token of its own).

        Returns:
            Array of shape (num_sentences, hidden_size); empty array for
            empty input.
        """
        if not sentences:
            return np.array([])

        embeddings_list = []
        for sent in sentences:
            clean_text, target_spans = self._extract_bracketed_content(sent)

            # Offset mapping lets us align character spans to token indices.
            inputs = self.tokenizer(
                clean_text,
                return_tensors="pt",
                truncation=True,
                padding=True,  # Padding not strictly needed for size 1 but good practice
                return_offsets_mapping=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask
                )

            # shape: (seq_len, hidden_dim) after dropping the batch dim
            last_hidden_state = outputs.last_hidden_state[0]
            offset_mapping = inputs.offset_mapping[0].cpu().numpy()

            # Collect indices of tokens overlapping any target span.
            target_token_indices = []
            for token_idx, (start_offset, end_offset) in enumerate(offset_mapping):
                if start_offset == 0 and end_offset == 0:
                    continue  # special tokens ([CLS]/[SEP]) map to (0, 0)
                # Half-open interval overlap test between token and span.
                is_in_target = any(
                    end_offset > span_start and start_offset < span_end
                    for span_start, span_end in target_spans
                )
                if is_in_target:
                    target_token_indices.append(token_idx)

            has_brackets = (clean_text != sent)

            if not target_token_indices:
                # No specific target tokens matched: use the whole sentence.
                if pooling == 'cls':
                    selected_tokens = last_hidden_state[0:1]  # the [CLS] token
                elif len(last_hidden_state) > 2:
                    selected_tokens = last_hidden_state[1:-1]  # drop CLS/SEP
                else:
                    selected_tokens = last_hidden_state  # fallback
            else:
                selected_tokens = last_hidden_state[target_token_indices]

            if len(selected_tokens) == 0:
                # Defensive fallback: zero vector if nothing was selected.
                embedding = np.zeros(self.model.config.hidden_size)
            elif pooling == 'max':
                embedding = torch.max(selected_tokens, dim=0)[0].cpu().numpy()
            elif pooling == 'cls' and not has_brackets:
                embedding = last_hidden_state[0].cpu().numpy()
            else:
                # 'mean', or 'cls' requested on a bracketed sub-span.
                embedding = torch.mean(selected_tokens, dim=0).cpu().numpy()

            embeddings_list.append(embedding)

        # (num_sentences, embedding_dim)
        return np.vstack(embeddings_list)
195
+
196
+
197
class Word2VecEmbedder(BaseEmbedder):
    """
    Wrapper for Word2Vec-style vectors (via Gensim's downloader).

    If a sentence contains [brackets], only the bracketed words are
    embedded; otherwise all words are used. Word vectors are pooled with
    max or mean ('cls' falls back to mean — Word2Vec has no CLS token).
    """

    def __init__(self, model_name: str = "glove-wiki-gigaword-50"):
        """
        Load a pretrained vector model by gensim-downloader name.

        Args:
            model_name: The gensim downloader identifier.
        """
        print(f"Loading Word2Vec model {model_name}...")
        try:
            self.model = api.load(model_name)
            print(f"Word2Vec model {model_name} loaded.")
        except Exception as e:
            # Best effort: keep the service alive; get_embeddings will
            # return an empty array while the model is unavailable.
            print(f"Failed to load gensim model: {e}")
            self.model = None

    def _extract_words_and_brackets(self, text: str) -> List[str]:
        """
        Tokenize *text* into lowercase words.
        If brackets are present, ONLY returns words inside brackets.
        If no brackets, returns all words.
        """
        targets = re.findall(r'\[(.*?)\]', text)

        words = []
        if targets:
            # Process only the content inside brackets, joined into one stream.
            full_target_text = " ".join(targets)
            words = re.findall(r'\b\w+\b', full_target_text.lower())
        else:
            words = re.findall(r'\b\w+\b', text.lower())

        return words

    def get_embeddings(self, sentences: List[str], pooling: str = 'cls') -> np.ndarray:
        """
        Embed each sentence by pooling its word vectors.

        Args:
            sentences: Input sentences, optionally with [bracketed] targets.
            pooling: 'max' for max pooling; anything else means mean.

        Returns:
            Array of shape (num_sentences, vector_size); empty array if the
            model failed to load or the input is empty.
        """
        if self.model is None:
            return np.array([])
        # Fix: guard empty input — np.vstack([]) would raise ValueError.
        if not sentences:
            return np.array([])

        embeddings_list = []
        vector_size = self.model.vector_size

        for sent in sentences:
            words = self._extract_words_and_brackets(sent)

            # Keep only in-vocabulary words.
            valid_vectors = [self.model[w] for w in words if w in self.model]

            if not valid_vectors:
                # No known word in the sentence: fall back to a zero vector.
                embeddings_list.append(np.zeros(vector_size))
                continue

            vectors_np = np.vstack(valid_vectors)

            if pooling == 'max':
                emb = np.max(vectors_np, axis=0)
            else:
                # Mean for 'mean' and 'cls' (w2v has no CLS)
                emb = np.mean(vectors_np, axis=0)

            embeddings_list.append(emb)

        return np.vstack(embeddings_list)
266
+
267
# Factory/Container
class ModelManager:
    """Lazily constructs and caches one embedder per model type."""

    def __init__(self):
        # model_type -> embedder instance, populated on first request
        self.models = {}

    def get_model(self, model_type: str):
        """Return the cached embedder for *model_type*, building it on demand."""
        cached = self.models.get(model_type)
        if cached is not None:
            return cached

        # Dispatch table of supported model types -> factory callables.
        factories = {
            'clinical_bert': lambda: BertEmbedder("emilyalsentzer/Bio_ClinicalBERT"),
            'bert': lambda: BertEmbedder("bert-base-uncased"),
            'word2vec': lambda: Word2VecEmbedder(),
        }
        factory = factories.get(model_type)
        if factory is None:
            raise ValueError(f"Unknown model type: {model_type}")

        self.models[model_type] = factory()
        return self.models[model_type]
app/server_clinical_embedding.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List
2
  from fastapi import FastAPI, Query, UploadFile, File, HTTPException
3
  from fastapi.responses import RedirectResponse
4
  from fastapi.responses import StreamingResponse
@@ -11,25 +11,25 @@ import io
11
  import csv
12
  import os
13
 
14
- from clinical_embedding import ClinicalBERT
15
 
16
  # Pydantic models for request/response
17
  class EmbeddingRequest(BaseModel):
18
  sentences: List[str]
19
  pooling: str = 'cls'
20
-
21
 
22
  class EmbeddingResponse(BaseModel):
23
  embeddings: List[List[float]]
24
  shape: List[int]
25
  pooling: str
26
-
27
 
28
  # Initialize FastAPI app
29
  app = FastAPI(
30
- title="Clinical BERT Embeddings API",
31
- description="API for generating embeddings using Bio_ClinicalBERT model",
32
- version="1.0.0"
33
  )
34
 
35
  # Add CORS middleware to allow web page access
@@ -44,16 +44,17 @@ app.add_middleware(
44
  # Serve static files
45
  app.mount("/app/static", StaticFiles(directory="static"), name="static")
46
 
47
- # Initialize model (global instance)
48
- clinical_bert = None
49
-
50
 
51
  @app.on_event("startup")
52
  async def startup_event():
53
- """Load model on startup"""
54
- global clinical_bert
55
- clinical_bert = ClinicalBERT(device=-1) # Use device=0 for GPU
56
-
 
 
57
 
58
  @app.get("/")
59
  async def root():
@@ -61,33 +62,29 @@ async def root():
61
 
62
  @app.get("/browser/")
63
  def get_browser():
64
- print(os.path.join("static", "browser", "index.html"))
65
  return FileResponse(os.path.join("static", "browser", "index.html"))
66
 
67
-
68
  @app.get("/embeddings", response_model=EmbeddingResponse)
69
  async def get_embeddings(
70
  sentences: List[str] = Query(..., description="List of sentences to embed"),
71
- pooling: str = Query('cls', description="Pooling strategy: mean, cls, or max")
 
72
  ):
73
  """
74
  Generate embeddings for a list of sentences.
75
-
76
- Args:
77
- sentences: List of input sentences
78
- pooling: Pooling strategy ('mean', 'cls', or 'max')
79
-
80
- Returns:
81
- EmbeddingResponse with embeddings and metadata
82
  """
83
  # Validate pooling method
84
  if pooling not in ['mean', 'cls', 'max']:
85
- return {
86
- "error": "Invalid pooling method. Choose from: mean, cls, max"
87
- }
 
 
 
88
 
89
  # Generate embeddings
90
- embeddings = clinical_bert.get_embeddings(sentences, pooling=pooling)
91
 
92
  # Convert to list for JSON serialization
93
  embeddings_list = embeddings.tolist()
@@ -95,36 +92,34 @@ async def get_embeddings(
95
  return EmbeddingResponse(
96
  embeddings=embeddings_list,
97
  shape=list(embeddings.shape),
98
- pooling=pooling
 
99
  )
100
 
101
-
102
  @app.get("/health")
103
  async def health_check():
104
  """Health check endpoint"""
105
  return {
106
  "status": "healthy",
107
- "model_loaded": clinical_bert is not None
108
  }
109
 
110
-
111
  @app.post("/embeddings/batch")
112
  async def post_embeddings_batch(request: EmbeddingRequest):
113
  """
114
  POST endpoint for batch embedding generation.
115
-
116
- Args:
117
- request: EmbeddingRequest with sentences and pooling method
118
-
119
- Returns:
120
- EmbeddingResponse with embeddings and metadata
121
  """
122
  # Validate pooling method
123
  if request.pooling not in ['mean', 'cls', 'max']:
124
  raise HTTPException(status_code=400, detail="Invalid pooling method. Choose from: mean, cls, max")
125
 
 
 
 
 
 
126
  # Generate embeddings
127
- embeddings = clinical_bert.get_embeddings(request.sentences, pooling=request.pooling)
128
 
129
  # Convert to list for JSON serialization
130
  embeddings_list = embeddings.tolist()
@@ -132,24 +127,18 @@ async def post_embeddings_batch(request: EmbeddingRequest):
132
  return EmbeddingResponse(
133
  embeddings=embeddings_list,
134
  shape=list(embeddings.shape),
135
- pooling=request.pooling
 
136
  )
137
 
138
-
139
  @app.post("/embeddings/file")
140
  async def upload_file_embeddings(
141
  file: UploadFile = File(...),
142
- pooling: str = Query('cls', description="Pooling strategy: mean, cls, or max")
 
143
  ):
144
  """
145
  Upload a CSV file with terms and get embeddings back as CSV.
146
-
147
- Args:
148
- file: CSV file with one column containing terms
149
- pooling: Pooling strategy ('mean', 'cls', or 'max')
150
-
151
- Returns:
152
- CSV file with embeddings
153
  """
154
  # Validate file type
155
  if not file.filename.endswith('.csv'):
@@ -159,6 +148,11 @@ async def upload_file_embeddings(
159
  if pooling not in ['mean', 'cls', 'max']:
160
  raise HTTPException(status_code=400, detail="Invalid pooling method. Choose from: mean, cls, max")
161
 
 
 
 
 
 
162
  try:
163
  # Read CSV file
164
  contents = await file.read()
@@ -178,7 +172,7 @@ async def upload_file_embeddings(
178
  raise HTTPException(status_code=400, detail="No terms found in CSV")
179
 
180
  # Generate embeddings
181
- embeddings = clinical_bert.get_embeddings(terms, pooling=pooling)
182
 
183
  # Create output CSV
184
  output = io.StringIO()
@@ -206,11 +200,10 @@ async def upload_file_embeddings(
206
  except Exception as e:
207
  raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
208
 
209
-
210
  if __name__ == "__main__":
211
  # Run the server
212
  uvicorn.run(
213
- "main:app",
214
  host="0.0.0.0",
215
  port=8000,
216
  reload=False
 
1
+ from typing import List, Optional
2
  from fastapi import FastAPI, Query, UploadFile, File, HTTPException
3
  from fastapi.responses import RedirectResponse
4
  from fastapi.responses import StreamingResponse
 
11
  import csv
12
  import os
13
 
14
+ from clinical_embedding import ModelManager
15
 
16
  # Pydantic models for request/response
17
  class EmbeddingRequest(BaseModel):
18
  sentences: List[str]
19
  pooling: str = 'cls'
20
+ model: str = 'clinical_bert'
21
 
22
  class EmbeddingResponse(BaseModel):
23
  embeddings: List[List[float]]
24
  shape: List[int]
25
  pooling: str
26
+ model: str
27
 
28
  # Initialize FastAPI app
29
  app = FastAPI(
30
+ title="Clinical Embedding API",
31
+ description="API for generating embeddings using various models (ClinicalBERT, BERT, Word2Vec)",
32
+ version="2.0.0"
33
  )
34
 
35
  # Add CORS middleware to allow web page access
 
44
  # Serve static files
45
  app.mount("/app/static", StaticFiles(directory="static"), name="static")
46
 
47
+ # Initialize model manager (global instance)
48
+ model_manager = ModelManager()
 
49
 
50
  @app.on_event("startup")
51
  async def startup_event():
52
+ """
53
+ Load default model on startup.
54
+ Other models will be loaded on demand (see ModelManager).
55
+ """
56
+ # Pre-load ClinicalBERT as it's the default
57
+ model_manager.get_model('clinical_bert')
58
 
59
  @app.get("/")
60
  async def root():
 
62
 
63
  @app.get("/browser/")
64
  def get_browser():
 
65
  return FileResponse(os.path.join("static", "browser", "index.html"))
66
 
 
67
@app.get("/embeddings", response_model=EmbeddingResponse)
async def get_embeddings(
    sentences: List[str] = Query(..., description="List of sentences to embed"),
    pooling: str = Query('cls', description="Pooling strategy: mean, cls, or max"),
    model: str = Query('clinical_bert', description="Model to use: clinical_bert, bert, word2vec")
):
    """
    Generate embeddings for a list of sentences.
    Supports bracketed text for context-aware specific extraction.
    """
    # Reject unknown pooling strategies up front.
    if pooling not in ('mean', 'cls', 'max'):
        raise HTTPException(status_code=400, detail="Invalid pooling method. Choose from: mean, cls, max")

    # Resolve the requested embedder; unknown names become 400s.
    try:
        embedder = model_manager.get_model(model)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    vectors = embedder.get_embeddings(sentences, pooling=pooling)

    # JSON-serializable payload plus echo of the request parameters.
    return EmbeddingResponse(
        embeddings=vectors.tolist(),
        shape=list(vectors.shape),
        pooling=pooling,
        model=model
    )
 
 
99
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    loaded = list(model_manager.models.keys())
    # Report which embedders have been instantiated so far.
    return {"status": "healthy", "loaded_models": loaded}
106
 
 
107
@app.post("/embeddings/batch")
async def post_embeddings_batch(request: EmbeddingRequest):
    """
    POST endpoint for batch embedding generation.
    """
    # Reject unknown pooling strategies up front.
    if request.pooling not in ('mean', 'cls', 'max'):
        raise HTTPException(status_code=400, detail="Invalid pooling method. Choose from: mean, cls, max")

    # Resolve the requested embedder; unknown names become 400s.
    try:
        embedder = model_manager.get_model(request.model)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    vectors = embedder.get_embeddings(request.sentences, pooling=request.pooling)

    # JSON-serializable payload plus echo of the request parameters.
    return EmbeddingResponse(
        embeddings=vectors.tolist(),
        shape=list(vectors.shape),
        pooling=request.pooling,
        model=request.model
    )
133
 
 
134
  @app.post("/embeddings/file")
135
  async def upload_file_embeddings(
136
  file: UploadFile = File(...),
137
+ pooling: str = Query('cls', description="Pooling strategy: mean, cls, or max"),
138
+ model: str = Query('clinical_bert', description="Model to use: clinical_bert, bert, word2vec")
139
  ):
140
  """
141
  Upload a CSV file with terms and get embeddings back as CSV.
 
 
 
 
 
 
 
142
  """
143
  # Validate file type
144
  if not file.filename.endswith('.csv'):
 
148
  if pooling not in ['mean', 'cls', 'max']:
149
  raise HTTPException(status_code=400, detail="Invalid pooling method. Choose from: mean, cls, max")
150
 
151
+ try:
152
+ embedder = model_manager.get_model(model)
153
+ except ValueError as e:
154
+ raise HTTPException(status_code=400, detail=str(e))
155
+
156
  try:
157
  # Read CSV file
158
  contents = await file.read()
 
172
  raise HTTPException(status_code=400, detail="No terms found in CSV")
173
 
174
  # Generate embeddings
175
+ embeddings = embedder.get_embeddings(terms, pooling=pooling)
176
 
177
  # Create output CSV
178
  output = io.StringIO()
 
200
  except Exception as e:
201
  raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
202
 
 
203
  if __name__ == "__main__":
204
  # Run the server
205
  uvicorn.run(
206
+ "server_clinical_embedding:app",
207
  host="0.0.0.0",
208
  port=8000,
209
  reload=False
app/static/browser/index.html CHANGED
@@ -1,5 +1,6 @@
1
  <!DOCTYPE html>
2
  <html lang="en">
 
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
@@ -10,33 +11,33 @@
10
  padding: 0;
11
  box-sizing: border-box;
12
  }
13
-
14
  body {
15
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
16
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
17
  min-height: 100vh;
18
  padding: 20px;
19
  }
20
-
21
  .container {
22
  max-width: 1200px;
23
  margin: 0 auto;
24
  }
25
-
26
  h1 {
27
  color: white;
28
  text-align: center;
29
  margin-bottom: 30px;
30
  font-size: 2.5em;
31
- text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
32
  }
33
-
34
  .tabs {
35
  display: flex;
36
  gap: 10px;
37
  margin-bottom: 20px;
38
  }
39
-
40
  .tab-button {
41
  flex: 1;
42
  padding: 15px;
@@ -49,41 +50,43 @@
49
  transition: all 0.3s;
50
  color: #667eea;
51
  }
52
-
53
  .tab-button:hover {
54
  background: #f0f0f0;
55
  }
56
-
57
  .tab-button.active {
58
  background: white;
59
  color: #764ba2;
60
- box-shadow: 0 -2px 10px rgba(0,0,0,0.1);
61
  }
62
-
63
  .tab-content {
64
  display: none;
65
  background: white;
66
  padding: 30px;
67
  border-radius: 0 0 12px 12px;
68
- box-shadow: 0 4px 6px rgba(0,0,0,0.1);
69
  }
70
-
71
  .tab-content.active {
72
  display: block;
73
  }
74
-
75
  .form-group {
76
  margin-bottom: 20px;
77
  }
78
-
79
  label {
80
  display: block;
81
  margin-bottom: 8px;
82
  font-weight: bold;
83
  color: #333;
84
  }
85
-
86
- textarea, input[type="file"], select {
 
 
87
  width: 100%;
88
  padding: 12px;
89
  border: 2px solid #e0e0e0;
@@ -92,17 +95,18 @@
92
  font-family: 'Courier New', monospace;
93
  transition: border-color 0.3s;
94
  }
95
-
96
- textarea:focus, select:focus {
 
97
  outline: none;
98
  border-color: #667eea;
99
  }
100
-
101
  textarea {
102
  min-height: 150px;
103
  resize: vertical;
104
  }
105
-
106
  button {
107
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
108
  color: white;
@@ -114,22 +118,22 @@
114
  cursor: pointer;
115
  transition: transform 0.2s, box-shadow 0.2s;
116
  }
117
-
118
  button:hover {
119
  transform: translateY(-2px);
120
  box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
121
  }
122
-
123
  button:active {
124
  transform: translateY(0);
125
  }
126
-
127
  button:disabled {
128
  background: #ccc;
129
  cursor: not-allowed;
130
  transform: none;
131
  }
132
-
133
  .loading {
134
  display: none;
135
  text-align: center;
@@ -137,11 +141,11 @@
137
  color: #667eea;
138
  font-weight: bold;
139
  }
140
-
141
  .loading.show {
142
  display: block;
143
  }
144
-
145
  .spinner {
146
  border: 4px solid #f3f3f3;
147
  border-top: 4px solid #667eea;
@@ -151,12 +155,17 @@
151
  animation: spin 1s linear infinite;
152
  margin: 0 auto 10px;
153
  }
154
-
155
  @keyframes spin {
156
- 0% { transform: rotate(0deg); }
157
- 100% { transform: rotate(360deg); }
 
 
 
 
 
158
  }
159
-
160
  .error {
161
  background: #fee;
162
  color: #c33;
@@ -166,11 +175,11 @@
166
  border-left: 4px solid #c33;
167
  display: none;
168
  }
169
-
170
  .error.show {
171
  display: block;
172
  }
173
-
174
  .success {
175
  background: #efe;
176
  color: #3c3;
@@ -180,11 +189,11 @@
180
  border-left: 4px solid #3c3;
181
  display: none;
182
  }
183
-
184
  .success.show {
185
  display: block;
186
  }
187
-
188
  .info {
189
  background: #e3f2fd;
190
  padding: 15px;
@@ -193,7 +202,7 @@
193
  color: #1976d2;
194
  border-left: 4px solid #1976d2;
195
  }
196
-
197
  .download-section {
198
  display: none;
199
  margin-top: 20px;
@@ -202,50 +211,51 @@
202
  border-radius: 6px;
203
  text-align: center;
204
  }
205
-
206
  .download-section.show {
207
  display: block;
208
  }
209
-
210
  .settings {
211
  display: flex;
212
  gap: 20px;
213
  align-items: end;
214
  }
215
-
216
  .settings .form-group {
217
  flex: 1;
218
  }
219
  </style>
220
  </head>
 
221
  <body>
222
  <div class="container">
223
  <h1>🧬 Clinical BERT Embeddings</h1>
224
-
225
  <div class="tabs">
226
  <button class="tab-button active" onclick="switchTab('inline')">📝 Inline Embeddings</button>
227
  <button class="tab-button" onclick="switchTab('file')">📁 File Embeddings</button>
228
  </div>
229
-
230
  <!-- Inline Embeddings Tab -->
231
  <div id="inline-tab" class="tab-content active">
232
  <div class="info">
233
  💡 Enter medical terms separated by commas or new lines. Example: Heart Attack, Myocardial Infarction
234
  </div>
235
-
236
  <div class="error" id="inline-error"></div>
237
  <div class="success" id="inline-success"></div>
238
-
239
  <div class="settings">
240
  <div class="form-group" style="flex: 3;">
241
  <label for="inline-terms">Medical Terms:</label>
242
  <textarea id="inline-terms" placeholder="Enter terms here (comma or newline separated)...
243
  Example:
244
- Heart Attack
245
- Myocardial Infarction
246
  Diabetes"></textarea>
247
  </div>
248
-
249
  <div class="form-group">
250
  <label for="inline-pooling">Pooling:</label>
251
  <select id="inline-pooling">
@@ -254,36 +264,68 @@ Diabetes"></textarea>
254
  <option value="max">Max</option>
255
  </select>
256
  </div>
 
 
257
  </div>
258
-
259
- <button onclick="getInlineEmbeddings()" id="inline-btn">Generate Embeddings</button>
260
-
261
  <div class="loading" id="inline-loading">
262
  <div class="spinner"></div>
263
- Processing...
264
  </div>
265
-
266
- <div class="form-group" style="margin-top: 20px;">
267
- <label for="inline-results">Embeddings (JSON):</label>
268
- <textarea id="inline-results" readonly placeholder="Results will appear here..."></textarea>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  </div>
270
  </div>
271
-
272
  <!-- File Embeddings Tab -->
273
  <div id="file-tab" class="tab-content">
274
  <div class="info">
275
  💡 Upload a CSV file with one column containing medical terms. The first row should be the column name.
276
  </div>
277
-
278
  <div class="error" id="file-error"></div>
279
  <div class="success" id="file-success"></div>
280
-
281
  <div class="settings">
282
- <div class="form-group" style="flex: 3;">
283
  <label for="file-input">Select CSV File:</label>
284
  <input type="file" id="file-input" accept=".csv">
285
  </div>
286
-
287
  <div class="form-group">
288
  <label for="file-pooling">Pooling:</label>
289
  <select id="file-pooling">
@@ -292,15 +334,24 @@ Diabetes"></textarea>
292
  <option value="max">Max</option>
293
  </select>
294
  </div>
 
 
 
 
 
 
 
 
 
295
  </div>
296
-
297
  <button onclick="uploadFileEmbeddings()" id="file-btn">Process File</button>
298
-
299
  <div class="loading" id="file-loading">
300
  <div class="spinner"></div>
301
  Processing file...
302
  </div>
303
-
304
  <div class="download-section" id="download-section">
305
  <h3>✅ Embeddings Ready!</h3>
306
  <p style="margin: 10px 0;">Your embeddings have been generated successfully.</p>
@@ -308,133 +359,201 @@ Diabetes"></textarea>
308
  </div>
309
  </div>
310
  </div>
311
-
312
  <script>
313
  const API_URL = 'https://santanche-clinical-embedding.hf.space';
314
  let downloadBlob = null;
315
  let downloadFilename = null;
316
-
317
  function switchTab(tab) {
318
  // Update tab buttons
319
  document.querySelectorAll('.tab-button').forEach(btn => {
320
  btn.classList.remove('active');
321
  });
322
  event.target.classList.add('active');
323
-
324
  // Update tab content
325
  document.querySelectorAll('.tab-content').forEach(content => {
326
  content.classList.remove('active');
327
  });
328
  document.getElementById(`${tab}-tab`).classList.add('active');
329
  }
330
-
331
  function showError(tabId, message) {
332
  const errorDiv = document.getElementById(`${tabId}-error`);
333
  errorDiv.textContent = message;
334
  errorDiv.classList.add('show');
335
  setTimeout(() => errorDiv.classList.remove('show'), 5000);
336
  }
337
-
338
  function showSuccess(tabId, message) {
339
  const successDiv = document.getElementById(`${tabId}-success`);
340
  successDiv.textContent = message;
341
  successDiv.classList.add('show');
342
  setTimeout(() => successDiv.classList.remove('show'), 5000);
343
  }
344
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  async function getInlineEmbeddings() {
346
  const termsText = document.getElementById('inline-terms').value.trim();
347
  const pooling = document.getElementById('inline-pooling').value;
348
- const resultsArea = document.getElementById('inline-results');
349
  const loadingDiv = document.getElementById('inline-loading');
350
  const btn = document.getElementById('inline-btn');
351
-
 
352
  if (!termsText) {
353
  showError('inline', 'Please enter some terms');
354
  return;
355
  }
356
-
357
- // Parse terms (split by comma or newline)
358
  const terms = termsText
359
- .split(/[,\n]+/)
360
  .map(t => t.trim())
361
  .filter(t => t.length > 0);
362
-
363
  if (terms.length === 0) {
364
  showError('inline', 'No valid terms found');
365
  return;
366
  }
367
-
368
  // Show loading
369
  loadingDiv.classList.add('show');
370
  btn.disabled = true;
371
- resultsArea.value = '';
372
-
 
 
 
 
 
 
373
  try {
374
- const response = await fetch(`${API_URL}/embeddings/batch`, {
375
- method: 'POST',
376
- headers: {
377
- 'Content-Type': 'application/json',
378
- },
379
- body: JSON.stringify({
380
- sentences: terms,
381
- pooling: pooling
382
- })
 
 
 
 
 
 
 
 
 
 
383
  });
384
-
385
- if (!response.ok) {
386
- throw new Error(`HTTP error! status: ${response.status}`);
387
- }
388
-
389
- const data = await response.json();
390
- resultsArea.value = JSON.stringify(data, null, 2);
391
- showSuccess('inline', `Generated embeddings for ${terms.length} terms (shape: ${data.shape})`);
392
  } catch (error) {
393
- showError('inline', `Error: ${error.message}`);
394
- resultsArea.value = '';
395
  } finally {
396
  loadingDiv.classList.remove('show');
397
  btn.disabled = false;
398
  }
399
  }
400
-
401
  async function uploadFileEmbeddings() {
402
  const fileInput = document.getElementById('file-input');
403
  const pooling = document.getElementById('file-pooling').value;
 
404
  const loadingDiv = document.getElementById('file-loading');
405
  const btn = document.getElementById('file-btn');
406
  const downloadSection = document.getElementById('download-section');
407
-
408
  if (!fileInput.files || fileInput.files.length === 0) {
409
  showError('file', 'Please select a CSV file');
410
  return;
411
  }
412
-
413
  const file = fileInput.files[0];
414
-
415
  // Show loading
416
  loadingDiv.classList.add('show');
417
  btn.disabled = true;
418
  downloadSection.classList.remove('show');
419
-
420
  try {
421
  const formData = new FormData();
422
  formData.append('file', file);
423
-
424
- const response = await fetch(`${API_URL}/embeddings/file?pooling=${pooling}`, {
425
  method: 'POST',
426
  body: formData
427
  });
428
-
429
  if (!response.ok) {
430
  const errorData = await response.json();
431
  throw new Error(errorData.detail || `HTTP error! status: ${response.status}`);
432
  }
433
-
434
  // Get the blob for download
435
  downloadBlob = await response.blob();
436
  downloadFilename = `embeddings_${file.name}`;
437
-
438
  // Show download section
439
  downloadSection.classList.add('show');
440
  showSuccess('file', 'File processed successfully!');
@@ -446,13 +565,13 @@ Diabetes"></textarea>
446
  btn.disabled = false;
447
  }
448
  }
449
-
450
  function downloadResults() {
451
  if (!downloadBlob) {
452
  showError('file', 'No data to download');
453
  return;
454
  }
455
-
456
  const url = window.URL.createObjectURL(downloadBlob);
457
  const a = document.createElement('a');
458
  a.href = url;
@@ -464,4 +583,5 @@ Diabetes"></textarea>
464
  }
465
  </script>
466
  </body>
467
- </html>
 
 
1
  <!DOCTYPE html>
2
  <html lang="en">
3
+
4
  <head>
5
  <meta charset="UTF-8">
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
 
11
  padding: 0;
12
  box-sizing: border-box;
13
  }
14
+
15
  body {
16
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
17
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
18
  min-height: 100vh;
19
  padding: 20px;
20
  }
21
+
22
  .container {
23
  max-width: 1200px;
24
  margin: 0 auto;
25
  }
26
+
27
  h1 {
28
  color: white;
29
  text-align: center;
30
  margin-bottom: 30px;
31
  font-size: 2.5em;
32
+ text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
33
  }
34
+
35
  .tabs {
36
  display: flex;
37
  gap: 10px;
38
  margin-bottom: 20px;
39
  }
40
+
41
  .tab-button {
42
  flex: 1;
43
  padding: 15px;
 
50
  transition: all 0.3s;
51
  color: #667eea;
52
  }
53
+
54
  .tab-button:hover {
55
  background: #f0f0f0;
56
  }
57
+
58
  .tab-button.active {
59
  background: white;
60
  color: #764ba2;
61
+ box-shadow: 0 -2px 10px rgba(0, 0, 0, 0.1);
62
  }
63
+
64
  .tab-content {
65
  display: none;
66
  background: white;
67
  padding: 30px;
68
  border-radius: 0 0 12px 12px;
69
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
70
  }
71
+
72
  .tab-content.active {
73
  display: block;
74
  }
75
+
76
  .form-group {
77
  margin-bottom: 20px;
78
  }
79
+
80
  label {
81
  display: block;
82
  margin-bottom: 8px;
83
  font-weight: bold;
84
  color: #333;
85
  }
86
+
87
+ textarea,
88
+ input[type="file"],
89
+ select {
90
  width: 100%;
91
  padding: 12px;
92
  border: 2px solid #e0e0e0;
 
95
  font-family: 'Courier New', monospace;
96
  transition: border-color 0.3s;
97
  }
98
+
99
+ textarea:focus,
100
+ select:focus {
101
  outline: none;
102
  border-color: #667eea;
103
  }
104
+
105
  textarea {
106
  min-height: 150px;
107
  resize: vertical;
108
  }
109
+
110
  button {
111
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
112
  color: white;
 
118
  cursor: pointer;
119
  transition: transform 0.2s, box-shadow 0.2s;
120
  }
121
+
122
  button:hover {
123
  transform: translateY(-2px);
124
  box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
125
  }
126
+
127
  button:active {
128
  transform: translateY(0);
129
  }
130
+
131
  button:disabled {
132
  background: #ccc;
133
  cursor: not-allowed;
134
  transform: none;
135
  }
136
+
137
  .loading {
138
  display: none;
139
  text-align: center;
 
141
  color: #667eea;
142
  font-weight: bold;
143
  }
144
+
145
  .loading.show {
146
  display: block;
147
  }
148
+
149
  .spinner {
150
  border: 4px solid #f3f3f3;
151
  border-top: 4px solid #667eea;
 
155
  animation: spin 1s linear infinite;
156
  margin: 0 auto 10px;
157
  }
158
+
159
  @keyframes spin {
160
+ 0% {
161
+ transform: rotate(0deg);
162
+ }
163
+
164
+ 100% {
165
+ transform: rotate(360deg);
166
+ }
167
  }
168
+
169
  .error {
170
  background: #fee;
171
  color: #c33;
 
175
  border-left: 4px solid #c33;
176
  display: none;
177
  }
178
+
179
  .error.show {
180
  display: block;
181
  }
182
+
183
  .success {
184
  background: #efe;
185
  color: #3c3;
 
189
  border-left: 4px solid #3c3;
190
  display: none;
191
  }
192
+
193
  .success.show {
194
  display: block;
195
  }
196
+
197
  .info {
198
  background: #e3f2fd;
199
  padding: 15px;
 
202
  color: #1976d2;
203
  border-left: 4px solid #1976d2;
204
  }
205
+
206
  .download-section {
207
  display: none;
208
  margin-top: 20px;
 
211
  border-radius: 6px;
212
  text-align: center;
213
  }
214
+
215
  .download-section.show {
216
  display: block;
217
  }
218
+
219
  .settings {
220
  display: flex;
221
  gap: 20px;
222
  align-items: end;
223
  }
224
+
225
  .settings .form-group {
226
  flex: 1;
227
  }
228
  </style>
229
  </head>
230
+
231
  <body>
232
  <div class="container">
233
  <h1>🧬 Clinical BERT Embeddings</h1>
234
+
235
  <div class="tabs">
236
  <button class="tab-button active" onclick="switchTab('inline')">📝 Inline Embeddings</button>
237
  <button class="tab-button" onclick="switchTab('file')">📁 File Embeddings</button>
238
  </div>
239
+
240
  <!-- Inline Embeddings Tab -->
241
  <div id="inline-tab" class="tab-content active">
242
  <div class="info">
243
  💡 Enter medical terms separated by commas or new lines. Example: Heart Attack, Myocardial Infarction
244
  </div>
245
+
246
  <div class="error" id="inline-error"></div>
247
  <div class="success" id="inline-success"></div>
248
+
249
  <div class="settings">
250
  <div class="form-group" style="flex: 3;">
251
  <label for="inline-terms">Medical Terms:</label>
252
  <textarea id="inline-terms" placeholder="Enter terms here (comma or newline separated)...
253
  Example:
254
+ The patient had a [heart attack] yesterday.
255
+ [Myocardial Infarction] is serious.
256
  Diabetes"></textarea>
257
  </div>
258
+
259
  <div class="form-group">
260
  <label for="inline-pooling">Pooling:</label>
261
  <select id="inline-pooling">
 
264
  <option value="max">Max</option>
265
  </select>
266
  </div>
267
+
268
+ <!-- Model selector removed for Inline tab -->
269
  </div>
270
+
271
+ <button onclick="getInlineEmbeddings()" id="inline-btn">Generate Embeddings (All Models)</button>
272
+
273
  <div class="loading" id="inline-loading">
274
  <div class="spinner"></div>
275
+ Processing 3 models...
276
  </div>
277
+
278
+ <div id="results-container" style="display: none; margin-top: 20px;">
279
+ <!-- Clinical BERT -->
280
+ <div class="result-block">
281
+ <h3
282
+ style="color: #667eea; border-bottom: 2px solid #667eea; padding-bottom: 5px; margin-bottom: 10px;">
283
+ 🧬 Clinical BERT</h3>
284
+ <label>Visualization:</label>
285
+ <div id="viz-clinical_bert" style="margin-bottom: 15px;"></div>
286
+ <label>JSON:</label>
287
+ <textarea id="json-clinical_bert" readonly style="height: 100px;"></textarea>
288
+ </div>
289
+
290
+ <!-- Standard BERT -->
291
+ <div class="result-block" style="margin-top: 30px;">
292
+ <h3
293
+ style="color: #764ba2; border-bottom: 2px solid #764ba2; padding-bottom: 5px; margin-bottom: 10px;">
294
+ 🤖 Standard BERT</h3>
295
+ <label>Visualization:</label>
296
+ <div id="viz-bert" style="margin-bottom: 15px;"></div>
297
+ <label>JSON:</label>
298
+ <textarea id="json-bert" readonly style="height: 100px;"></textarea>
299
+ </div>
300
+
301
+ <!-- Word2Vec -->
302
+ <div class="result-block" style="margin-top: 30px;">
303
+ <h3
304
+ style="color: #2c3e50; border-bottom: 2px solid #2c3e50; padding-bottom: 5px; margin-bottom: 10px;">
305
+ 📚 Word2Vec</h3>
306
+ <label>Visualization:</label>
307
+ <div id="viz-word2vec" style="margin-bottom: 15px;"></div>
308
+ <label>JSON:</label>
309
+ <textarea id="json-word2vec" readonly style="height: 100px;"></textarea>
310
+ </div>
311
  </div>
312
  </div>
313
+
314
  <!-- File Embeddings Tab -->
315
  <div id="file-tab" class="tab-content">
316
  <div class="info">
317
  💡 Upload a CSV file with one column containing medical terms. The first row should be the column name.
318
  </div>
319
+
320
  <div class="error" id="file-error"></div>
321
  <div class="success" id="file-success"></div>
322
+
323
  <div class="settings">
324
+ <div class="form-group" style="flex: 2;">
325
  <label for="file-input">Select CSV File:</label>
326
  <input type="file" id="file-input" accept=".csv">
327
  </div>
328
+
329
  <div class="form-group">
330
  <label for="file-pooling">Pooling:</label>
331
  <select id="file-pooling">
 
334
  <option value="max">Max</option>
335
  </select>
336
  </div>
337
+
338
+ <div class="form-group">
339
+ <label for="file-model">Model:</label>
340
+ <select id="file-model">
341
+ <option value="clinical_bert" selected>Clinical BERT</option>
342
+ <option value="bert">Standard BERT</option>
343
+ <option value="word2vec">Word2Vec</option>
344
+ </select>
345
+ </div>
346
  </div>
347
+
348
  <button onclick="uploadFileEmbeddings()" id="file-btn">Process File</button>
349
+
350
  <div class="loading" id="file-loading">
351
  <div class="spinner"></div>
352
  Processing file...
353
  </div>
354
+
355
  <div class="download-section" id="download-section">
356
  <h3>✅ Embeddings Ready!</h3>
357
  <p style="margin: 10px 0;">Your embeddings have been generated successfully.</p>
 
359
  </div>
360
  </div>
361
  </div>
362
+
363
  <script>
364
  const API_URL = 'https://santanche-clinical-embedding.hf.space';
365
  let downloadBlob = null;
366
  let downloadFilename = null;
367
+
368
  function switchTab(tab) {
369
  // Update tab buttons
370
  document.querySelectorAll('.tab-button').forEach(btn => {
371
  btn.classList.remove('active');
372
  });
373
  event.target.classList.add('active');
374
+
375
  // Update tab content
376
  document.querySelectorAll('.tab-content').forEach(content => {
377
  content.classList.remove('active');
378
  });
379
  document.getElementById(`${tab}-tab`).classList.add('active');
380
  }
381
+
382
  function showError(tabId, message) {
383
  const errorDiv = document.getElementById(`${tabId}-error`);
384
  errorDiv.textContent = message;
385
  errorDiv.classList.add('show');
386
  setTimeout(() => errorDiv.classList.remove('show'), 5000);
387
  }
388
+
389
  function showSuccess(tabId, message) {
390
  const successDiv = document.getElementById(`${tabId}-success`);
391
  successDiv.textContent = message;
392
  successDiv.classList.add('show');
393
  setTimeout(() => successDiv.classList.remove('show'), 5000);
394
  }
395
+
396
+ function createHeatmap(data, sentences, containerId) {
397
+ const container = document.getElementById(containerId);
398
+ container.innerHTML = '';
399
+
400
+ data.forEach((embedding, index) => {
401
+ const sentence = sentences[index];
402
+ const rowTitle = document.createElement('div');
403
+ rowTitle.style.fontWeight = 'bold';
404
+ rowTitle.style.fontSize = '0.9em';
405
+ rowTitle.style.marginBottom = '2px';
406
+ rowTitle.style.marginTop = '10px';
407
+ rowTitle.textContent = `${index + 1}. ${sentence}`;
408
+ container.appendChild(rowTitle);
409
+
410
+ const row = document.createElement('div');
411
+ row.style.display = 'flex';
412
+ row.style.flexWrap = 'wrap';
413
+ row.style.gap = '1px';
414
+ row.style.maxWidth = '100%';
415
+
416
+ embedding.forEach(val => {
417
+ const block = document.createElement('div');
418
+ block.style.width = '8px';
419
+ block.style.height = '12px';
420
+ block.title = val.toFixed(4);
421
+
422
+ const intensity = Math.min(Math.abs(val) * 2, 1);
423
+
424
+ if (val > 0) {
425
+ block.style.backgroundColor = `rgba(0, 0, 255, ${intensity})`;
426
+ } else {
427
+ block.style.backgroundColor = `rgba(255, 0, 0, ${intensity})`;
428
+ }
429
+
430
+ row.appendChild(block);
431
+ });
432
+ container.appendChild(row);
433
+ });
434
+ }
435
+
436
+ async function fetchModelEmbeddings(modelName, terms, pooling) {
437
+ const response = await fetch(`${API_URL}/embeddings/batch`, {
438
+ method: 'POST',
439
+ headers: { 'Content-Type': 'application/json' },
440
+ body: JSON.stringify({
441
+ sentences: terms,
442
+ pooling: pooling,
443
+ model: modelName
444
+ })
445
+ });
446
+
447
+ if (!response.ok) {
448
+ const err = await response.json();
449
+ throw new Error(err.detail || `HTTP error ${response.status}`);
450
+ }
451
+ return await response.json();
452
+ }
453
+
454
  async function getInlineEmbeddings() {
455
  const termsText = document.getElementById('inline-terms').value.trim();
456
  const pooling = document.getElementById('inline-pooling').value;
 
457
  const loadingDiv = document.getElementById('inline-loading');
458
  const btn = document.getElementById('inline-btn');
459
+ const resultsContainer = document.getElementById('results-container');
460
+
461
  if (!termsText) {
462
  showError('inline', 'Please enter some terms');
463
  return;
464
  }
465
+
 
466
  const terms = termsText
467
+ .split(/\n+/)
468
  .map(t => t.trim())
469
  .filter(t => t.length > 0);
470
+
471
  if (terms.length === 0) {
472
  showError('inline', 'No valid terms found');
473
  return;
474
  }
475
+
476
  // Show loading
477
  loadingDiv.classList.add('show');
478
  btn.disabled = true;
479
+ resultsContainer.style.display = 'none';
480
+
481
+ // Clear previous results
482
+ ['clinical_bert', 'bert', 'word2vec'].forEach(m => {
483
+ document.getElementById(`viz-${m}`).innerHTML = '';
484
+ document.getElementById(`json-${m}`).value = '';
485
+ });
486
+
487
  try {
488
+ // Fetch all 3 models in parallel
489
+ const models = ['clinical_bert', 'bert', 'word2vec'];
490
+ const promises = models.map(m => fetchModelEmbeddings(m, terms, pooling)
491
+ .then(data => ({ status: 'fulfilled', model: m, data: data }))
492
+ .catch(err => ({ status: 'rejected', model: m, error: err }))
493
+ );
494
+
495
+ const results = await Promise.all(promises);
496
+
497
+ resultsContainer.style.display = 'block';
498
+
499
+ results.forEach(res => {
500
+ const jsonArea = document.getElementById(`json-${res.model}`);
501
+ if (res.status === 'fulfilled') {
502
+ jsonArea.value = JSON.stringify(res.data, null, 2);
503
+ createHeatmap(res.data.embeddings, terms, `viz-${res.model}`);
504
+ } else {
505
+ jsonArea.value = `Error: ${res.error.message}`;
506
+ }
507
  });
508
+
509
+ showSuccess('inline', `Generated embeddings for ${terms.length} terms across 3 models.`);
510
+
 
 
 
 
 
511
  } catch (error) {
512
+ showError('inline', `Critical Error: ${error.message}`);
 
513
  } finally {
514
  loadingDiv.classList.remove('show');
515
  btn.disabled = false;
516
  }
517
  }
518
+
519
  async function uploadFileEmbeddings() {
520
  const fileInput = document.getElementById('file-input');
521
  const pooling = document.getElementById('file-pooling').value;
522
+ const model = document.getElementById('file-model').value;
523
  const loadingDiv = document.getElementById('file-loading');
524
  const btn = document.getElementById('file-btn');
525
  const downloadSection = document.getElementById('download-section');
526
+
527
  if (!fileInput.files || fileInput.files.length === 0) {
528
  showError('file', 'Please select a CSV file');
529
  return;
530
  }
531
+
532
  const file = fileInput.files[0];
533
+
534
  // Show loading
535
  loadingDiv.classList.add('show');
536
  btn.disabled = true;
537
  downloadSection.classList.remove('show');
538
+
539
  try {
540
  const formData = new FormData();
541
  formData.append('file', file);
542
+
543
+ const response = await fetch(`${API_URL}/embeddings/file?pooling=${pooling}&model=${model}`, {
544
  method: 'POST',
545
  body: formData
546
  });
547
+
548
  if (!response.ok) {
549
  const errorData = await response.json();
550
  throw new Error(errorData.detail || `HTTP error! status: ${response.status}`);
551
  }
552
+
553
  // Get the blob for download
554
  downloadBlob = await response.blob();
555
  downloadFilename = `embeddings_${file.name}`;
556
+
557
  // Show download section
558
  downloadSection.classList.add('show');
559
  showSuccess('file', 'File processed successfully!');
 
565
  btn.disabled = false;
566
  }
567
  }
568
+
569
  function downloadResults() {
570
  if (!downloadBlob) {
571
  showError('file', 'No data to download');
572
  return;
573
  }
574
+
575
  const url = window.URL.createObjectURL(downloadBlob);
576
  const a = document.createElement('a');
577
  a.href = url;
 
583
  }
584
  </script>
585
  </body>
586
+
587
+ </html>
app/verify_backend.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from clinical_embedding import ModelManager
2
+ import numpy as np
3
+
4
+ def test_models():
5
+ mm = ModelManager()
6
+
7
+ print("Testing ClinicalBERT...")
8
+ cbert = mm.get_model('clinical_bert')
9
+ emb_cbert = cbert.get_embeddings(["Patient has [heart attack]."])
10
+ print(f"ClinicalBERT embedding shape: {emb_cbert.shape}")
11
+ assert emb_cbert.shape[1] == 768
12
+
13
+ print("\nTesting Standard BERT...")
14
+ bert = mm.get_model('bert')
15
+ emb_bert = bert.get_embeddings(["Patient has [heart attack]."])
16
+ print(f"Standard BERT embedding shape: {emb_bert.shape}")
17
+ assert emb_bert.shape[1] == 768
18
+
19
+ print("\nTesting Word2Vec (loading might fail if model not found, checking fail-safe)...")
20
+ try:
21
+ w2v = mm.get_model('word2vec')
22
+ if w2v.model:
23
+ emb_w2v = w2v.get_embeddings(["Patient has [heart attack]."])
24
+ print(f"Word2Vec embedding shape: {emb_w2v.shape}")
25
+ # Glove 50
26
+ if emb_w2v.size > 0:
27
+ assert emb_w2v.shape[1] == 50
28
+ else:
29
+ print("Word2Vec model could not be loaded (expected if no internet/file), skipping assertion.")
30
+ except Exception as e:
31
+ print(f"Word2Vec test error: {e}")
32
+
33
+ def test_brackets():
34
+ mm = ModelManager()
35
+ model = mm.get_model('clinical_bert')
36
+
37
+ # Test case 1: Context matters?
38
+ # Ideally "apple" in "eat [apple]" vs "company [apple]" might differ slightly in BERT even if focused?
39
+ # Actually if we extract only [apple], the context IS used in the forward pass, then we select tokens.
40
+
41
+ s1 = "I like to eat [apple] pie."
42
+ s2 = "I bought stock in [apple] computer."
43
+
44
+ emb1 = model.get_embeddings([s1])
45
+ emb2 = model.get_embeddings([s2])
46
+
47
+ # Compute cosine similarity
48
+ sim = np.dot(emb1[0], emb2[0]) / (np.linalg.norm(emb1[0]) * np.linalg.norm(emb2[0]))
49
+ print(f"\nSimilarity between '[apple]' in food context vs tech context: {sim:.4f}")
50
+
51
+ # If they are exactly 1.0, then context wasn't used effectively or they are just identical tokens.
52
+ # BERT contextual embeddings should differ.
53
+ if sim < 0.99:
54
+ print("SUCCESS: Embeddings are different (context aware).")
55
+ else:
56
+ print("WARNING: Embeddings are very similar. Might be expected if tokenization is identical and context weak, or logic flaw.")
57
+
58
+ # Test case 2: Brackets vs No Brackets
59
+ s3 = "heart attack"
60
+ s4 = "[heart attack]"
61
+ # Should be identical if s3 is sent as is?
62
+ # Wait, s3 "heart attack" -> full sentence embedding.
63
+ # s4 "[heart attack]" -> extract "heart attack", full sentence is "heart attack".
64
+ # They should be arguably the same.
65
+
66
+ emb3 = model.get_embeddings([s3])
67
+ emb4 = model.get_embeddings([s4])
68
+
69
+ sim_ident = np.dot(emb3[0], emb4[0]) / (np.linalg.norm(emb3[0]) * np.linalg.norm(emb4[0]))
70
+ print(f"Similarity between 'heart attack' and '[heart attack]': {sim_ident:.4f}")
71
+
72
+ # Test case 3: Bracket subset
73
+ s5 = "The patient had a [heart attack] yesterday."
74
+ emb5 = model.get_embeddings([s5])
75
+
76
+ # Compare emb5 (just heart attack) with emb3 (heart attack in isolation)
77
+ # They should be different because emb5 has context "The patient had a... yesterday"
78
+ sim_context = np.dot(emb5[0], emb3[0]) / (np.linalg.norm(emb5[0]) * np.linalg.norm(emb3[0]))
79
+ print(f"Similarity between 'heart attack' (isolated) and '...[heart attack]...' (context): {sim_context:.4f}")
80
+ assert sim_context < 0.99, "Context should affect embedding"
81
+
82
+ if __name__ == "__main__":
83
+ print("=== Running Backend Verification ===")
84
+ test_models()
85
+ test_brackets()
86
+ print("\n=== Verification Complete ===")
requirements.txt CHANGED
@@ -8,6 +8,8 @@ python-multipart==0.0.6
8
  transformers==4.35.2
9
  torch==2.1.1
10
  numpy==1.24.3
 
 
11
 
12
  # Optional: for GPU support, also install:
13
  # torch==2.1.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html
 
8
  transformers==4.35.2
9
  torch==2.1.1
10
  numpy==1.24.3
11
+ gensim==4.3.2
12
+ scikit-learn==1.3.2
13
 
14
  # Optional: for GPU support, also install:
15
  # torch==2.1.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html