Piyush Bhoyar committed on
Commit
92179c3
·
1 Parent(s): bdce192

Add application file

Browse files
Files changed (5) hide show
  1. .dockerignore +33 -0
  2. .gitignore +56 -0
  3. Dockerfile +18 -0
  4. main.py +160 -0
  5. requirements.txt +10 -0
.dockerignore ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Git
2
+ .git
3
+ .gitignore
4
+
5
+ # Python Cache
6
+ __pycache__
7
+ *.pyc
8
+ *.pyo
9
+ *.pyd
10
+
11
+ # Local Virtual Environments (Crucial to ignore)
12
+ venv
13
+ env
14
+ .env
15
+ .venv
16
+
17
+ # Local Model Folders (Docker will download a fresh copy on Linux)
18
+ dnabert_local
19
+ *.bin
20
+ *.safetensors
21
+
22
+ # Mac System Files
23
+ .DS_Store
24
+
25
+ # Frontend Artifacts (If you have them in the same folder)
26
+ node_modules
27
+ .next
28
+ build
29
+ dist
30
+
31
+ # Docker
32
+ Dockerfile
33
+ .dockerignore
.gitignore ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- Python ---
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ .venv/
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # --- FastAPI / Environment ---
27
+ .env
28
+ .env.local
29
+
30
+ # --- Node.js / Next.js ---
31
+ node_modules/
32
+ /.next/
33
+ /out/
34
+ .next
35
+ npm-debug.log*
36
+ yarn-debug.log*
37
+ yarn-error.log*
38
+ .pnpm-debug.log*
39
+
40
+ # --- Project Specific (IMPORTANT) ---
41
+ # Don't push the massive AI model to GitHub
42
+ dnabert_local/
43
+ *.bin
44
+ *.safetensors
45
+ *.pth
46
+ *.pt
47
+
48
+ # --- Mac OS ---
49
+ .DS_Store
50
+ .AppleDouble
51
+ .LSOverride
52
+
53
+ # --- IDEs ---
54
+ .vscode/
55
+ .idea/
56
+ *.swp
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CPU-only inference image for the BioSentinel FastAPI service.
FROM python:3.10-slim

WORKDIR /app

# git is required so pip / huggingface_hub can fetch repos that need it.
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir --upgrade pip

# Install torch in its own layer (large, rarely changes) so Docker layer
# caching keeps rebuilds fast when only requirements.txt or main.py change.
RUN pip install --no-cache-dir "torch>=2.6.0"

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY main.py .

# Port 7860 — the Hugging Face Spaces convention; matches the CMD below.
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import shutil
4
+ import uuid
5
+ import json
6
+ import torch
7
+ import numpy as np
8
+ import io
9
+ from collections import Counter
10
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
11
+ from transformers import AutoTokenizer, AutoModel
12
+ from sklearn.cluster import HDBSCAN
13
+ from Bio import SeqIO
14
+ from huggingface_hub import snapshot_download
15
+
16
+
17
+ print("Initializing BioSentinel...")
18
+
19
def load_cpu_safe_model():
    """Download DNABERT-2 to a local folder and patch it to run on CPU.

    The upstream checkpoint's remote code imports Triton flash-attention,
    which fails on CPU-only machines. This function rewrites the snapshot's
    config.json and bert_layers.py on disk to disable/stub flash-attn, then
    loads the patched model.

    Returns:
        (tokenizer, model): Hugging Face tokenizer and model loaded from the
        patched local snapshot with trust_remote_code=True.
    """
    checkpoint = "zhihan1996/DNABERT-2-117M"
    local_path = "./dnabert_local"

    # Download once; later startups reuse the already-patched local copy.
    if not os.path.exists(local_path):
        print("Downloading model...")
        snapshot_download(repo_id=checkpoint, local_dir=local_path)

    config_path = os.path.join(local_path, "config.json")
    if os.path.exists(config_path):
        with open(config_path, "r") as f:
            config_data = json.load(f)

        # Disable flash attention and point auto_map at the repo's plain
        # BERT classes so AutoModel resolves to the CPU-capable code path.
        config_data["use_flash_attn"] = False
        config_data["auto_map"] = {
            "AutoConfig": "configuration_bert.BertConfig",
            "AutoModel": "bert_layers.BertModel",
            "AutoModelForMaskedLM": "bert_layers.BertForMaskedLM"
        }

        with open(config_path, "w") as f:
            json.dump(config_data, f, indent=2)

    # Patch the remote-code module itself: the Triton imports raise at import
    # time on CPU-only hosts, so replace them with None stubs.
    bert_layer_path = os.path.join(local_path, "bert_layers.py")
    if os.path.exists(bert_layer_path):
        with open(bert_layer_path, "r") as f: content = f.read()

        if "from .flash_attn_triton import" in content:
            content = content.replace(
                "from .flash_attn_triton import flash_attn_qkvpacked_func",
                "flash_attn_qkvpacked_func = None # Patched"
            )

            content = content.replace(
                "from .flash_attn_triton import flash_attn_func",
                "flash_attn_func = None # Patched"
            )

        # Force every runtime check of the flag onto the non-flash branch,
        # regardless of what the loaded config says.
        content = content.replace("self.config.use_flash_attn", "False")

        with open(bert_layer_path, "w") as f: f.write(content)

        print("Configuration patched for CPU.")

    tokenizer = AutoTokenizer.from_pretrained(local_path, trust_remote_code=True)
    model = AutoModel.from_pretrained(local_path, trust_remote_code=True)

    return tokenizer, model
70
+
71
+
72
# Load the model once at import time. The API is unusable without it, so any
# failure here is logged and aborts the process (top-level boundary handler).
try:
    tokenizer, model = load_cpu_safe_model()
    device = torch.device("cpu")  # CPU-only deployment target
    model.to(device)
    model.eval()  # inference mode: disables dropout etc.
    print("AI Engine Ready.")
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
81
+
82
+
83
# FastAPI application plus a scratch directory for uploads.
# NOTE(review): UPLOAD_DIR is created but not referenced in this file —
# presumably used by code elsewhere or planned; verify before removing.
app = FastAPI()
UPLOAD_DIR = "temp_uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)  # idempotent: safe across restarts
86
+
87
def get_embeddings(sequences, batch_size=4):
    """Embed sequences with the module-level DNABERT-2 model.

    Runs the model in small batches and mean-pools the token embeddings,
    weighting by the attention mask so padding tokens do not contribute.

    Args:
        sequences: list of DNA sequence strings.
        batch_size: sequences per forward pass (kept small to bound CPU RAM).

    Returns:
        np.ndarray of shape (len(sequences), hidden_dim); an empty array when
        `sequences` is empty.

    Raises:
        RuntimeError: if a batch fails to embed. The previous behavior of
        printing and skipping the batch silently shifted every subsequent row,
        breaking the index<->embedding correspondence the caller relies on
        when mapping cluster labels back to sequences.
    """
    all_embeddings = []

    for start in range(0, len(sequences), batch_size):
        batch = sequences[start : start + batch_size]
        try:
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            # Remote-code models may return a plain tuple instead of a
            # ModelOutput; handle both shapes.
            if isinstance(outputs, tuple):
                hidden_states = outputs[0]
            else:
                hidden_states = outputs.last_hidden_state

            # Masked mean pooling. Cast the mask to float: clamping an
            # integer tensor with a float min can raise a dtype error on
            # recent torch versions.
            mask = inputs['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float()
            sum_embeddings = torch.sum(hidden_states * mask, 1)
            sum_mask = torch.clamp(mask.sum(1), min=1e-9)  # avoid div-by-zero
            mean_embeddings = sum_embeddings / sum_mask
            all_embeddings.append(mean_embeddings.cpu().numpy())
        except Exception as e:
            print(f"Embedding Error: {e}")
            # Fail loudly: dropping a batch would misalign all later rows
            # with their input indices.
            raise RuntimeError(f"Failed to embed batch starting at index {start}") from e

    return np.concatenate(all_embeddings, axis=0) if all_embeddings else np.array([])
114
+
115
+ @app.websocket("/ws")
116
+ async def websocket_endpoint(websocket: WebSocket):
117
+ await websocket.accept()
118
+ try:
119
+ while True:
120
+
121
+ data = await websocket.receive_text()
122
+
123
+
124
+ if data.strip().startswith("@"): # FASTQ
125
+ sequences = []
126
+ try:
127
+ for record in SeqIO.parse(io.StringIO(data), "fastq"):
128
+ sequences.append(str(record.seq))
129
+ except:
130
+ sequences = data.splitlines()
131
+ else:
132
+ sequences = [s.strip() for s in data.splitlines() if s.strip()]
133
+
134
+ if len(sequences) < 2:
135
+ await websocket.send_json({"error": "Send at least 2 sequences"})
136
+ continue
137
+
138
+ await websocket.send_json({"status": "processing", "msg": f"Processing {len(sequences)} reads..."})
139
+
140
+
141
+ embeddings = get_embeddings(sequences)
142
+
143
+
144
+ clusterer = HDBSCAN(min_cluster_size=2, metric='euclidean')
145
+ labels = clusterer.fit_predict(embeddings)
146
+
147
+ counts = dict(Counter(labels.tolist()))
148
+
149
+
150
+ response = {
151
+ "status": "complete",
152
+ "total": len(sequences),
153
+ "clusters": len(counts) - (1 if -1 in counts else 0),
154
+ "anomalies": counts.get(-1, 0),
155
+ "data": [{"cluster": int(l), "seq_idx": i} for i, l in enumerate(labels)]
156
+ }
157
+ await websocket.send_json(response)
158
+
159
+ except WebSocketDisconnect:
160
+ print("Disconnected")
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ transformers
4
+ scikit-learn
5
+ biopython
6
+ numpy
7
+ sentencepiece
8
+ protobuf
9
+ einops
10
+ huggingface_hub