Piyush Bhoyar committed on
Commit
92179c3
·
1 Parent(s): bdce192

Add application file

Browse files
Files changed (5) hide show
  1. .dockerignore +33 -0
  2. .gitignore +56 -0
  3. Dockerfile +18 -0
  4. main.py +160 -0
  5. requirements.txt +10 -0
.dockerignore ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Git
2
+ .git
3
+ .gitignore
4
+
5
+ # Python Cache
6
+ __pycache__
7
+ *.pyc
8
+ *.pyo
9
+ *.pyd
10
+
11
+ # Local Virtual Environments (Crucial to ignore)
12
+ venv
13
+ env
14
+ .env
15
+ .venv
16
+
17
+ # Local Model Folders (Docker will download a fresh copy on Linux)
18
+ dnabert_local
19
+ *.bin
20
+ *.safetensors
21
+
22
+ # Mac System Files
23
+ .DS_Store
24
+
25
+ # Frontend Artifacts (If you have them in the same folder)
26
+ node_modules
27
+ .next
28
+ build
29
+ dist
30
+
31
+ # Docker
32
+ Dockerfile
33
+ .dockerignore
.gitignore ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- Python ---
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ .venv/
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # --- FastAPI / Environment ---
27
+ .env
28
+ .env.local
29
+
30
+ # --- Node.js / Next.js ---
31
+ node_modules/
32
+ /.next/
33
+ /out/
34
+ .next
35
+ npm-debug.log*
36
+ yarn-debug.log*
37
+ yarn-error.log*
38
+ .pnpm-debug.log*
39
+
40
+ # --- Project Specific (IMPORTANT) ---
41
+ # Don't push the massive AI model to GitHub
42
+ dnabert_local/
43
+ *.bin
44
+ *.safetensors
45
+ *.pth
46
+ *.pt
47
+
48
+ # --- Mac OS ---
49
+ .DS_Store
50
+ .AppleDouble
51
+ .LSOverride
52
+
53
+ # --- IDEs ---
54
+ .vscode/
55
+ .idea/
56
+ *.swp
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CPU-only inference image for the BioSentinel FastAPI service.
FROM python:3.10-slim

WORKDIR /app

# git is required so pip / huggingface_hub can fetch repos that need it.
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir --upgrade pip

# Install torch in its own layer (large, rarely changes) so Docker layer
# caching keeps rebuilds fast when only requirements.txt or main.py change.
RUN pip install --no-cache-dir "torch>=2.6.0"

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY main.py .

# Port 7860 — the Hugging Face Spaces convention; matches the CMD below.
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import shutil
4
+ import uuid
5
+ import json
6
+ import torch
7
+ import numpy as np
8
+ import io
9
+ from collections import Counter
10
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
11
+ from transformers import AutoTokenizer, AutoModel
12
+ from sklearn.cluster import HDBSCAN
13
+ from Bio import SeqIO
14
+ from huggingface_hub import snapshot_download
15
+
16
+
17
+ print("Initializing BioSentinel...")
18
+
19
def load_cpu_safe_model():
    """Download DNABERT-2 to a local folder and patch it to run on CPU.

    The upstream checkpoint's remote code imports Triton flash-attention,
    which fails on CPU-only machines. This function rewrites the snapshot's
    config.json and bert_layers.py on disk to disable/stub flash-attn, then
    loads the patched model.

    Returns:
        (tokenizer, model): Hugging Face tokenizer and model loaded from the
        patched local snapshot with trust_remote_code=True.
    """
    checkpoint = "zhihan1996/DNABERT-2-117M"
    local_path = "./dnabert_local"

    # Download once; later startups reuse the already-patched local copy.
    if not os.path.exists(local_path):
        print("Downloading model...")
        snapshot_download(repo_id=checkpoint, local_dir=local_path)

    config_path = os.path.join(local_path, "config.json")
    if os.path.exists(config_path):
        with open(config_path, "r") as f:
            config_data = json.load(f)

        # Disable flash attention and point auto_map at the repo's plain
        # BERT classes so AutoModel resolves to the CPU-capable code path.
        config_data["use_flash_attn"] = False
        config_data["auto_map"] = {
            "AutoConfig": "configuration_bert.BertConfig",
            "AutoModel": "bert_layers.BertModel",
            "AutoModelForMaskedLM": "bert_layers.BertForMaskedLM"
        }

        with open(config_path, "w") as f:
            json.dump(config_data, f, indent=2)

    # Patch the remote-code module itself: the Triton imports raise at import
    # time on CPU-only hosts, so replace them with None stubs.
    bert_layer_path = os.path.join(local_path, "bert_layers.py")
    if os.path.exists(bert_layer_path):
        with open(bert_layer_path, "r") as f: content = f.read()

        if "from .flash_attn_triton import" in content:
            content = content.replace(
                "from .flash_attn_triton import flash_attn_qkvpacked_func",
                "flash_attn_qkvpacked_func = None # Patched"
            )

            content = content.replace(
                "from .flash_attn_triton import flash_attn_func",
                "flash_attn_func = None # Patched"
            )

        # Force every runtime check of the flag onto the non-flash branch,
        # regardless of what the loaded config says.
        content = content.replace("self.config.use_flash_attn", "False")

        with open(bert_layer_path, "w") as f: f.write(content)

        print("Configuration patched for CPU.")

    tokenizer = AutoTokenizer.from_pretrained(local_path, trust_remote_code=True)
    model = AutoModel.from_pretrained(local_path, trust_remote_code=True)

    return tokenizer, model
70
+
71
+
72
# Load the model once at import time. The API is unusable without it, so any
# failure here is logged and aborts the process (top-level boundary handler).
try:
    tokenizer, model = load_cpu_safe_model()
    device = torch.device("cpu")  # CPU-only deployment target
    model.to(device)
    model.eval()  # inference mode: disables dropout etc.
    print("AI Engine Ready.")
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
81
+
82
+
83
# FastAPI application plus a scratch directory for uploads.
# NOTE(review): UPLOAD_DIR is created but not referenced in this file —
# presumably used by code elsewhere or planned; verify before removing.
app = FastAPI()
UPLOAD_DIR = "temp_uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)  # idempotent: safe across restarts
86
+
87
def get_embeddings(sequences, batch_size=4):
    """Embed sequences with the module-level DNABERT-2 model.

    Runs the model in small batches and mean-pools the token embeddings,
    weighting by the attention mask so padding tokens do not contribute.

    Args:
        sequences: list of DNA sequence strings.
        batch_size: sequences per forward pass (kept small to bound CPU RAM).

    Returns:
        np.ndarray of shape (len(sequences), hidden_dim); an empty array when
        `sequences` is empty.

    Raises:
        RuntimeError: if a batch fails to embed. The previous behavior of
        printing and skipping the batch silently shifted every subsequent row,
        breaking the index<->embedding correspondence the caller relies on
        when mapping cluster labels back to sequences.
    """
    all_embeddings = []

    for start in range(0, len(sequences), batch_size):
        batch = sequences[start : start + batch_size]
        try:
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            # Remote-code models may return a plain tuple instead of a
            # ModelOutput; handle both shapes.
            if isinstance(outputs, tuple):
                hidden_states = outputs[0]
            else:
                hidden_states = outputs.last_hidden_state

            # Masked mean pooling. Cast the mask to float: clamping an
            # integer tensor with a float min can raise a dtype error on
            # recent torch versions.
            mask = inputs['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float()
            sum_embeddings = torch.sum(hidden_states * mask, 1)
            sum_mask = torch.clamp(mask.sum(1), min=1e-9)  # avoid div-by-zero
            mean_embeddings = sum_embeddings / sum_mask
            all_embeddings.append(mean_embeddings.cpu().numpy())
        except Exception as e:
            print(f"Embedding Error: {e}")
            # Fail loudly: dropping a batch would misalign all later rows
            # with their input indices.
            raise RuntimeError(f"Failed to embed batch starting at index {start}") from e

    return np.concatenate(all_embeddings, axis=0) if all_embeddings else np.array([])
114
+
115
+ @app.websocket("/ws")
116
+ async def websocket_endpoint(websocket: WebSocket):
117
+ await websocket.accept()
118
+ try:
119
+ while True:
120
+
121
+ data = await websocket.receive_text()
122
+
123
+
124
+ if data.strip().startswith("@"): # FASTQ
125
+ sequences = []
126
+ try:
127
+ for record in SeqIO.parse(io.StringIO(data), "fastq"):
128
+ sequences.append(str(record.seq))
129
+ except:
130
+ sequences = data.splitlines()
131
+ else:
132
+ sequences = [s.strip() for s in data.splitlines() if s.strip()]
133
+
134
+ if len(sequences) < 2:
135
+ await websocket.send_json({"error": "Send at least 2 sequences"})
136
+ continue
137
+
138
+ await websocket.send_json({"status": "processing", "msg": f"Processing {len(sequences)} reads..."})
139
+
140
+
141
+ embeddings = get_embeddings(sequences)
142
+
143
+
144
+ clusterer = HDBSCAN(min_cluster_size=2, metric='euclidean')
145
+ labels = clusterer.fit_predict(embeddings)
146
+
147
+ counts = dict(Counter(labels.tolist()))
148
+
149
+
150
+ response = {
151
+ "status": "complete",
152
+ "total": len(sequences),
153
+ "clusters": len(counts) - (1 if -1 in counts else 0),
154
+ "anomalies": counts.get(-1, 0),
155
+ "data": [{"cluster": int(l), "seq_idx": i} for i, l in enumerate(labels)]
156
+ }
157
+ await websocket.send_json(response)
158
+
159
+ except WebSocketDisconnect:
160
+ print("Disconnected")
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ transformers
4
+ scikit-learn
5
+ biopython
6
+ numpy
7
+ sentencepiece
8
+ protobuf
9
+ einops
10
+ huggingface_hub