brocks1234 committed on
Commit
18bdcac
·
verified ·
1 Parent(s): a669541

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +34 -12
handler.py CHANGED
@@ -1,18 +1,34 @@
1
  import torch
 
2
  from typing import Any, Dict, List
 
 
 
 
3
  from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM
4
 
5
  class EndpointHandler:
6
  def __init__(self, path=""):
7
- # We explicitly load the config first to satisfy the toolkit's hunger for info
8
- self.config = AutoConfig.from_pretrained(path, trust_remote_code=True)
 
9
 
10
- self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
 
 
 
 
 
 
 
 
 
 
11
 
12
- # We load the model using the config we just initialized
13
  self.model = AutoModelForMaskedLM.from_pretrained(
14
- path,
15
- config=self.config,
16
  trust_remote_code=True
17
  )
18
 
@@ -21,12 +37,12 @@ class EndpointHandler:
21
  self.model.eval()
22
 
23
  def __call__(self, data: Dict[str, Any]) -> List[float]:
24
- # Handle input safely
25
- inputs = data.get("inputs", data)
26
  if isinstance(inputs, list):
27
  inputs = inputs[0]
28
 
29
- # Use 1000bp chunks for the 12.2kb APRIL promoter
30
  chunk_size = 1000
31
  stride = 500
32
  chunks = [inputs[i:i + chunk_size] for i in range(0, len(inputs), stride)]
@@ -34,15 +50,21 @@ class EndpointHandler:
34
  all_embeddings = []
35
  with torch.no_grad():
36
  for chunk in chunks:
37
- tokens = self.tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=chunk_size)
 
 
 
 
 
 
38
  if torch.cuda.is_available():
39
  tokens = {k: v.to("cuda") for k, v in tokens.items()}
40
 
41
  outputs = self.model(**tokens, output_hidden_states=True)
42
- # Mean pool last hidden state
43
  chunk_emb = torch.mean(outputs.hidden_states[-1], dim=1).squeeze()
44
  all_embeddings.append(chunk_emb)
45
 
46
- # Average the chunks into one vector for LangGraph
47
  final_embedding = torch.stack(all_embeddings).mean(dim=0).cpu().numpy().tolist()
48
  return final_embedding
 
1
  import torch
2
+ import os
3
  from typing import Any, Dict, List
4
+
5
+ # Force the environment variable inside the script as well
6
+ os.environ["HF_HUB_TRUST_REMOTE_CODE"] = "True"
7
+
8
  from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM
9
 
10
  class EndpointHandler:
11
  def __init__(self, path=""):
12
+ # We point to the specific InstaDeep model directly to avoid
13
+ # any local repository naming conflicts during the 'path' resolution
14
+ self.model_id = "InstaDeepAI/nucleotide-transformer-v2-50m-multi-species"
15
 
16
+ # 1. Load Config first with explicit trust
17
+ self.config = AutoConfig.from_pretrained(
18
+ self.model_id,
19
+ trust_remote_code=True
20
+ )
21
+
22
+ # 2. Load Tokenizer
23
+ self.tokenizer = AutoTokenizer.from_pretrained(
24
+ self.model_id,
25
+ trust_remote_code=True
26
+ )
27
 
28
+ # 3. Load Model
29
  self.model = AutoModelForMaskedLM.from_pretrained(
30
+ self.model_id,
31
+ config=self.config,
32
  trust_remote_code=True
33
  )
34
 
 
37
  self.model.eval()
38
 
39
  def __call__(self, data: Dict[str, Any]) -> List[float]:
40
+ # Handle inputs from the toolkit JSON
41
+ inputs = data.pop("inputs", data)
42
  if isinstance(inputs, list):
43
  inputs = inputs[0]
44
 
45
+ # APRIL Promoter Chunking (12.2kb)
46
  chunk_size = 1000
47
  stride = 500
48
  chunks = [inputs[i:i + chunk_size] for i in range(0, len(inputs), stride)]
 
50
  all_embeddings = []
51
  with torch.no_grad():
52
  for chunk in chunks:
53
+ tokens = self.tokenizer(
54
+ chunk,
55
+ return_tensors='pt',
56
+ padding=True,
57
+ truncation=True,
58
+ max_length=chunk_size
59
+ )
60
  if torch.cuda.is_available():
61
  tokens = {k: v.to("cuda") for k, v in tokens.items()}
62
 
63
  outputs = self.model(**tokens, output_hidden_states=True)
64
+ # Last hidden state mean pooling
65
  chunk_emb = torch.mean(outputs.hidden_states[-1], dim=1).squeeze()
66
  all_embeddings.append(chunk_emb)
67
 
68
+ # Average the chunks for one representative vector
69
  final_embedding = torch.stack(all_embeddings).mean(dim=0).cpu().numpy().tolist()
70
  return final_embedding