uddeshya-k committed on
Commit
7171285
·
verified ·
1 Parent(s): fdc8930

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +56 -0
  2. config.json +18 -0
  3. model.safetensors +3 -0
  4. modeling_repo_jepa.py +138 -0
README.md CHANGED
@@ -1,3 +1,59 @@
1
  ---
 
 
 
 
 
 
2
  license: mit
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language: en
3
+ tags:
4
+ - code
5
+ - semantic-search
6
+ - jepa
7
+ - code-search
8
  license: mit
9
+ datasets:
10
+ - claudios/code_search_net
11
+ metrics:
12
+ - mrr
13
  ---
14
+
15
+ # Repo-JEPA: Semantic Code Navigator (SOTA 0.90 MRR)
16
+
17
+ A **Joint Embedding Predictive Architecture** (JEPA) for semantic code search, trained on 411,000 real Python functions using an NVIDIA H100.
18
+
19
+ ## 🏆 Performance
20
+
21
+ Tested on 1,000 unseen real-world Python functions from CodeSearchNet.
22
+
23
+ | Metric | Result | Target |
24
+ |--------|--------|--------|
25
+ | **MRR** | **0.9052** | 0.60 |
26
+ | **Hits@1** | **86.2%** | - |
27
+ | **Hits@5** | **95.9%** | - |
28
+ | **Hits@10** | **97.3%** | - |
29
+ | **Median Rank** | **1.0** | - |
30
+
31
+ ## 🧩 Usage (AutoModel)
32
+
33
+ ```python
34
+ from transformers import AutoModel, AutoTokenizer
35
+
36
+ # 1. Load Model
37
+ model = AutoModel.from_pretrained("uddeshya-k/RepoJepa", trust_remote_code=True)
38
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
39
+
40
+
41
+ # 2. Encode Code
42
+ code = "def handle_login(user): return auth.verify(user)"
43
+ code_embed = model.encode_code(**tokenizer(code, return_tensors="pt"))
44
+
45
+ # 3. Encode Query
46
+ query = "how to authenticate users?"
47
+ query_embed = model.encode_query(**tokenizer(query, return_tensors="pt"))
48
+
49
+ # 4. Search
50
+ similarity = (code_embed @ query_embed.T).item()
51
+ print(f"Similarity: {similarity:.4f}")
52
+ ```
53
+
54
+ ## 🏗️ Technical Details
55
+
56
+ - **Backbone**: CodeBERT (RoBERTa-style)
57
+ - **Loss**: VICReg (Variance-Invariance-Covariance Regularization)
58
+ - **Hardware**: NVIDIA H100 PCIe (80GB VRAM)
59
+ - **Optimizer**: AdamW + OneCycleLR
config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "repo-jepa",
3
+ "architectures": ["RepoJEPAModel"],
4
+ "hidden_dim": 768,
5
+ "num_encoder_layers": 12,
6
+ "num_attention_heads": 12,
7
+ "intermediate_dim": 3072,
8
+ "hidden_dropout_prob": 0.1,
9
+ "attention_dropout_prob": 0.1,
10
+ "vocab_size": 50265,
11
+ "max_seq_len": 512,
12
+ "pad_token_id": 1,
13
+ "base_model": "microsoft/codebert-base",
14
+ "auto_map": {
15
+ "AutoConfig": "modeling_repo_jepa.RepoJEPAConfig",
16
+ "AutoModel": "modeling_repo_jepa.RepoJEPAModel"
17
+ }
18
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cf68c7c31f799d637010f3bebe71280e1c19a16e275c8584476866aa95813db
3
+ size 1006717512
modeling_repo_jepa.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Export for Repo-JEPA
3
+
4
+ This file enables loading Repo-JEPA with AutoModel.from_pretrained()
5
+ using trust_remote_code=True.
6
+ """
7
+
8
+ import copy
9
+ from typing import Optional, Tuple
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from transformers import PretrainedConfig, PreTrainedModel, RobertaModel
14
+
15
+
16
class RepoJEPAConfig(PretrainedConfig):
    """Configuration for the Repo-JEPA dual-encoder code-search model.

    Defaults mirror the CodeBERT/RoBERTa geometry (12 layers, 12 heads,
    768-dim hidden states, 50265-token vocabulary); ``base_model`` names the
    Hugging Face checkpoint both encoders are initialised from.
    """

    model_type = "repo-jepa"

    def __init__(
        self,
        hidden_dim: int = 768,
        num_encoder_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_dim: int = 3072,
        hidden_dropout_prob: float = 0.1,
        attention_dropout_prob: float = 0.1,
        vocab_size: int = 50265,
        max_seq_len: int = 512,
        pad_token_id: int = 1,
        base_model: str = "microsoft/codebert-base",
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Record every architecture hyper-parameter verbatim so that
        # ``save_pretrained`` round-trips the full configuration.
        for attr_name, attr_value in (
            ("hidden_dim", hidden_dim),
            ("num_encoder_layers", num_encoder_layers),
            ("num_attention_heads", num_attention_heads),
            ("intermediate_dim", intermediate_dim),
            ("hidden_dropout_prob", hidden_dropout_prob),
            ("attention_dropout_prob", attention_dropout_prob),
            ("vocab_size", vocab_size),
            ("max_seq_len", max_seq_len),
            ("pad_token_id", pad_token_id),
            ("base_model", base_model),
        ):
            setattr(self, attr_name, attr_value)
46
+
47
+
48
class ProjectionHead(nn.Module):
    """Three-layer MLP projector.

    Structure: Linear -> BatchNorm1d -> ReLU -> Linear -> BatchNorm1d ->
    ReLU -> Linear.  Maps pooled encoder outputs of width ``input_dim``
    into the joint embedding space of width ``output_dim``.
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        # First projection brings the features to the target width; the two
        # following stages (BN + ReLU + Linear) operate entirely at
        # ``output_dim``.
        stages = [nn.Linear(input_dim, output_dim)]
        for _ in range(2):
            stages.extend(
                (
                    nn.BatchNorm1d(output_dim),
                    nn.ReLU(inplace=True),
                    nn.Linear(output_dim, output_dim),
                )
            )
        self.layers = nn.Sequential(*stages)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` of shape ``(batch, input_dim)`` to ``(batch, output_dim)``."""
        return self.layers(x)
65
+
66
+
67
class RepoJEPAModel(PreTrainedModel):
    """
    Repo-JEPA: Joint Embedding Predictive Architecture for Code Search.

    Holds two RoBERTa-style encoders: ``context_encoder`` embeds code
    snippets (``encode_code``) and ``target_encoder`` embeds natural-language
    queries (``encode_query``).  Each encoder output is mean-pooled over
    non-padding tokens and passed through its own MLP projection head into a
    shared retrieval space of size ``config.hidden_dim``.
    """

    config_class = RepoJEPAConfig

    def __init__(self, config: RepoJEPAConfig):
        super().__init__(config)

        # Both encoders start from the pretrained base checkpoint; weights
        # saved with this model are then loaded on top by
        # ``from_pretrained``.  NOTE(review): this needs network/cache access
        # to ``config.base_model`` at construction time.
        self.context_encoder = RobertaModel.from_pretrained(
            config.base_model,
            add_pooling_layer=False,
        )
        self.target_encoder = RobertaModel.from_pretrained(
            config.base_model,
            add_pooling_layer=False,
        )

        # Separate projection heads for the code and query branches.
        hidden_size = self.context_encoder.config.hidden_size
        self.context_projector = ProjectionHead(hidden_size, config.hidden_dim)
        self.target_projector = ProjectionHead(hidden_size, config.hidden_dim)

        self.post_init()

    def encode_code(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Encode a code snippet into the shared embedding space.

        Args:
            input_ids: Token ids, shape ``(batch, seq_len)``.
            attention_mask: Optional mask (1 = real token, 0 = padding).

        Returns:
            Projected embeddings of shape ``(batch, config.hidden_dim)``.
        """
        outputs = self.context_encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self._mean_pool(outputs.last_hidden_state, attention_mask)
        return self.context_projector(pooled)

    def encode_query(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Encode a search query (docstring) into the shared embedding space.

        Args:
            input_ids: Token ids, shape ``(batch, seq_len)``.
            attention_mask: Optional mask (1 = real token, 0 = padding).

        Returns:
            Projected embeddings of shape ``(batch, config.hidden_dim)``.
        """
        outputs = self.target_encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self._mean_pool(outputs.last_hidden_state, attention_mask)
        return self.target_projector(pooled)

    def _mean_pool(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor],  # was mis-annotated as torch.Tensor; None is accepted
    ) -> torch.Tensor:
        """Average token embeddings, ignoring padding when a mask is given."""
        if attention_mask is None:
            return hidden_states.mean(dim=1)
        mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        sum_hidden = torch.sum(hidden_states * mask, dim=1)
        # Clamp guards against division by zero for all-padding rows.
        sum_mask = torch.clamp(mask.sum(dim=1), min=1e-9)
        return sum_hidden / sum_mask

    def forward(self, **kwargs):
        """PreTrainedModel requires a forward(); defaults to code encoding.

        Raises:
            NotImplementedError: if ``input_ids`` is not supplied.
        """
        if "input_ids" in kwargs:
            return self.encode_code(kwargs["input_ids"], kwargs.get("attention_mask"))
        raise NotImplementedError("Use .encode_code() or .encode_query() specifically.")
129
+
130
+
131
# Register with the Auto* factories so that
# AutoModel.from_pretrained(..., trust_remote_code=True) resolves this class.
try:
    from transformers import AutoConfig, AutoModel

    AutoConfig.register("repo-jepa", RepoJEPAConfig)
    AutoModel.register(RepoJEPAConfig, RepoJEPAModel)
except Exception:
    # Best-effort: registration may fail (already registered, or an older
    # transformers without Auto registration).  The original bare ``except:``
    # also swallowed KeyboardInterrupt/SystemExit; catch Exception only.
    pass
138
+