Simplify README: single benchmark table, factual highlights
Browse files
README.md
CHANGED
|
@@ -41,9 +41,6 @@ model-index:
|
|
| 41 |
- type: ndcg@10
|
| 42 |
value: 0.9228
|
| 43 |
name: NDCG@10
|
| 44 |
-
- type: mrr@10
|
| 45 |
-
value: 0.9106
|
| 46 |
-
name: MRR@10
|
| 47 |
---
|
| 48 |
|
| 49 |
# CodeCompass-Embed
|
|
@@ -90,91 +87,68 @@ Evaluated on the [CoIR Benchmark](https://github.com/CoIR-team/coir) (NDCG@10).
|
|
| 90 |
|
| 91 |
## Usage
|
| 92 |
|
| 93 |
-
### With Transformers
|
| 94 |
-
|
| 95 |
```python
|
| 96 |
import torch
|
| 97 |
import torch.nn.functional as F
|
| 98 |
from transformers import AutoModel, AutoTokenizer
|
| 99 |
|
| 100 |
-
# Load model
|
| 101 |
model = AutoModel.from_pretrained("faisalmumtaz/codecompass-embed", trust_remote_code=True)
|
| 102 |
tokenizer = AutoTokenizer.from_pretrained("faisalmumtaz/codecompass-embed")
|
| 103 |
|
| 104 |
-
#
|
| 105 |
for layer in model.layers:
|
| 106 |
layer.self_attn.is_causal = False
|
| 107 |
|
| 108 |
model.eval()
|
| 109 |
|
| 110 |
def encode(texts, is_query=False):
|
| 111 |
-
# Add instruction prefix for queries
|
| 112 |
if is_query:
|
| 113 |
-
texts = [f"Instruct: Find the most relevant code snippet given the following query
|
|
|
|
| 114 |
|
| 115 |
inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
|
| 116 |
|
| 117 |
with torch.no_grad():
|
| 118 |
outputs = model(**inputs, output_hidden_states=True)
|
| 119 |
hidden = outputs.hidden_states[-1]
|
| 120 |
-
|
| 121 |
-
# Mean pooling
|
| 122 |
mask = inputs["attention_mask"].unsqueeze(-1).float()
|
| 123 |
embeddings = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
|
| 124 |
-
|
| 125 |
-
# L2 normalize
|
| 126 |
embeddings = F.normalize(embeddings, p=2, dim=-1)
|
| 127 |
|
| 128 |
return embeddings
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
"def sort_list(lst):\n return sorted(lst)",
|
| 134 |
-
"def add_numbers(a, b):\n return a + b",
|
| 135 |
-
"def reverse_string(s):\n return s[::-1]",
|
| 136 |
-
]
|
| 137 |
-
|
| 138 |
-
query_emb = encode([query], is_query=True)
|
| 139 |
-
code_embs = encode(code_snippets, is_query=False)
|
| 140 |
-
|
| 141 |
-
# Compute similarities
|
| 142 |
-
similarities = (query_emb @ code_embs.T).squeeze()
|
| 143 |
-
print(f"Query: {query}")
|
| 144 |
-
for i, (code, sim) in enumerate(zip(code_snippets, similarities)):
|
| 145 |
-
print(f" [{sim:.4f}] {code[:50]}...")
|
| 146 |
```
|
| 147 |
|
| 148 |
## Instruction Templates
|
| 149 |
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
| 153 |
-
|
| 154 |
-
|
|
| 155 |
-
|
| 156 |
-
| Tech Q&A | `Instruct: Find the most relevant answer given the following question
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
- **
|
| 166 |
-
- **
|
| 167 |
-
- **
|
| 168 |
-
- **
|
| 169 |
-
- **Effective Batch Size**: 1024 (via GradCache)
|
| 170 |
-
- **Training Steps**: 950
|
| 171 |
- **Hardware**: NVIDIA H100
|
| 172 |
|
| 173 |
## Limitations
|
| 174 |
|
| 175 |
- Weaker on Q&A style tasks (StackOverflow-QA, CodeFeedback)
|
| 176 |
-
- Trained
|
| 177 |
-
- May not generalize well to low-resource programming languages
|
| 178 |
|
| 179 |
## Citation
|
| 180 |
|
|
|
|
| 41 |
- type: ndcg@10
|
| 42 |
value: 0.9228
|
| 43 |
name: NDCG@10
|
|
|
|
|
|
|
|
|
|
| 44 |
---
|
| 45 |
|
| 46 |
# CodeCompass-Embed
|
|
|
|
| 87 |
|
| 88 |
## Usage
|
| 89 |
|
|
|
|
|
|
|
| 90 |
```python
|
| 91 |
import torch
|
| 92 |
import torch.nn.functional as F
|
| 93 |
from transformers import AutoModel, AutoTokenizer
|
| 94 |
|
|
|
|
| 95 |
model = AutoModel.from_pretrained("faisalmumtaz/codecompass-embed", trust_remote_code=True)
|
| 96 |
tokenizer = AutoTokenizer.from_pretrained("faisalmumtaz/codecompass-embed")
|
| 97 |
|
| 98 |
+
# Enable bidirectional attention
|
| 99 |
for layer in model.layers:
|
| 100 |
layer.self_attn.is_causal = False
|
| 101 |
|
| 102 |
model.eval()
|
| 103 |
|
| 104 |
def encode(texts, is_query=False):
|
|
|
|
| 105 |
if is_query:
|
| 106 |
+
texts = [f"Instruct: Find the most relevant code snippet given the following query:\nQuery: {t}" for t in texts]
|
| 108 |
|
| 109 |
inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
|
| 110 |
|
| 111 |
with torch.no_grad():
|
| 112 |
outputs = model(**inputs, output_hidden_states=True)
|
| 113 |
hidden = outputs.hidden_states[-1]
|
|
|
|
|
|
|
| 114 |
mask = inputs["attention_mask"].unsqueeze(-1).float()
|
| 115 |
embeddings = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
|
|
|
|
|
|
|
| 116 |
embeddings = F.normalize(embeddings, p=2, dim=-1)
|
| 117 |
|
| 118 |
return embeddings
|
| 119 |
|
| 120 |
+
query_emb = encode(["sort a list"], is_query=True)
|
| 121 |
+
code_embs = encode(["def sort(lst): return sorted(lst)"])
|
| 122 |
+
similarity = (query_emb @ code_embs.T).item()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
```
|
| 124 |
|
| 125 |
## Instruction Templates
|
| 126 |
|
| 127 |
+
| Task | Template |
|
| 128 |
+
|------|----------|
|
| 129 |
+
| NL to Code | `Instruct: Find the most relevant code snippet given the following query:\nQuery: {q}` |
|
| 131 |
+
| Code to Code | `Instruct: Find an equivalent code snippet given the following code snippet:\nQuery: {q}` |
|
| 133 |
+
| Tech Q&A | `Instruct: Find the most relevant answer given the following question:\nQuery: {q}` |
|
| 135 |
+
| Text to SQL | `Instruct: Given a natural language question and schema, find the corresponding SQL query:\nQuery: {q}` |
|
| 137 |
+
|
| 138 |
+
Documents do not need instruction prefixes.
|
| 139 |
+
|
| 140 |
+
## Training
|
| 141 |
+
|
| 142 |
+
- **Data**: 8.8M samples from CoRNStack, StackOverflow, CodeSearchNet
|
| 143 |
+
- **Loss**: InfoNCE (τ=0.05) with 7 hard negatives per sample
|
| 144 |
+
- **Batch Size**: 1024 (via GradCache)
|
| 145 |
+
- **Steps**: 950
|
|
|
|
|
|
|
| 146 |
- **Hardware**: NVIDIA H100
|
| 147 |
|
| 148 |
## Limitations
|
| 149 |
|
| 150 |
- Weaker on Q&A style tasks (StackOverflow-QA, CodeFeedback)
|
| 151 |
+
- Trained on Python/JavaScript/Java/Go/PHP/Ruby
|
|
|
|
| 152 |
|
| 153 |
## Citation
|
| 154 |
|