Commit: Fix datasets metadata: CoRNStack, StackOverflow, CodeSearchNet

File changed: README.md
@@ -13,8 +13,9 @@ tags:
 - feature-extraction
 - sentence-transformers
 datasets:
-- 
-- bigcode/
+- code-rag-bench/cornstack
+- bigcode/stackoverflow
+- code_search_net
 pipeline_tag: feature-extraction
 base_model: Qwen/Qwen2.5-Coder-0.5B
 model-index:
@@ -110,7 +111,8 @@ model.eval()
 def encode(texts, is_query=False):
     # Add instruction prefix for queries
     if is_query:
-        texts = [f"Instruct: Find the most relevant code snippet given the following query
+        texts = [f"Instruct: Find the most relevant code snippet given the following query:
+Query: {t}" for t in texts]
 
     inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
 
@@ -130,9 +132,12 @@ def encode(texts, is_query=False):
 # Example: Code Search
 query = "How to sort a list in Python"
 code_snippets = [
-    "def sort_list(lst)
-
-    "def
+    "def sort_list(lst):
+    return sorted(lst)",
+    "def add_numbers(a, b):
+    return a + b",
+    "def reverse_string(s):
+    return s[::-1]",
 ]
 
 query_emb = encode([query], is_query=True)
@@ -140,18 +145,9 @@ code_embs = encode(code_snippets, is_query=False)
 
 # Compute similarities
 similarities = (query_emb @ code_embs.T).squeeze()
-print(f"Query: {
+print(f"Query: {query}")
 for i, (code, sim) in enumerate(zip(code_snippets, similarities)):
-    print(f"  [{
-```
-
-### With Sentence Transformers (Coming Soon)
-
-```python
-from sentence_transformers import SentenceTransformer
-
-model = SentenceTransformer("faisalmumtaz/codecompass-embed")
-embeddings = model.encode(["def hello(): print('world')"])
+    print(f"  [{sim:.4f}] {code[:50]}...")
 ```
 
 ## Instruction Templates
@@ -160,10 +156,14 @@ For optimal performance, use these instruction prefixes for queries:
 
 | Task | Instruction Template |
 |------|---------------------|
-| NL → Code | `Instruct: Find the most relevant code snippet given the following query
-
-
-
+| NL → Code | `Instruct: Find the most relevant code snippet given the following query:
+Query: {query}` |
+| Code → Code | `Instruct: Find an equivalent code snippet given the following code snippet:
+Query: {query}` |
+| Tech Q&A | `Instruct: Find the most relevant answer given the following question:
+Query: {query}` |
+| Text → SQL | `Instruct: Given a natural language question and schema, find the corresponding SQL query:
+Query: {query}` |
 
 **Note**: Document/corpus texts do NOT need instruction prefixes.
 
@@ -188,13 +188,13 @@ For optimal performance, use these instruction prefixes for queries:
 ## Citation
 
 ```bibtex
-@misc{
-author = {
-title = {
-year = {
-publisher = {
-url = {
-}
+@misc{codecompass2026,
+author = {Faisal Mumtaz},
+title = {CodeCompass-Embed: A Code Embedding Model for Semantic Code Search},
+year = {2026},
+publisher = {Hugging Face},
+url = {https://huggingface.co/faisalmumtaz/codecompass-embed}
+}
 ```
 
 ## License
README.md after the change (updated excerpts, new-file line numbers):

Front matter (lines 13–21):

- feature-extraction
- sentence-transformers
datasets:
- code-rag-bench/cornstack
- bigcode/stackoverflow
- code_search_net
pipeline_tag: feature-extraction
base_model: Qwen/Qwen2.5-Coder-0.5B
model-index:

Usage (lines 111–118):

def encode(texts, is_query=False):
    # Add instruction prefix for queries
    if is_query:
        texts = [f"Instruct: Find the most relevant code snippet given the following query:
Query: {t}" for t in texts]

    inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

Example: Code Search (lines 132–151):

# Example: Code Search
query = "How to sort a list in Python"
code_snippets = [
    "def sort_list(lst):
    return sorted(lst)",
    "def add_numbers(a, b):
    return a + b",
    "def reverse_string(s):
    return s[::-1]",
]

query_emb = encode([query], is_query=True)
code_embs = encode(code_snippets, is_query=False)

# Compute similarities
similarities = (query_emb @ code_embs.T).squeeze()
print(f"Query: {query}")
for i, (code, sim) in enumerate(zip(code_snippets, similarities)):
    print(f"  [{sim:.4f}] {code[:50]}...")
```

Instruction Templates (lines 153–169):

## Instruction Templates

| Task | Instruction Template |
|------|---------------------|
| NL → Code | `Instruct: Find the most relevant code snippet given the following query:
Query: {query}` |
| Code → Code | `Instruct: Find an equivalent code snippet given the following code snippet:
Query: {query}` |
| Tech Q&A | `Instruct: Find the most relevant answer given the following question:
Query: {query}` |
| Text → SQL | `Instruct: Given a natural language question and schema, find the corresponding SQL query:
Query: {query}` |

**Note**: Document/corpus texts do NOT need instruction prefixes.

Citation (lines 188–200):

## Citation

```bibtex
@misc{codecompass2026,
author = {Faisal Mumtaz},
title = {CodeCompass-Embed: A Code Embedding Model for Semantic Code Search},
year = {2026},
publisher = {Hugging Face},
url = {https://huggingface.co/faisalmumtaz/codecompass-embed}
}
```

## License