faisalmumtaz committed on
Commit
3a7f436
·
verified ·
1 Parent(s): f60b6ec

Fix datasets metadata: CoRNStack, StackOverflow, CodeSearchNet

Browse files
Files changed (1) hide show
  1. README.md +28 -28
README.md CHANGED
@@ -13,8 +13,9 @@ tags:
13
  - feature-extraction
14
  - sentence-transformers
15
  datasets:
16
- - CoIR-Retrieval/CodeSearchNet-python
17
- - bigcode/the-stack
 
18
  pipeline_tag: feature-extraction
19
  base_model: Qwen/Qwen2.5-Coder-0.5B
20
  model-index:
@@ -110,7 +111,8 @@ model.eval()
110
  def encode(texts, is_query=False):
111
  # Add instruction prefix for queries
112
  if is_query:
113
- texts = [f"Instruct: Find the most relevant code snippet given the following query:\nQuery: {{t}}" for t in texts]
 
114
 
115
  inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
116
 
@@ -130,9 +132,12 @@ def encode(texts, is_query=False):
130
  # Example: Code Search
131
  query = "How to sort a list in Python"
132
  code_snippets = [
133
- "def sort_list(lst):\n return sorted(lst)",
134
- "def add_numbers(a, b):\n return a + b",
135
- "def reverse_string(s):\n return s[::-1]",
 
 
 
136
  ]
137
 
138
  query_emb = encode([query], is_query=True)
@@ -140,18 +145,9 @@ code_embs = encode(code_snippets, is_query=False)
140
 
141
  # Compute similarities
142
  similarities = (query_emb @ code_embs.T).squeeze()
143
- print(f"Query: {{query}}")
144
  for i, (code, sim) in enumerate(zip(code_snippets, similarities)):
145
- print(f" [{{sim:.4f}}] {{code[:50]}}...")
146
- ```
147
-
148
- ### With Sentence Transformers (Coming Soon)
149
-
150
- ```python
151
- from sentence_transformers import SentenceTransformer
152
-
153
- model = SentenceTransformer("faisalmumtaz/codecompass-embed")
154
- embeddings = model.encode(["def hello(): print('world')"])
155
  ```
156
 
157
  ## Instruction Templates
@@ -160,10 +156,14 @@ For optimal performance, use these instruction prefixes for queries:
160
 
161
  | Task | Instruction Template |
162
  |------|---------------------|
163
- | NL → Code | `Instruct: Find the most relevant code snippet given the following query:\nQuery: {{query}}` |
164
- | Code → Code | `Instruct: Find an equivalent code snippet given the following code snippet:\nQuery: {{query}}` |
165
- | Tech Q&A | `Instruct: Find the most relevant answer given the following question:\nQuery: {{query}}` |
166
- | Text → SQL | `Instruct: Given a natural language question and schema, find the corresponding SQL query:\nQuery: {{query}}` |
 
 
 
 
167
 
168
  **Note**: Document/corpus texts do NOT need instruction prefixes.
169
 
@@ -188,13 +188,13 @@ For optimal performance, use these instruction prefixes for queries:
188
  ## Citation
189
 
190
  ```bibtex
191
- @misc{{codecompass2026,
192
- author = {{Faisal Mumtaz}},
193
- title = {{CodeCompass-Embed: A Code Embedding Model for Semantic Code Search}},
194
- year = {{2026}},
195
- publisher = {{Hugging Face}},
196
- url = {{https://huggingface.co/faisalmumtaz/codecompass-embed}}
197
- }}
198
  ```
199
 
200
  ## License
 
13
  - feature-extraction
14
  - sentence-transformers
15
  datasets:
16
+ - code-rag-bench/cornstack
17
+ - bigcode/stackoverflow
18
+ - code_search_net
19
  pipeline_tag: feature-extraction
20
  base_model: Qwen/Qwen2.5-Coder-0.5B
21
  model-index:
 
111
  def encode(texts, is_query=False):
112
  # Add instruction prefix for queries
113
  if is_query:
114
+ texts = [f"Instruct: Find the most relevant code snippet given the following query:\nQuery: {t}" for t in texts]
116
 
117
  inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
118
 
 
132
  # Example: Code Search
133
  query = "How to sort a list in Python"
134
  code_snippets = [
135
+ "def sort_list(lst):\n    return sorted(lst)",
136
+ "def add_numbers(a, b):\n    return a + b",
137
+ "def reverse_string(s):\n    return s[::-1]",
141
  ]
142
 
143
  query_emb = encode([query], is_query=True)
 
145
 
146
  # Compute similarities
147
  similarities = (query_emb @ code_embs.T).squeeze()
148
+ print(f"Query: {query}")
149
  for i, (code, sim) in enumerate(zip(code_snippets, similarities)):
150
+ print(f" [{sim:.4f}] {code[:50]}...")
 
 
 
 
 
 
 
 
 
151
  ```
152
 
153
  ## Instruction Templates
 
156
 
157
  | Task | Instruction Template |
158
  |------|---------------------|
159
+ | NL → Code | `Instruct: Find the most relevant code snippet given the following query:\nQuery: {query}` |
160
+ | Code → Code | `Instruct: Find an equivalent code snippet given the following code snippet:\nQuery: {query}` |
161
+ | Tech Q&A | `Instruct: Find the most relevant answer given the following question:\nQuery: {query}` |
162
+ | Text → SQL | `Instruct: Given a natural language question and schema, find the corresponding SQL query:\nQuery: {query}` |
167
 
168
  **Note**: Document/corpus texts do NOT need instruction prefixes.
169
 
 
188
  ## Citation
189
 
190
  ```bibtex
191
+ @misc{codecompass2026,
192
+ author = {Faisal Mumtaz},
193
+ title = {CodeCompass-Embed: A Code Embedding Model for Semantic Code Search},
194
+ year = {2026},
195
+ publisher = {Hugging Face},
196
+ url = {https://huggingface.co/faisalmumtaz/codecompass-embed}
197
+ }
198
  ```
199
 
200
  ## License