"""
VibeAtlas Code Search Playground
================================

Interactive demo for semantic code search using UniXcoder embeddings.
Deploy to HuggingFace Spaces: https://huggingface.co/spaces/vibeatlas/code-search-playground

Features:
- Natural language → Code search
- Code → Similar code search
- Cross-language pattern matching
- Real-time embedding visualization
"""

import hashlib
import json
from typing import List, Tuple

import gradio as gr
import numpy as np

# For local testing without GPU
try:
    from transformers import AutoModel, AutoTokenizer
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
    print("Warning: PyTorch not available, using mock embeddings")


# Width of the mock vectors; matches the 768 dimensions this file reports
# for UniXcoder in the comparison table.
EMBEDDING_DIM = 768

# Maximum number of characters of the user's snippet echoed back by
# compare_models().
MAX_PREVIEW_CHARS = 500


# Sample code corpus for demonstration.
# Each entry has: "id", "language", "code" (the text that gets embedded) and
# "description" (the heading shown in the results).
SAMPLE_CORPUS = [
    {
        "id": "auth-js-1",
        "language": "javascript",
        "code": """function authenticate(username, password) {
    const user = findUser(username);
    if (!user) return { success: false, error: 'User not found' };

    const isValid = verifyPassword(password, user.hashedPassword);
    if (!isValid) return { success: false, error: 'Invalid password' };

    return { success: true, token: generateToken(user) };
}""",
        "description": "User authentication with password verification"
    },
    {
        "id": "auth-py-1",
        "language": "python",
        "code": """def authenticate(username: str, password: str) -> dict:
    user = find_user(username)
    if not user:
        return {"success": False, "error": "User not found"}

    is_valid = verify_password(password, user.hashed_password)
    if not is_valid:
        return {"success": False, "error": "Invalid password"}

    return {"success": True, "token": generate_token(user)}""",
        "description": "Python authentication function"
    },
    {
        "id": "date-js-1",
        "language": "javascript",
        "code": """function formatDate(date, format = 'YYYY-MM-DD') {
    const year = date.getFullYear();
    const month = String(date.getMonth() + 1).padStart(2, '0');
    const day = String(date.getDate()).padStart(2, '0');

    return format
        .replace('YYYY', year)
        .replace('MM', month)
        .replace('DD', day);
}""",
        "description": "Date formatting utility"
    },
    {
        "id": "validate-email-1",
        "language": "typescript",
        "code": """function validateEmail(email: string): boolean {
    const emailRegex = /^[^\\s@]+@[^\\s@]+\\.[^\\s@]+$/;
    return emailRegex.test(email);
}""",
        "description": "Email validation with regex"
    },
    {
        "id": "fetch-api-1",
        "language": "javascript",
        "code": """async function fetchData(url, options = {}) {
    try {
        const response = await fetch(url, {
            headers: { 'Content-Type': 'application/json' },
            ...options
        });

        if (!response.ok) {
            throw new Error(`HTTP error! status: ${response.status}`);
        }

        return await response.json();
    } catch (error) {
        console.error('Fetch error:', error);
        throw error;
    }
}""",
        "description": "Async fetch wrapper with error handling"
    },
    {
        "id": "sort-array-1",
        "language": "python",
        "code": """def sort_by_key(items: list, key: str, reverse: bool = False) -> list:
    return sorted(items, key=lambda x: x.get(key, ''), reverse=reverse)""",
        "description": "Sort list of dicts by key"
    },
    {
        "id": "cache-decorator-1",
        "language": "python",
        "code": """from functools import lru_cache

@lru_cache(maxsize=128)
def expensive_computation(n: int) -> int:
    if n < 2:
        return n
    return expensive_computation(n - 1) + expensive_computation(n - 2)""",
        "description": "Memoized fibonacci with LRU cache"
    },
    {
        "id": "middleware-1",
        "language": "javascript",
        "code": """function authMiddleware(req, res, next) {
    const token = req.headers.authorization?.split(' ')[1];

    if (!token) {
        return res.status(401).json({ error: 'No token provided' });
    }

    try {
        const decoded = jwt.verify(token, process.env.JWT_SECRET);
        req.user = decoded;
        next();
    } catch (error) {
        res.status(403).json({ error: 'Invalid token' });
    }
}""",
        "description": "JWT authentication middleware for Express"
    },
    # NOTE(review): the two TypeScript snippets below look like they have lost
    # their generic type parameters (e.g. on Map / Promise / Partial and on
    # useDebounce) — confirm against the original before relying on them.
    {
        "id": "class-user-1",
        "language": "typescript",
        "code": """class UserService {
    private users: Map = new Map();

    async createUser(data: CreateUserDTO): Promise {
        const user = new User(data);
        this.users.set(user.id, user);
        return user;
    }

    async findById(id: string): Promise {
        return this.users.get(id);
    }

    async updateUser(id: string, data: Partial): Promise {
        const user = await this.findById(id);
        if (!user) throw new Error('User not found');
        Object.assign(user, data);
        return user;
    }
}""",
        "description": "User service with CRUD operations"
    },
    {
        "id": "react-hook-1",
        "language": "typescript",
        "code": """function useDebounce(value: T, delay: number): T {
    const [debouncedValue, setDebouncedValue] = useState(value);

    useEffect(() => {
        const handler = setTimeout(() => {
            setDebouncedValue(value);
        }, delay);

        return () => clearTimeout(handler);
    }, [value, delay]);

    return debouncedValue;
}""",
        "description": "React debounce hook for input handling"
    }
]


def _l2_normalize(vector: np.ndarray) -> np.ndarray:
    """Return *vector* scaled to unit length; a zero vector is returned as is."""
    norm = np.linalg.norm(vector)
    if norm == 0:
        # Dividing would produce NaNs that poison every similarity score.
        return vector
    return vector / norm


class CodeSearchEngine:
    """Simple code search engine using embeddings.

    On construction the engine tries to load ``microsoft/unixcoder-base``.
    When PyTorch/transformers are not importable, or loading fails, it falls
    back to mock embeddings so the demo UI still works.

    Attributes:
        corpus: The list of snippet dicts being searched (``SAMPLE_CORPUS``).
        embeddings: One unit-length row vector per corpus entry.
        model: The loaded model, or ``None`` in mock mode.
        tokenizer: The loaded tokenizer, or ``None`` in mock mode.
    """

    def __init__(self):
        self.corpus = SAMPLE_CORPUS
        self.embeddings = None
        self.model = None
        self.tokenizer = None
        self._initialize_model()

    def _initialize_model(self):
        """Initialize the embedding model and pre-compute corpus embeddings."""
        if TORCH_AVAILABLE:
            try:
                # Try to load UniXcoder (or fallback to a smaller model)
                model_name = "microsoft/unixcoder-base"
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModel.from_pretrained(model_name)
                self.model.eval()
                print(f"Loaded model: {model_name}")
            except Exception as e:
                # Deliberately broad: any load failure should degrade to mock
                # mode rather than stop the demo from starting.
                print(f"Could not load UniXcoder, using mock: {e}")
                self.model = None
                # Reset both so the engine is never left half-initialized.
                self.tokenizer = None

        # Pre-compute corpus embeddings
        self._compute_corpus_embeddings()

    def _compute_corpus_embeddings(self):
        """Compute embeddings for the entire corpus.

        Stores a ``(len(corpus), dim)`` array in ``self.embeddings``. Rows are
        already unit length because ``_embed_text`` normalizes its result.
        """
        vectors = [self._embed_text(item["code"]) for item in self.corpus]
        if vectors:
            self.embeddings = np.stack(vectors)
        else:
            # np.stack rejects an empty list; keep a well-shaped empty matrix
            # so search() simply returns no results.
            self.embeddings = np.empty((0, EMBEDDING_DIM))

    @staticmethod
    def _mock_embedding(text: str) -> np.ndarray:
        """Return a pseudo-random vector that depends only on *text*.

        Seeding from a hash of the text makes mock mode repeatable: the same
        query always gets the same vector, and therefore the same ranking.
        """
        digest = hashlib.sha256(text.encode("utf-8", errors="replace")).digest()
        seed = int.from_bytes(digest[:8], "big")
        return np.random.default_rng(seed).standard_normal(EMBEDDING_DIM)

    def _embed_text(self, text: str) -> np.ndarray:
        """Generate a unit-length embedding for *text*.

        Uses the loaded model (mean of the last hidden state over the token
        axis) when available, otherwise a deterministic mock vector.
        """
        if self.model is not None and self.tokenizer is not None:
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            )
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Mean pooling; squeeze(0) drops the batch axis of the single input.
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()
            return _l2_normalize(embedding)

        # Mock embedding
        return _l2_normalize(self._mock_embedding(text))

    def search(self, query: str, top_k: int = 5) -> List[Tuple[dict, float]]:
        """Search for similar code snippets.

        Args:
            query: Text (natural language or code) to embed and compare.
            top_k: Maximum number of results. Converted to ``int`` and clamped
                to ``[0, len(corpus)]``, so float or out-of-range values from
                a UI control are safe.

        Returns:
            ``(corpus_entry, similarity)`` pairs, best match first.
        """
        limit = max(0, min(int(top_k), len(self.corpus)))
        if limit == 0:
            return []

        query_embedding = self._embed_text(query)

        # Cosine similarity (both sides are unit length, so a dot product)
        similarities = np.dot(self.embeddings, query_embedding)

        # Get top-k indices, highest similarity first
        top_indices = np.argsort(similarities)[::-1][:limit]

        return [
            (self.corpus[idx], float(similarities[idx]))
            for idx in top_indices
        ]


# Initialize search engine
search_engine = CodeSearchEngine()


def search_code(query: str, search_type: str, top_k: int = 5) -> str:
    """Perform code search and format results as Markdown.

    Args:
        query: The user's search text.
        search_type: Label shown in the output; it does not change how the
            search itself is performed.
        top_k: Maximum number of results to show.

    Returns:
        A Markdown document, or a prompt message when *query* is blank.
    """
    if not query or not query.strip():
        return "Please enter a search query."

    results = search_engine.search(query, top_k=top_k)

    # Format results as markdown; collected in a list and joined once.
    parts = [
        f'## Search Results for: "{query}"\n\n',
        f"*Search type: {search_type}*\n\n",
        "---\n\n",
    ]

    for i, (item, score) in enumerate(results, 1):
        parts.append(f"### {i}. {item['description']}\n")
        parts.append(
            f"**Language:** {item['language']} | **Similarity:** {score:.2%}\n\n"
        )
        parts.append(f"```{item['language']}\n{item['code']}\n```\n\n")
        parts.append("---\n\n")

    return "".join(parts)


def compare_models(code_snippet: str) -> str:
    """Compare MiniLM vs UniXcoder embeddings (mock for demo).

    The figures in the table are fixed text; nothing is computed from the
    snippet, which is only echoed back (truncated to ``MAX_PREVIEW_CHARS``).

    Args:
        code_snippet: The code pasted by the user.

    Returns:
        A Markdown document, or a prompt message when the snippet is blank.
    """
    if not code_snippet or not code_snippet.strip():
        return "Please enter a code snippet to analyze."

    preview = code_snippet[:MAX_PREVIEW_CHARS]
    if len(code_snippet) > MAX_PREVIEW_CHARS:
        # Only signal truncation when something was actually cut off.
        preview += "..."

    # Mock comparison
    return "".join([
        "## Embedding Comparison\n\n",
        "### Input Code\n",
        f"```\n{preview}\n```\n\n",
        "### Model Comparison\n\n",
        "| Model | Dimensions | Quality Score | Speed |\n",
        "|-------|------------|---------------|-------|\n",
        "| MiniLM-L6-v2 | 384 | 72% | 15ms |\n",
        "| **UniXcoder** | **768** | **89%** | 40ms |\n",
        "\n*UniXcoder provides better semantic understanding for code-specific queries.*\n",
    ])


# Create Gradio interface
with gr.Blocks(
    title="VibeAtlas Code Search Playground",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container { max-width: 1200px !important; }
    .header { text-align: center; margin-bottom: 2rem; }
    .cta-button { background: #4F46E5 !important; }
    """
) as demo:

    # NOTE(review): the header arrived as bare text with no markup, leaving
    # the `.header` CSS rule above unused. The structure below is rebuilt
    # around that class. Link targets are taken from the About tab; no GitHub
    # URL appears anywhere in this file, so that entry stays plain text —
    # TODO supply the URL.
    gr.HTML("""
    <div class="header">
        <h1>🔍 VibeAtlas Code Search Playground</h1>
        <p>Experience semantic code search powered by UniXcoder embeddings</p>
        <p>
            <a href="https://vibeatlas.dev">Website</a> |
            <a href="https://marketplace.visualstudio.com/items?itemName=vibeatlas.vibeatlas">VS Code Extension</a> |
            GitHub
        </p>
    </div>
    """)

    with gr.Tabs():
        with gr.TabItem("🔍 Code Search"):
            gr.Markdown("""
            ### Natural Language → Code Search
            Search for code using natural language queries. The model understands *what* code does, not just keyword matching.
            """)

            with gr.Row():
                with gr.Column(scale=1):
                    query_input = gr.Textbox(
                        label="Search Query",
                        placeholder="e.g., 'user authentication with password'",
                        lines=2
                    )
                    search_type = gr.Radio(
                        choices=["Natural Language", "Code Snippet"],
                        value="Natural Language",
                        label="Search Type"
                    )
                    top_k = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=5,
                        step=1,
                        label="Number of Results"
                    )
                    search_btn = gr.Button("🔍 Search", variant="primary")

                with gr.Column(scale=2):
                    results_output = gr.Markdown(label="Results")

            search_btn.click(
                search_code,
                inputs=[query_input, search_type, top_k],
                outputs=results_output
            )

            gr.Examples(
                examples=[
                    ["user authentication with password verification", "Natural Language", 5],
                    ["validate email format", "Natural Language", 3],
                    ["async API fetch with error handling", "Natural Language", 5],
                    ["caching decorator for expensive functions", "Natural Language", 3],
                    ["JWT middleware for Express", "Natural Language", 5],
                ],
                inputs=[query_input, search_type, top_k]
            )

        with gr.TabItem("📊 Model Comparison"):
            gr.Markdown("""
            ### MiniLM vs UniXcoder
            See how code-specific embeddings outperform general-purpose models.
            """)

            code_input = gr.Textbox(
                label="Code Snippet to Analyze",
                placeholder="Paste a code snippet here...",
                lines=10
            )
            compare_btn = gr.Button("📊 Compare Models", variant="primary")
            comparison_output = gr.Markdown()

            compare_btn.click(
                compare_models,
                inputs=code_input,
                outputs=comparison_output
            )

        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ## About VibeAtlas

            **VibeAtlas** is the reliability infrastructure for AI coding. We help developers:

            - 🎯 **Reduce AI token costs** by 40-60% through intelligent context optimization
            - 🔍 **Improve code search accuracy** with semantic understanding
            - 🛡️ **Add governance guardrails** to AI-assisted workflows

            ### This Demo

            This demo showcases our semantic code search powered by [UniXcoder](https://huggingface.co/microsoft/unixcoder-base),
            a code-specific embedding model from Microsoft Research.

            **Key Features:**
            - Natural language → code search
            - Cross-language pattern matching (Python, JavaScript, TypeScript)
            - Semantic similarity (understands code intent, not just keywords)

            ### Try It In Your IDE

            Get the full experience with our VS Code extension:

            ```bash
            code --install-extension vibeatlas.vibeatlas
            ```

            Then use `Ctrl+Shift+P` → "VibeAtlas: Semantic Code Search"

            ### Links

            - 🌐 [Website](https://vibeatlas.dev)
            - 📦 [VS Code Extension](https://marketplace.visualstudio.com/items?itemName=vibeatlas.vibeatlas)
            - 🛠️ [npm Packages](https://www.npmjs.com/org/vibeatlas)
            - 📖 [Documentation](https://docs.vibeatlas.dev)
            - 💬 [Discord Community](https://discord.gg/vibeatlas)

            ### Model Credits

            - [microsoft/unixcoder-base](https://huggingface.co/microsoft/unixcoder-base) - Microsoft Research
            - [vibeatlas/unixcoder-base-onnx](https://huggingface.co/vibeatlas/unixcoder-base-onnx) - ONNX conversion by VibeAtlas
            """)


if __name__ == "__main__":
    demo.launch()