Spaces:
Sleeping
Sleeping
| """ | |
| PropertyRetriever class for finding similar properties based on code embeddings. | |
| It loads a dataset of properties, computes embeddings for their critical code sections, | |
| and provides a method to retrieve the most similar property given a new code snippet. | |
| """ | |
| # ! Current data contains properties from contracts.json, making it more likely to find an exact match | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import normalize | |
| from data.data_loader import DEFAULT_CSV_PATH | |
| from dotenv import dotenv_values | |
| SIMILARITY_THRESHOLD = 0.8 # Minimum similarity score (dot product of L2-normalized embeddings) for a match; adjust as needed based on validation | |
| # ------------------------------------------------------------------- | |
| # 1. Load the dataset and build the vector database (offline/once) | |
| # ------------------------------------------------------------------- | |
class PropertyRetriever:
    """Find the dataset property whose critical code best matches a snippet.

    The dataset is read from CSV at construction time. The embedding model
    and the vector index are built by ``load_model()``, which
    ``get_similar_property()`` calls on demand if it has not run yet.
    """

    # Columns tried, in order, when choosing the text to embed for a row.
    _CODE_COLUMNS = ('FunctionBodies', 'RelatedFunctions', 'RuleContent')

    def __init__(self, csv_path=None, similarity_threshold=None):
        """
        csv_path : path to the CSV file containing the columns:
            SpecHash, SpecIndex, Type, Name, StartLine, EndLine,
            MethodsInRule, RuleContent, RelatedFunctions,
            FunctionBodies, FilePath, Project, ContractCode,
            StateVarAssignment, RuleContentNL, Funcitonality
            Defaults to DEFAULT_CSV_PATH when omitted.
        similarity_threshold : minimum dot product to consider a match.
            Defaults to SIMILARITY_THRESHOLD when omitted.
        """
        # Defaults are resolved here rather than in the signature so that
        # calling with no arguments behaves exactly as before.
        self.df = pd.read_csv(DEFAULT_CSV_PATH if csv_path is None else csv_path)
        self.threshold = (
            SIMILARITY_THRESHOLD if similarity_threshold is None
            else similarity_threshold
        )
        self.embedder = None
        # Both are filled in by load_model().
        self.critical_codes = []
        self.embeddings = None

    @staticmethod
    def _critical_code(row) -> str:
        """Return the first non-blank code text found in the row, or ''."""
        for column in PropertyRetriever._CODE_COLUMNS:
            value = row.get(column, '')
            if pd.isna(value):
                continue
            # Cells may be parsed as non-string types; coerce before strip().
            text = str(value)
            if text.strip():
                return text
        return ''

    def load_model(self):
        """Use a lightweight, open-source embedding model.

        Creates the SentenceTransformer if it does not exist yet, then
        (re)computes one L2-normalized embedding per dataset row and stores
        the result in ``self.embeddings``. The text embedded for each row is
        FunctionBodies, falling back to RelatedFunctions and then
        RuleContent when the preferred column is missing or blank.
        """
        if self.embedder is None:
            # Imported here rather than at module level, so the dependency
            # is only required once a model is actually loaded.
            from sentence_transformers import SentenceTransformer

            # Pass None instead of an empty string when HF_TOKEN is not set.
            hf_token = dotenv_values(".env").get('HF_TOKEN') or None
            self.embedder = SentenceTransformer(
                'all-MiniLM-L6-v2',
                use_auth_token=hf_token,
            )

        self.critical_codes = [
            self._critical_code(row) for _, row in self.df.iterrows()
        ]

        if not self.critical_codes:
            # Nothing to embed: keep an empty index so lookups return "".
            self.embeddings = np.empty((0, 0), dtype=np.float32)
            return

        vectors = self.embedder.encode(self.critical_codes, show_progress_bar=True)  # type: ignore
        # Normalize so that dot product == cosine similarity.
        self.embeddings = normalize(vectors, norm='l2')

    def get_similar_property(self, input_code: str) -> str:
        """
        Given a Solidity function code string, return the RuleContentNL value
        of the most similar property in the dataset, or an empty string if no
        property reaches the similarity threshold.

        An empty string is also returned when ``input_code`` is empty or not
        a string, when the dataset has no rows, or when the matched row has
        no RuleContentNL value.
        """
        if not input_code or not isinstance(input_code, str):
            return ""

        # Build the model and index on first use.
        if self.embedder is None or self.embeddings is None:
            self.load_model()

        if len(self.embeddings) == 0:
            return ""

        # Embed and normalize the query the same way as the index.
        query_emb = self.embedder.encode([input_code])  # type: ignore
        query_emb = normalize(query_emb, norm='l2')

        # Dot products against every indexed vector.
        similarities = np.dot(self.embeddings, query_emb.T).flatten()

        best_idx = int(np.argmax(similarities))
        if similarities[best_idx] < self.threshold:
            # No sufficiently similar property found.
            return ""

        matched = self.df.iloc[best_idx]['RuleContentNL']
        # A missing cell is read as NaN; honor the declared str return type.
        return "" if pd.isna(matched) else str(matched)