Spaces:
Sleeping
Sleeping
| from functools import lru_cache | |
| import json | |
| from enum import Enum | |
| import os | |
| import re | |
| from fuzzywuzzy import fuzz | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.neighbors import KNeighborsClassifier | |
| import joblib | |
| from nltk.stem import WordNetLemmatizer | |
| import nltk | |
| from model.db.db_setup import get_db, ExtractedFile | |
| nltk.download('wordnet') | |
class MatchMethod(Enum):
    """Tags a lookup result with the strategy that produced it."""

    # Similarity scoring through fuzzywuzzy's ``fuzz.ratio``.
    FUZZY = "fuzzy"
    # Containment test of the query inside an operation name.
    SUBSTRING = "substring"
    # Prediction made by the KNN classifier.
    ML = "ml"
    # NOTE(review): presumably marks a failed ML lookup; not referenced in
    # the visible code -- confirm against callers.
    ML_ERROR = "ml_error"
def get_absolute_path(relative_path):
    """Resolve ``relative_path`` against the directory holding this module.

    Args:
        relative_path: Path fragment appended to this file's directory.

    Returns:
        The joined path, anchored at the absolute directory of this file.
    """
    module_file = os.path.abspath(__file__)
    return os.path.join(os.path.dirname(module_file), relative_path)
def get_model_path(language):
    """Build the on-disk location of the persisted KNN model for a language.

    Args:
        language: Language name embedded in the model file name.

    Returns:
        ``model/<language>_knn_model.h5`` resolved through
        ``get_absolute_path``.
    """
    model_file = f'model/{language}_knn_model.h5'
    return get_absolute_path(model_file)
def load_functions(language: str) -> dict:
    """
    Fetch every stored function for a language from the database.

    Args:
        language: Programming language name; lower-cased before lookup.

    Returns:
        Dictionary merged from the ``file_data`` of every matching file
        (function names mapped to their code, per the stored data). An
        empty dict is returned when the language is unsupported or any
        error occurs while loading.
    """
    # Supported language name -> file extension (without the leading dot).
    extension_by_language = {
        "python": "py",
        "javascript": "js",
        "typescript": "ts",
        "php": "php",
        "java": "java"
    }
    try:
        language = language.lower()
        language_ext = extension_by_language.get(language)
        if not language_ext:
            # Caught by the handler below and reported like any other failure.
            raise ValueError(f"Unsupported language: {language}")

        with get_db() as db:
            files = ExtractedFile.get_by_extension(db, f".{language_ext}")

            merged = {}
            for record in files:
                payload = record.file_data
                if isinstance(payload, dict):
                    merged.update(payload)
                    continue
                if not isinstance(payload, list):
                    # Neither a dict nor a list of dicts: nothing to merge.
                    continue
                # Later entries override earlier ones on duplicate keys.
                for entry in payload:
                    if isinstance(entry, dict):
                        merged.update(entry)
            return merged
    except Exception as e:
        print(f"Error loading functions from database: {str(e)}")
        return {}
def clean_function_name(function_name, language):
    """
    Removes language-specific keywords and strips leading/trailing whitespace using regex.

    Args:
        function_name: The original function name.
        language: The programming language of the function. Matched
            case-insensitively; an unknown (or non-string) language leaves
            the keywords in place and only strips whitespace.

    Returns:
        The cleaned function name.
    """
    keywords_dict = {
        "python": r"\b(def|class|async|await|for|while|if|else|try|except|finally)\b",
        "javascript": r"\b(function|class|async|await|const|let|var|if|else|for|while)\b",
        "typescript": r"\b(function|class|async|await|const|let|var|interface|type|enum|if|else|for|while)\b",
        "php": r"\b(function|class|public|private|protected|if|else|foreach|while|try|catch|finally)\b",
        "java": r"\b(class|public|private|protected|static|final|if|else|for|while|try|catch|finally)\b",
    }
    # load_functions lower-cases the language before its lookup; do the same
    # here so "Python" and "python" are cleaned identically.
    language_key = language.strip().lower() if isinstance(language, str) else language
    pattern = keywords_dict.get(language_key)
    if pattern:
        # Remove all language-specific keywords using regex
        function_name = re.sub(pattern, "", function_name)
    return function_name.strip()
def preprocess_text(text):
    """Lowercase ``text`` and lemmatize each whitespace-separated token.

    Args:
        text: Raw input string.

    Returns:
        The lemmatized tokens joined back together with single spaces.
    """
    wordnet = WordNetLemmatizer()
    tokens = text.lower().split()
    return " ".join(map(wordnet.lemmatize, tokens))
def find_closest_operation_fuzzy(user_input, operations):
    """Pick the operation name most similar to ``user_input``.

    Args:
        user_input: Query string to compare.
        operations: Iterable of operation names (e.g. a dict keyed by name).

    Returns:
        ``(name, MatchMethod.FUZZY)`` where ``name`` is the highest-scoring
        operation, or ``None`` when the best ``fuzz.ratio`` score is below 70.
    """
    scored = ((fuzz.ratio(user_input, name), name) for name in operations)
    # max() keeps the first of equally scored names; the default covers an
    # empty ``operations``.
    top_score, top_name = max(scored, key=lambda pair: pair[0], default=(0, None))
    if top_score < 70:
        return None, MatchMethod.FUZZY
    return top_name, MatchMethod.FUZZY
def find_closest_operation_substring(user_input, operations):
    """Finds the first operation name that contains ``user_input``.

    Args:
        user_input: Query string to search for inside each operation name.
        operations: Iterable of operation names (e.g. a dict keyed by name).

    Returns:
        ``(name, MatchMethod.SUBSTRING)`` for the first name containing the
        query, or ``(None, MatchMethod.SUBSTRING)`` when the query is empty
        or no name contains it.
    """
    # An empty string is a substring of every name; without this guard a
    # blank query would "match" whichever operation happens to come first.
    if not user_input:
        return None, MatchMethod.SUBSTRING
    first_hit = next((name for name in operations if user_input in name), None)
    return first_hit, MatchMethod.SUBSTRING
def train_ml_model(operations, language):
    """Trains a KNN model on operation names and persists it to disk.

    Args:
        operations: Mapping whose keys are the operation names to learn.
        language: Language name used to derive the model file path.

    Returns:
        Tuple ``(knn_model, vectorizer)``: the fitted classifier and the
        TF-IDF vectorizer it was trained with.

    Raises:
        ValueError: If ``operations`` contains no names to train on.
    """
    operation_names = list(operations)
    if not operation_names:
        raise ValueError("Cannot train a model without any operation names")

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(operation_names)

    # Each name is its own class with exactly one sample, so uniform voting
    # over k neighbours can never give a class more than 1/k probability and
    # the 0.5 confidence cutoff applied at prediction time would reject every
    # result. Distance weighting lets a clearly nearest name dominate.
    # n_neighbors is capped because querying with more neighbours than
    # training samples raises.
    knn_model = KNeighborsClassifier(
        n_neighbors=min(3, len(operation_names)),
        weights="distance",
    )
    knn_model.fit(X, operation_names)

    model_path = get_model_path(language)
    # The model directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump((knn_model, vectorizer), model_path)
    return knn_model, vectorizer
def load_ml_model(language):
    """Read the persisted KNN model and vectorizer for ``language``.

    Args:
        language: Language name used to derive the model file path.

    Returns:
        Whatever object was stored at the model path (``train_ml_model``
        stores a ``(knn_model, vectorizer)`` tuple), or ``(None, None)``
        when no model file exists.
    """
    model_path = get_model_path(language)
    if not os.path.exists(model_path):
        return None, None
    return joblib.load(model_path)
def predict_operation_name_ml(user_input, knn_model, vectorizer):
    """Ask the KNN model which operation name ``user_input`` refers to.

    Args:
        user_input: Query string to classify.
        knn_model: Fitted classifier exposing ``predict_proba`` and ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        ``(name, MatchMethod.ML)``; ``name`` is an empty string when the
        highest class probability is below 0.5.
    """
    features = vectorizer.transform([user_input])
    class_probabilities = knn_model.predict_proba(features)[0]
    # Low-confidence predictions are reported as "no match".
    if max(class_probabilities) < 0.5:
        return "", MatchMethod.ML
    predicted_name = knn_model.predict(features)[0]
    return predicted_name, MatchMethod.ML
def get_operation_definition(user_input, language):
    """Look up the stored definition that best matches ``user_input``.

    The query is cleaned of language keywords and lemmatized, then matched
    against the stored operation names with progressively looser strategies:
    exact name, substring, fuzzy ratio, and finally the KNN model (trained
    on demand when no persisted model exists).

    Args:
        user_input: Free-form name or description of the wanted operation.
        language: Programming language whose operations are searched.

    Returns:
        The stored definition of the matched operation, or an empty string
        when nothing is loaded, the query is blank, nothing matches, or the
        ML stage fails.
    """
    operations = load_functions(language)
    if not operations:
        # Nothing to search (unsupported language or load failure); training
        # a model on an empty set would raise.
        return ""

    cleaned_input = clean_function_name(user_input, language)
    preprocessed_input = preprocess_text(cleaned_input)
    if not preprocessed_input:
        # A blank query is a substring of every name and would otherwise
        # return an arbitrary operation.
        return ""

    # First, try exact match
    if preprocessed_input in operations:
        return operations[preprocessed_input]

    # Next, try substring matching
    closest_match, _ = find_closest_operation_substring(preprocessed_input, operations)
    if closest_match:
        return operations[closest_match]

    # Then, try fuzzy matching
    closest_match, _ = find_closest_operation_fuzzy(preprocessed_input, operations)
    if closest_match:
        return operations[closest_match]

    # Finally, try ML model. Loading and training sit inside the try so a
    # corrupt model file or a failed training run degrades to "no match".
    try:
        knn_model, vectorizer = load_ml_model(language)
        if knn_model is None:
            knn_model, vectorizer = train_ml_model(operations, language)
        closest_match, _ = predict_operation_name_ml(preprocessed_input, knn_model, vectorizer)
    except Exception as e:
        print(f"Error during ML operation lookup: {str(e)}")
        return ""

    if not closest_match:
        return ""
    # A persisted model may predict a name that is no longer stored.
    return operations.get(closest_match, "")