| | from functools import lru_cache |
| | import json |
| | from enum import Enum |
| | import os |
| | import re |
| | from fuzzywuzzy import fuzz |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| | from sklearn.neighbors import KNeighborsClassifier |
| | import joblib |
| | from nltk.stem import WordNetLemmatizer |
| | import nltk |
| | from .db.db_setup import get_db, ExtractedFile |
| |
|
# Fetch the WordNet corpus that WordNetLemmatizer (used by preprocess_text)
# relies on. NOTE(review): this is a top-level statement, so it runs every
# time the module is imported.
nltk.download('wordnet')
| |
|
class MatchMethod(Enum):
    """Tags identifying which lookup strategy a matcher function used."""

    # Similarity-score matching (see find_closest_operation_fuzzy).
    FUZZY = "fuzzy"
    # Plain containment test (see find_closest_operation_substring).
    SUBSTRING = "substring"
    # KNN prediction (see predict_operation_name_ml).
    ML = "ml"
    # Not referenced anywhere in the visible part of this module.
    ML_ERROR = "ml_error"
| |
|
def get_absolute_path(relative_path):
    """
    Resolve a path relative to the directory that contains this module.

    Args:
        relative_path: Path fragment to append to the module's directory.
    Returns:
        The module directory joined with ``relative_path``.
    """
    module_dir, _ = os.path.split(os.path.abspath(__file__))
    return os.path.join(module_dir, relative_path)
| |
|
def get_model_path(language):
    """
    Build the location of the persisted KNN model for one language.

    Args:
        language: Language name interpolated into the model file name.
    Returns:
        Path of ``model/<language>_knn_model.h5`` resolved through
        get_absolute_path.
    """
    model_file = f'model/{language}_knn_model.h5'
    return get_absolute_path(model_file)
| |
|
def load_functions(language: str) -> dict:
    """
    Collect the stored functions for one programming language.

    The language name is lowercased and mapped to a file extension; every
    ExtractedFile row with that extension contributes its ``file_data`` to a
    single merged dictionary (later entries overwrite earlier ones that share
    a key).

    Args:
        language: The programming language to load functions for.
    Returns:
        Dictionary of function names and their code. An empty dictionary is
        returned when the language is unsupported or anything fails while
        loading; the error is printed rather than raised.
    """
    extensions = {
        "python": "py",
        "javascript": "js",
        "typescript": "ts",
        "php": "php",
        "java": "java"
    }

    try:
        language = language.lower()
        extension = extensions.get(language)
        if not extension:
            raise ValueError(f"Unsupported language: {language}")

        merged = {}
        with get_db() as db:
            for record in ExtractedFile.get_by_extension(db, f".{extension}"):
                payload = record.file_data
                if isinstance(payload, dict):
                    merged.update(payload)
                elif isinstance(payload, list):
                    # A list payload holds several dicts; non-dict items are skipped.
                    for entry in payload:
                        if isinstance(entry, dict):
                            merged.update(entry)
        return merged

    except Exception as err:
        print(f"Error loading functions from database: {str(err)}")
        return {}
| |
|
# Keyword patterns per language, compiled once instead of on every call.
_KEYWORD_PATTERNS = {
    "python": re.compile(r"\b(def|class|async|await|for|while|if|else|try|except|finally)\b"),
    "javascript": re.compile(r"\b(function|class|async|await|const|let|var|if|else|for|while)\b"),
    "typescript": re.compile(r"\b(function|class|async|await|const|let|var|interface|type|enum|if|else|for|while)\b"),
    "php": re.compile(r"\b(function|class|public|private|protected|if|else|foreach|while|try|catch|finally)\b"),
    "java": re.compile(r"\b(class|public|private|protected|static|final|if|else|for|while|try|catch|finally)\b"),
}


def clean_function_name(function_name, language):
    """
    Removes language-specific keywords and strips leading/trailing whitespace using regex.

    The language is matched case-insensitively, in line with load_functions,
    which lowercases it before use. Unknown languages only get the
    whitespace strip.

    Args:
        function_name: The original function name.
        language: The programming language of the function.
    Returns:
        The cleaned function name.
    """
    # Normalise the key so "Python" and "python" select the same pattern.
    key = language.lower() if isinstance(language, str) else language
    pattern = _KEYWORD_PATTERNS.get(key)
    if pattern is not None:
        function_name = pattern.sub("", function_name)
    return function_name.strip()
| |
|
@lru_cache(maxsize=1000)
def preprocess_text(text):
    """
    Normalise text for matching.

    The text is lowercased, split on whitespace, each token is passed through
    WordNetLemmatizer.lemmatize, and the tokens are rejoined with single
    spaces. Results are memoised for up to 1000 distinct inputs.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return " ".join(lemmatize(token) for token in text.lower().split())
| |
|
def find_closest_operation_fuzzy(user_input, operations):
    """
    Pick the operation name most similar to the input by fuzz.ratio.

    Args:
        user_input: Text to compare against every operation name.
        operations: Iterable of operation names (iterating a dict yields its keys).
    Returns:
        ``(name, MatchMethod.FUZZY)`` for the highest-scoring name when its
        score is at least 70, otherwise ``(None, MatchMethod.FUZZY)``. The
        first name wins when scores tie.
    """
    scored = ((fuzz.ratio(user_input, name), name) for name in operations)
    # max() keeps the first of several equal maxima, matching a strict ">" scan.
    top_score, top_name = max(scored, key=lambda pair: pair[0], default=(0, None))
    if top_score < 70:
        return None, MatchMethod.FUZZY
    return top_name, MatchMethod.FUZZY
| |
|
def find_closest_operation_substring(user_input, operations):
    """
    Finds the first operation name that contains the input as a substring.

    Args:
        user_input: Text to look for inside each operation name.
        operations: Iterable of operation names (iterating a dict yields its keys).
    Returns:
        ``(name, MatchMethod.SUBSTRING)`` for the first containing name, or
        ``(None, MatchMethod.SUBSTRING)`` when nothing matches or the input
        is empty.
    """
    # An empty string is "in" every name, which would turn a blank query
    # into a bogus hit on the first operation.
    if not user_input:
        return None, MatchMethod.SUBSTRING
    match = next((name for name in operations if user_input in name), None)
    return match, MatchMethod.SUBSTRING
| |
|
def train_ml_model(operations, language):
    """
    Trains a KNN model on operation names and persists it with joblib.

    Args:
        operations: Mapping whose keys are the operation names to train on.
        language: Language name used to choose the model file path.
    Returns:
        Tuple ``(knn_model, vectorizer)`` of the fitted classifier and the
        fitted TF-IDF vectorizer.
    Raises:
        ValueError: If ``operations`` is empty.
    """
    operation_names = list(operations)
    if not operation_names:
        # Fail with a clear message instead of an opaque vectorizer error.
        raise ValueError("Cannot train a KNN model without any operation names")

    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(operation_names)

    # Neighbour queries fail when n_neighbors exceeds the number of fitted
    # samples, so cap it for very small operation sets.
    # NOTE(review): every label here is unique, so with 3 equally weighted
    # neighbours no class probability can exceed 1/3, which is below the 0.5
    # cut-off in predict_operation_name_ml -- confirm this is intended.
    knn_model = KNeighborsClassifier(n_neighbors=min(3, len(operation_names)))
    knn_model.fit(features, operation_names)

    model_path = get_model_path(language)
    # Make sure the target directory exists before writing the model file.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump((knn_model, vectorizer), model_path)
    return knn_model, vectorizer
| |
|
def load_ml_model(language):
    """
    Load the persisted KNN model and vectorizer for a language.

    Args:
        language: Language name used to locate the model file.
    Returns:
        Whatever joblib.load yields for the model file (train_ml_model stores
        a ``(knn_model, vectorizer)`` tuple there), or ``(None, None)`` when
        the file does not exist.
    """
    model_path = get_model_path(language)
    if not os.path.exists(model_path):
        return None, None
    return joblib.load(model_path)
| |
|
def predict_operation_name_ml(user_input, knn_model, vectorizer):
    """
    Predict an operation name for the input with the trained KNN model.

    Args:
        user_input: Text to classify.
        knn_model: Fitted classifier exposing predict_proba and predict.
        vectorizer: Fitted vectorizer used to transform the input.
    Returns:
        ``(name, MatchMethod.ML)`` with the predicted name, or
        ``("", MatchMethod.ML)`` when the highest class probability is
        below 0.5.
    """
    features = vectorizer.transform([user_input])
    class_probabilities = knn_model.predict_proba(features)[0]
    confidence = max(class_probabilities)
    # If the maximum probability is less than 0.5, return an empty string.
    if confidence < 0.5:
        return "", MatchMethod.ML
    predicted = knn_model.predict(features)
    return predicted[0], MatchMethod.ML
| |
|
def get_operation_definition(user_input, language):
    """
    Look up the stored definition that best matches a user-supplied name.

    The input is cleaned and preprocessed, then matched against the loaded
    operation names by substring, then by fuzzy score, and finally by the
    KNN model (trained on demand when no saved model exists).

    Args:
        user_input: The function/operation name typed by the user.
        language: The programming language whose functions are searched.
    Returns:
        The stored value for the matched operation, or "" when there are no
        operations, the cleaned input is empty, nothing matches, or the ML
        step fails.
    """
    operations = load_functions(language)
    if not operations:
        # Nothing to match against; training on an empty set would raise.
        return ""

    query = preprocess_text(clean_function_name(user_input, language))
    if not query:
        # A blank query would otherwise "substring-match" the first operation.
        return ""

    # Cheap matchers first, in order of strictness.
    for finder in (find_closest_operation_substring, find_closest_operation_fuzzy):
        closest_match, _ = finder(query, operations)
        if closest_match:
            return operations[closest_match]

    try:
        knn_model, vectorizer = load_ml_model(language)
        if not knn_model:
            knn_model, vectorizer = train_ml_model(operations, language)
        closest_match, _ = predict_operation_name_ml(query, knn_model, vectorizer)
        if closest_match:
            # A stale saved model may predict a name that is no longer
            # present; the resulting KeyError is handled below.
            return operations[closest_match]
    except Exception as e:
        # Best-effort fallback: report the failure and treat it as "no match".
        print(f"Error predicting operation with ML model: {str(e)}")

    return ""
| |
|