Spaces:

GaneshNaiknavare
/

Hello

Sleeping

File size: 6,553 Bytes

from functools import lru_cache
import json
from enum import Enum
import os
import re
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
import joblib
from nltk.stem import WordNetLemmatizer
import nltk
from model.db.db_setup import get_db, ExtractedFile

# Runs at import time: fetch the WordNet corpus through NLTK's downloader.
# preprocess_text in this module lemmatizes with WordNetLemmatizer, which
# relies on that corpus being available.
nltk.download('wordnet')

class MatchMethod(Enum):
    """Tag identifying which lookup strategy produced a match result.

    The matcher functions in this module return one of these members as the
    second element of their ``(match, method)`` result tuple.
    """
    # Returned by find_closest_operation_fuzzy.
    FUZZY = "fuzzy"
    # Returned by find_closest_operation_substring.
    SUBSTRING = "substring"
    # Returned by predict_operation_name_ml.
    ML = "ml"
    # Not referenced anywhere in the visible code; presumably marks a failed
    # ML lookup -- TODO confirm against callers.
    ML_ERROR = "ml_error"

def get_absolute_path(relative_path):
    """Resolve ``relative_path`` against the directory containing this module.

    Args:
        relative_path: Path fragment to append to this module's directory.
    Returns:
        The joined path as a string.
    """
    this_file = os.path.abspath(__file__)
    module_dir = os.path.dirname(this_file)
    return os.path.join(module_dir, relative_path)

def get_model_path(language):
    """Build the location of the persisted KNN model for ``language``.

    The file lives under ``model/`` next to this module and is named
    ``<language>_knn_model.h5``.
    """
    relative_model_file = f'model/{language}_knn_model.h5'
    return get_absolute_path(relative_model_file)

def load_functions(language: str) -> dict:
    """
    Collect every stored function for a language from the database.

    The language name is lower-cased and mapped to a file extension; all
    ExtractedFile rows with that extension are fetched and their
    ``file_data`` payloads (a dict, or a list of dicts) are merged into one
    dictionary. Later entries overwrite earlier ones on key collisions.

    Args:
        language: The programming language to load functions for.
    Returns:
        Dictionary of function names and their code. An empty dict is
        returned (and the error printed) when the language is unsupported
        or anything fails while reading the database.
    """
    # Supported language name -> file extension (without the leading dot).
    LANGUAGE_MAPPING = {
        "python": "py",
        "javascript": "js",
        "typescript": "ts",
        "php": "php",
        "java": "java"
    }

    try:
        language = language.lower()
        extension = LANGUAGE_MAPPING.get(language)
        if not extension:
            raise ValueError(f"Unsupported language: {language}")

        # The session is scoped to the with-block.
        with get_db() as session:
            records = ExtractedFile.get_by_extension(session, f".{extension}")

            merged = {}
            for record in records:
                payload = record.file_data
                if isinstance(payload, dict):
                    merged.update(payload)
                elif isinstance(payload, list):
                    # Lists may mix dicts with other items; only dicts are merged.
                    for entry in payload:
                        if isinstance(entry, dict):
                            merged.update(entry)

            return merged

    except Exception as err:
        # Best-effort loader: report the problem and fall back to "no functions".
        print(f"Error loading functions from database: {str(err)}")
        return {}

# Per-language keyword patterns, compiled once instead of rebuilt per call.
_LANGUAGE_KEYWORD_PATTERNS = {
    "python": re.compile(r"\b(def|class|async|await|for|while|if|else|try|except|finally)\b"),
    "javascript": re.compile(r"\b(function|class|async|await|const|let|var|if|else|for|while)\b"),
    "typescript": re.compile(r"\b(function|class|async|await|const|let|var|interface|type|enum|if|else|for|while)\b"),
    "php": re.compile(r"\b(function|class|public|private|protected|if|else|foreach|while|try|catch|finally)\b"),
    "java": re.compile(r"\b(class|public|private|protected|static|final|if|else|for|while|try|catch|finally)\b"),
}


def clean_function_name(function_name, language):
    """
    Removes language-specific keywords and strips leading/trailing whitespace using regex.

    The language lookup is case-insensitive ("Python" and "python" behave the
    same), matching how load_functions normalizes its language argument.
    Unknown or non-string languages leave the keywords in place and only
    strip surrounding whitespace.

    Args:
        function_name: The original function name.
        language: The programming language of the function.
    Returns:
        The cleaned function name.
    """
    # Only strings can be case-normalized; anything else (e.g. None) simply
    # has no keyword table.
    language_key = language.lower() if isinstance(language, str) else None
    pattern = _LANGUAGE_KEYWORD_PATTERNS.get(language_key)
    if pattern is not None:
        # Remove all language-specific keywords (whole words only).
        function_name = pattern.sub("", function_name)
    return function_name.strip()

@lru_cache(maxsize=1000)
def preprocess_text(text):
    """Lower-case ``text``, lemmatize each whitespace-separated word, and re-join.

    Results are memoized (up to 1000 distinct inputs) via ``lru_cache``.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    tokens = text.lower().split()
    return " ".join(lemmatize(token) for token in tokens)

def find_closest_operation_fuzzy(user_input, operations):
    """Pick the operation name most similar to ``user_input`` by fuzz.ratio.

    Returns:
        A ``(name, MatchMethod.FUZZY)`` tuple. ``name`` is the highest-scoring
        operation (first one wins on ties) or ``None`` when the best score is
        below 70.
    """
    minimum_score = 70
    winner, top_score = None, 0
    for candidate in operations:
        similarity = fuzz.ratio(user_input, candidate)
        if similarity > top_score:
            winner, top_score = candidate, similarity

    if top_score < minimum_score:
        winner = None
    return winner, MatchMethod.FUZZY

def find_closest_operation_substring(user_input, operations):
    """Finds the first operation name that contains ``user_input`` as a substring.

    Args:
        user_input: The (already preprocessed) query text.
        operations: Iterable of operation names (e.g. a dict keyed by name).
    Returns:
        A ``(name, MatchMethod.SUBSTRING)`` tuple; ``name`` is ``None`` when
        nothing matches or when the query is empty.
    """
    # The empty string is a substring of every string, so without this guard
    # an empty query would "match" whichever operation is iterated first.
    if not user_input:
        return None, MatchMethod.SUBSTRING

    first_hit = next(
        (operation_name for operation_name in operations if user_input in operation_name),
        None,
    )
    return first_hit, MatchMethod.SUBSTRING

def train_ml_model(operations, language):
    """Fit a TF-IDF + 3-nearest-neighbour classifier on the operation names.

    Each operation name is both the training sample and its own label. The
    fitted ``(model, vectorizer)`` pair is written with joblib to the path
    given by get_model_path(language) and also returned.
    """
    names = list(operations)
    tfidf = TfidfVectorizer()
    classifier = KNeighborsClassifier(n_neighbors=3)
    classifier.fit(tfidf.fit_transform(names), names)

    # Persist both pieces together so load_ml_model can restore them as a pair.
    joblib.dump((classifier, tfidf), get_model_path(language))
    return classifier, tfidf

def load_ml_model(language):
    """Restore the persisted model for ``language`` if one exists on disk.

    Returns:
        Whatever joblib stored at get_model_path(language) (train_ml_model
        writes a ``(knn_model, vectorizer)`` tuple), or ``(None, None)`` when
        the file is missing.
    """
    model_path = get_model_path(language)
    if not os.path.exists(model_path):
        return None, None
    # NOTE: joblib.load unpickles the file, so it must only ever be pointed
    # at model files this application wrote itself.
    return joblib.load(model_path)

def predict_operation_name_ml(user_input, knn_model, vectorizer):
    """Ask the KNN model which operation name ``user_input`` most resembles.

    Returns:
        A ``(name, MatchMethod.ML)`` tuple. ``name`` is the empty string when
        the highest class probability is below 0.5.
    """
    features = vectorizer.transform([user_input])
    class_probabilities = knn_model.predict_proba(features)[0]
    # NOTE(review): train_ml_model fits one sample per distinct name with
    # n_neighbors=3, so the top probability may never reach 0.5 -- confirm
    # this threshold is actually reachable.
    predicted = "" if max(class_probabilities) < 0.5 else knn_model.predict(features)[0]
    return predicted, MatchMethod.ML

def get_operation_definition(user_input, language):
    """Look up the stored code for the operation best matching ``user_input``.

    The query is cleaned of language keywords and lemmatized, then matched
    against the known operation names in three stages, stopping at the first
    hit: substring match, fuzzy match, and finally the KNN model (loaded from
    disk, or trained on the spot when no saved model exists).

    Args:
        user_input: Free-text name or description of the wanted function.
        language: The programming language whose functions are searched.
    Returns:
        The code stored for the matched operation, or ``""`` when the query
        is empty, no operations are available, nothing matches, or the ML
        stage fails.
    """
    cleaned_input = clean_function_name(user_input, language)
    preprocessed_input = preprocess_text(cleaned_input)
    # An empty query would be a substring of every operation name and so
    # return an arbitrary function; treat it as "no match" instead.
    if not preprocessed_input:
        return ""

    operations = load_functions(language)
    # load_functions returns {} on any failure; with nothing to match (and
    # nothing to train a model on) there is no point going further.
    if not operations:
        return ""

    # First, try substring match
    closest_match, _ = find_closest_operation_substring(preprocessed_input, operations)
    if closest_match:
        return operations[closest_match]

    # Next, try fuzzy matching
    closest_match, _ = find_closest_operation_fuzzy(preprocessed_input, operations)
    if closest_match:
        return operations[closest_match]

    # Finally, try ML model. Loading/training is kept inside the try so that a
    # corrupt model file or a training failure degrades to "no match" just
    # like a prediction failure does.
    try:
        knn_model, vectorizer = load_ml_model(language)
        if knn_model is None or vectorizer is None:
            knn_model, vectorizer = train_ml_model(operations, language)

        closest_match, _ = predict_operation_name_ml(preprocessed_input, knn_model, vectorizer)
        if closest_match:
            # A stale saved model may predict a name that no longer exists;
            # the resulting KeyError is handled below as "no match".
            return operations[closest_match]
    except Exception:
        # Deliberate best-effort: the ML stage is a last resort.
        return ""

    return ""