File size: 3,062 Bytes

ca5bd57

import os
import argparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

MODEL_FILE = 'lang_classifier.pkl'
VECTORIZER_FILE = 'lang_vectorizer.pkl'

def train(dataset_directory="datasets"):
    """Train a character n-gram language classifier and save it to disk.

    Every ``*.txt`` file in ``dataset_directory`` is treated as one language:
    the file name without its ``.txt`` extension is the label, and each
    non-blank line of the file is one training sample. The fitted classifier
    and vectorizer are written to ``MODEL_FILE`` and ``VECTORIZER_FILE``
    with joblib.

    Args:
        dataset_directory: Directory holding one ``.txt`` file per language.

    Returns:
        None. If the directory does not exist, an error message is printed
        and the function returns without training.

    Raises:
        FileNotFoundError: If the directory contains no ``.txt`` files.
        ValueError: If the ``.txt`` files contain no non-blank lines.
    """
    extension = ".txt"

    # 1. Automatically scan the directory for .txt files
    try:
        # Sorted so the sample order does not depend on the arbitrary
        # order in which the filesystem lists the directory.
        files = sorted(
            f for f in os.listdir(dataset_directory) if f.endswith(extension)
        )
    except FileNotFoundError:
        print(f"Error: The directory '{dataset_directory}' does not exist.")
        return

    if not files:
        raise FileNotFoundError("Error: No language dataset found")

    text_samples = []
    language_labels = []

    for filename in files:
        file_path = os.path.join(dataset_directory, filename)

        # Determine language name from the filename. Only the trailing
        # extension is stripped; str.replace would also remove any ".txt"
        # that appears in the middle of the name.
        language_name = filename[:-len(extension)]

        print(f"Processing: {language_name}...")

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                clean_text = line.strip()
                if clean_text:
                    text_samples.append(clean_text)
                    language_labels.append(language_name)

    # Fail with a clear message instead of handing an empty corpus to
    # the vectorizer.
    if not text_samples:
        raise ValueError("Error: No training samples found in the dataset files")

    # 2. Text to Numeric Vector (Character-level analysis)
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 3))
    feature_matrix = vectorizer.fit_transform(text_samples)

    # 3. Multiclass Classifier
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(feature_matrix, language_labels)

    # 4. Save the artifacts
    joblib.dump(classifier, MODEL_FILE)
    joblib.dump(vectorizer, VECTORIZER_FILE)

    print(f"Training complete! Languages detected: {classifier.classes_}")

def predict(text: str):
    """Print the predicted language of ``text`` with a confidence percentage.

    The classifier and vectorizer are loaded with joblib from ``MODEL_FILE``
    and ``VECTORIZER_FILE``. If a ``FileNotFoundError`` is raised while doing
    so, a hint to train the model first is printed instead.

    Args:
        text: The string whose language should be detected.
    """
    try:
        model = joblib.load(MODEL_FILE)
        fitted_vectorizer = joblib.load(VECTORIZER_FILE)

        # The vectorizer expects a collection of documents, hence the list.
        features = fitted_vectorizer.transform([text])
        predicted_labels = model.predict(features)

        # The highest class probability serves as the confidence score.
        class_probabilities = model.predict_proba(features)
        top_probability = max(class_probabilities[0])
        confidence = top_probability * 100

        print(f"Language: {predicted_labels[0]} ({confidence:.2f}% confidence)")
    except FileNotFoundError:
        print("Error: Model files not found. Please train the model first using --train")

if __name__ == "__main__":
    # Command-line entry point. --train takes precedence over --detect;
    # when neither is given (or --detect is empty) the usage text is shown.
    cli = argparse.ArgumentParser(description="Language Detection Tool")

    cli.add_argument(
        "--train",
        action="store_true",
        help="Train the model using the datasets directory",
    )
    cli.add_argument(
        "--detect",
        type=str,
        help="Detect the language of the provided string",
    )
    cli.add_argument(
        "--dir",
        type=str,
        default="datasets",
        help="Directory for training data (default: datasets)",
    )

    options = cli.parse_args()

    if not options.train and not options.detect:
        cli.print_help()
    elif options.train:
        train(options.dir)
    else:
        predict(options.detect)