"""Character n-gram language detector.

Train a classifier from per-language ``.txt`` files (``--train``) or classify
a string with a previously trained model (``--detect``).
"""

import argparse
import os

import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Artifacts are written to / read from these paths, relative to the current
# working directory.
MODEL_FILE = 'lang_classifier.pkl'
VECTORIZER_FILE = 'lang_vectorizer.pkl'


def train(dataset_directory="datasets"):
    """Train the language classifier and save it to disk.

    Every ``*.txt`` file directly inside *dataset_directory* is treated as one
    language: the file name without its ``.txt`` suffix is the label, and each
    non-blank line (after stripping whitespace) is one training sample.

    The fitted classifier is dumped to ``MODEL_FILE`` and the fitted
    vectorizer to ``VECTORIZER_FILE``.

    Args:
        dataset_directory: Directory to scan for ``.txt`` training files.

    Returns:
        None. If the directory does not exist, an error is printed and the
        function returns without training.

    Raises:
        FileNotFoundError: If the directory exists but contains no ``.txt``
            files.
    """
    # 1. Automatically scan the directory for .txt files
    try:
        # Sorted so the sample order (and therefore the trained model) does
        # not depend on the arbitrary order os.listdir returns.
        files = sorted(
            f for f in os.listdir(dataset_directory) if f.endswith('.txt')
        )
    except FileNotFoundError:
        print(f"Error: The directory '{dataset_directory}' does not exist.")
        return

    if not files:
        raise FileNotFoundError("Error: No language dataset found")

    text_samples = []
    language_labels = []

    for filename in files:
        file_path = os.path.join(dataset_directory, filename)

        # Determine language name from the filename. splitext drops only the
        # trailing ".txt", unlike str.replace which would also remove any
        # ".txt" occurring earlier in the name.
        language_name = os.path.splitext(filename)[0]
        print(f"Processing: {language_name}...")

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                clean_text = line.strip()
                if clean_text:
                    text_samples.append(clean_text)
                    language_labels.append(language_name)

    # 2. Text to Numeric Vector (character-level 1- to 3-gram counts)
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 3))
    feature_matrix = vectorizer.fit_transform(text_samples)

    # 3. Multiclass Classifier
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(feature_matrix, language_labels)

    # 4. Save the artifacts
    joblib.dump(classifier, MODEL_FILE)
    joblib.dump(vectorizer, VECTORIZER_FILE)

    print(f"Training complete! \nLanguages detected: {classifier.classes_}")


def predict(text: str):
    """Print the predicted language of *text* and the model's confidence.

    Loads the classifier from ``MODEL_FILE`` and the vectorizer from
    ``VECTORIZER_FILE``. If either file is missing, an error asking the user
    to train first is printed instead.

    Args:
        text: The string to classify.

    Returns:
        None. The result is printed to standard output.
    """
    # Only the loads are expected to raise FileNotFoundError, so keep the try
    # body limited to them.
    # NOTE: joblib.load unpickles the files; only load model files you trust.
    try:
        classifier = joblib.load(MODEL_FILE)
        vectorizer = joblib.load(VECTORIZER_FILE)
    except FileNotFoundError:
        print("Error: Model files not found. Please train the model first using --train")
        return

    X_new = vectorizer.transform([text])
    prediction = classifier.predict(X_new)

    # Get probabilities to see how confident the model is
    probs = classifier.predict_proba(X_new)
    confidence = max(probs[0]) * 100

    print(f"Language: {prediction[0]} ({confidence:.2f}% confidence)")


def main():
    """Parse command-line arguments and run training or detection."""
    parser = argparse.ArgumentParser(description="Language Detection Tool")

    # Add arguments
    parser.add_argument("--train", action="store_true",
                        help="Train the model using the datasets directory")
    parser.add_argument("--detect", type=str,
                        help="Detect the language of the provided string")
    parser.add_argument("--dir", type=str, default="datasets",
                        help="Directory for training data (default: datasets)")

    args = parser.parse_args()

    # --train takes precedence when both flags are given; with neither (or an
    # empty --detect string) the usage help is shown.
    if args.train:
        train(args.dir)
    elif args.detect:
        predict(args.detect)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()