| import os |
| import argparse |
| from sklearn.feature_extraction.text import CountVectorizer |
| from sklearn.linear_model import LogisticRegression |
| import joblib |
|
|
# Artifacts written by train() and read back by predict(). Both are relative
# paths, so they are resolved against the current working directory.
MODEL_FILE = "lang_classifier.pkl"  # fitted classifier
VECTORIZER_FILE = "lang_vectorizer.pkl"  # fitted character n-gram vectorizer
|
|
def train(dataset_directory="datasets"):
    """Train the language classifier and save it to disk.

    Every ``*.txt`` file in ``dataset_directory`` is treated as one language:
    the file name without its trailing ``.txt`` is the label, and each
    non-blank line (after stripping surrounding whitespace) is one training
    sample. Samples are vectorized as character 1- to 3-gram counts and
    fitted with logistic regression; the classifier and the vectorizer are
    then written to ``MODEL_FILE`` and ``VECTORIZER_FILE`` with
    ``joblib.dump``.

    Args:
        dataset_directory: Directory holding the ``<language>.txt`` files.

    Returns:
        None. A missing directory is reported on stdout and the function
        returns without training.

    Raises:
        FileNotFoundError: The directory exists but contains no ``.txt``
            files.
        ValueError: The ``.txt`` files contain no non-blank lines.
    """
    suffix = ".txt"

    try:
        # os.listdir() returns names in arbitrary order; sort so the files
        # are processed in the same order on every run.
        files = sorted(
            name for name in os.listdir(dataset_directory)
            if name.endswith(suffix)
        )
    except FileNotFoundError:
        print(f"Error: The directory '{dataset_directory}' does not exist.")
        return

    if not files:
        raise FileNotFoundError("Error: No language dataset found")

    text_samples = []
    language_labels = []

    for filename in files:
        # Drop only the trailing suffix; str.replace() would also remove a
        # ".txt" that happens to occur earlier in the file name.
        language_name = filename[:-len(suffix)]
        print(f"Processing: {language_name}...")

        file_path = os.path.join(dataset_directory, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                clean_text = line.strip()
                if clean_text:
                    text_samples.append(clean_text)
                    language_labels.append(language_name)

    if not text_samples:
        # Nothing to fit on; report it in terms of the dataset instead of
        # letting the vectorizer fail on an empty corpus.
        raise ValueError(
            f"Error: No training samples found in '{dataset_directory}'"
        )

    # Character n-grams (1 to 3 characters) are the features.
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 3))
    feature_matrix = vectorizer.fit_transform(text_samples)

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(feature_matrix, language_labels)

    # Persist both objects: predict() needs the fitted vectorizer to turn
    # new text into the same feature space the classifier was trained on.
    joblib.dump(classifier, MODEL_FILE)
    joblib.dump(vectorizer, VECTORIZER_FILE)

    print(f"Training complete! Languages detected: {classifier.classes_}")
|
|
def predict(text: str):
    """Print the predicted language of ``text`` and its confidence.

    The classifier and vectorizer are loaded from ``MODEL_FILE`` and
    ``VECTORIZER_FILE`` with ``joblib.load``. ``text`` is vectorized as a
    single document, and the predicted label is printed together with the
    highest class probability expressed as a percentage.

    If loading raises ``FileNotFoundError``, a hint to train the model first
    is printed instead.

    Args:
        text: The string whose language should be detected.
    """
    try:
        model = joblib.load(MODEL_FILE)
        char_vectorizer = joblib.load(VECTORIZER_FILE)

        # The vectorizer works on a collection of documents, so wrap the
        # single input string in a list.
        features = char_vectorizer.transform([text])
        predicted_labels = model.predict(features)

        # Confidence is the largest probability in the single result row.
        class_probabilities = model.predict_proba(features)[0]
        confidence = max(class_probabilities) * 100

        print(f"Language: {predicted_labels[0]} ({confidence:.2f}% confidence)")
    except FileNotFoundError:
        print("Error: Model files not found. Please train the model first using --train")
|
|
def _run_cli():
    """Parse the command line and train, detect, or show the help text."""
    cli = argparse.ArgumentParser(description="Language Detection Tool")
    cli.add_argument(
        "--train",
        action="store_true",
        help="Train the model using the datasets directory",
    )
    cli.add_argument(
        "--detect",
        type=str,
        help="Detect the language of the provided string",
    )
    cli.add_argument(
        "--dir",
        type=str,
        default="datasets",
        help="Directory for training data (default: datasets)",
    )
    options = cli.parse_args()

    # --train is checked first, so when both flags are given only training
    # runs.
    if options.train:
        train(options.dir)
        return

    if options.detect:
        predict(options.detect)
        return

    # Neither action was requested (or --detect was given an empty string).
    cli.print_help()


if __name__ == "__main__":
    _run_cli()
|
|