File size: 3,062 Bytes
ca5bd57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | import os
import argparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
# Artifact paths shared by train() (which writes them) and predict() (which
# reads them). They are relative, so they resolve against the current
# working directory.
MODEL_FILE = "lang_classifier.pkl"
VECTORIZER_FILE = "lang_vectorizer.pkl"
def train(dataset_directory="datasets"):
    """Train the language classifier from per-language text files and save it.

    Each regular file in ``dataset_directory`` whose name ends in ``.txt`` is
    treated as one language: the file name without that extension is the
    label, and every non-blank line (after stripping whitespace) is one
    training sample. Samples are vectorized as character 1- to 3-gram counts,
    a logistic-regression classifier is fitted on them, and both objects are
    written with joblib to ``MODEL_FILE`` and ``VECTORIZER_FILE``.

    Args:
        dataset_directory: Directory holding the ``<language>.txt`` files.

    Returns:
        None. If the directory does not exist, an error message is printed
        and the function returns without training.

    Raises:
        FileNotFoundError: If the directory exists but contains no ``.txt``
            files.
        ValueError: If the ``.txt`` files contain no non-blank lines.
    """
    suffix = ".txt"

    # 1. Automatically scan the directory for .txt files
    try:
        entries = os.listdir(dataset_directory)
    except FileNotFoundError:
        print(f"Error: The directory '{dataset_directory}' does not exist.")
        return

    # Keep regular files only (a sub-directory named "x.txt" cannot be opened
    # as text) and sort so processing order does not depend on the file
    # system's listing order.
    files = sorted(
        name
        for name in entries
        if name.endswith(suffix)
        and os.path.isfile(os.path.join(dataset_directory, name))
    )
    if not files:
        raise FileNotFoundError("Error: No language dataset found")

    text_samples = []
    language_labels = []
    for filename in files:
        file_path = os.path.join(dataset_directory, filename)
        # Strip only the trailing extension; str.replace would also remove
        # any ".txt" that appears in the middle of the name.
        language_name = filename[: -len(suffix)]
        print(f"Processing: {language_name}...")
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                clean_text = line.strip()
                if clean_text:
                    text_samples.append(clean_text)
                    language_labels.append(language_name)

    if not text_samples:
        # Fail with a clear message instead of the vectorizer's
        # "empty vocabulary" error further down.
        raise ValueError("Error: The dataset files contain no text samples")

    # 2. Text to Numeric Vector (Character-level analysis)
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 3))
    feature_matrix = vectorizer.fit_transform(text_samples)

    # 3. Multiclass Classifier
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(feature_matrix, language_labels)

    # 4. Save the artifacts
    joblib.dump(classifier, MODEL_FILE)
    joblib.dump(vectorizer, VECTORIZER_FILE)
    print(f"Training complete! Languages detected: {classifier.classes_}")
def predict(text: str):
    """Print the predicted language of ``text`` and the model's confidence.

    Loads the classifier from ``MODEL_FILE`` and the vectorizer from
    ``VECTORIZER_FILE``, vectorizes ``text`` and prints the predicted label
    together with the highest class probability as a percentage. If a
    ``FileNotFoundError`` is raised along the way, a hint to train the model
    first is printed instead.

    Args:
        text: The string whose language should be detected.

    Returns:
        None. The result is printed, not returned.
    """
    try:
        model = joblib.load(MODEL_FILE)
        featurizer = joblib.load(VECTORIZER_FILE)

        # The vectorizer expects a collection of documents, hence the list.
        features = featurizer.transform([text])
        prediction = model.predict(features)

        # Highest class probability, shown as the model's confidence.
        class_probabilities = model.predict_proba(features)[0]
        confidence = max(class_probabilities) * 100

        print(f"Language: {prediction[0]} ({confidence:.2f}% confidence)")
    except FileNotFoundError:
        print("Error: Model files not found. Please train the model first using --train")
if __name__ == "__main__":
    # Command-line entry point: --train fits and saves the model, --detect
    # classifies one string, and with neither option the usage help is shown.
    cli = argparse.ArgumentParser(description="Language Detection Tool")
    cli.add_argument(
        "--train",
        action="store_true",
        help="Train the model using the datasets directory",
    )
    cli.add_argument(
        "--detect",
        type=str,
        help="Detect the language of the provided string",
    )
    cli.add_argument(
        "--dir",
        type=str,
        default="datasets",
        help="Directory for training data (default: datasets)",
    )
    options = cli.parse_args()

    # --train takes precedence when both --train and --detect are given.
    if options.train:
        train(options.dir)
    elif not options.detect:
        cli.print_help()
    else:
        predict(options.detect)
|