feat: create main.py with training and using commands

ca5bd57 15 days ago

3.06 kB

	import os
	import argparse
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.linear_model import LogisticRegression
	import joblib

	# Paths of the persisted training artifacts, written and read with joblib.
	# Both are relative, so they resolve against the current working directory.
	MODEL_FILE = 'lang_classifier.pkl'  # fitted LogisticRegression classifier
	VECTORIZER_FILE = 'lang_vectorizer.pkl'  # fitted character n-gram CountVectorizer

	def train(dataset_directory="datasets"):
	    """Train the language classifier from per-language ``.txt`` files.

	    Each regular file named ``<language>.txt`` directly inside
	    *dataset_directory* contributes one training sample per non-blank line,
	    labelled with the file name minus its ``.txt`` suffix. The fitted
	    classifier and vectorizer are saved to ``MODEL_FILE`` and
	    ``VECTORIZER_FILE`` with joblib.

	    Args:
	        dataset_directory: Directory scanned (non-recursively) for ``.txt``
	            files.

	    Returns:
	        None. When the directory does not exist, an error message is printed
	        and nothing is trained.

	    Raises:
	        FileNotFoundError: If the directory contains no ``.txt`` files.
	    """
	    suffix = ".txt"

	    # 1. Scan the directory for .txt files.
	    try:
	        entries = os.listdir(dataset_directory)
	    except FileNotFoundError:
	        print(f"Error: The directory '{dataset_directory}' does not exist.")
	        return

	    # Sorted because os.listdir() order is arbitrary; regular files only, so
	    # a sub-directory that happens to end in ".txt" is not passed to open().
	    files = sorted(
	        name
	        for name in entries
	        if name.endswith(suffix)
	        and os.path.isfile(os.path.join(dataset_directory, name))
	    )

	    if not files:
	        raise FileNotFoundError("Error: No language dataset found")

	    text_samples = []
	    language_labels = []

	    for filename in files:
	        file_path = os.path.join(dataset_directory, filename)

	        # Strip only the trailing extension; str.replace() would also remove
	        # any ".txt" occurring in the middle of the name.
	        language_name = filename[:-len(suffix)]

	        print(f"Processing: {language_name}...")

	        with open(file_path, "r", encoding="utf-8") as f:
	            for line in f:
	                clean_text = line.strip()
	                if clean_text:  # skip blank lines
	                    text_samples.append(clean_text)
	                    language_labels.append(language_name)

	    # 2. Text to numeric vector: counts of character 1- to 3-grams.
	    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 3))
	    feature_matrix = vectorizer.fit_transform(text_samples)

	    # 3. Multiclass classifier.
	    classifier = LogisticRegression(max_iter=1000)
	    classifier.fit(feature_matrix, language_labels)

	    # 4. Save the artifacts.
	    joblib.dump(classifier, MODEL_FILE)
	    joblib.dump(vectorizer, VECTORIZER_FILE)

	    print(f"Training complete! Languages detected: {classifier.classes_}")

	def predict(text: str):
	    """Print the predicted language of *text* with the model's confidence.

	    Loads the classifier from ``MODEL_FILE`` and the vectorizer from
	    ``VECTORIZER_FILE``, vectorizes *text* as a single sample, and prints
	    the predicted label together with the highest class probability as a
	    percentage. If loading raises ``FileNotFoundError``, a hint to train
	    first is printed instead.

	    Args:
	        text: The string whose language should be detected.
	    """
	    try:
	        # NOTE: joblib.load unpickles its input; only load trusted files.
	        model = joblib.load(MODEL_FILE)
	        featurizer = joblib.load(VECTORIZER_FILE)

	        features = featurizer.transform([text])
	        label = model.predict(features)[0]

	        # The largest class probability shows how confident the model is.
	        class_probabilities = model.predict_proba(features)[0]
	        confidence = max(class_probabilities) * 100

	        print(f"Language: {label} ({confidence:.2f}% confidence)")
	    except FileNotFoundError:
	        print("Error: Model files not found. Please train the model first using --train")

	if __name__ == "__main__":
	    # Command-line entry point. --train takes precedence over --detect;
	    # when neither is given (or --detect is empty) the usage text is shown.
	    cli = argparse.ArgumentParser(description="Language Detection Tool")

	    cli.add_argument(
	        "--train",
	        action="store_true",
	        help="Train the model using the datasets directory",
	    )
	    cli.add_argument(
	        "--detect",
	        type=str,
	        help="Detect the language of the provided string",
	    )
	    cli.add_argument(
	        "--dir",
	        type=str,
	        default="datasets",
	        help="Directory for training data (default: datasets)",
	    )

	    options = cli.parse_args()

	    if not (options.train or options.detect):
	        cli.print_help()
	    elif options.train:
	        train(options.dir)
	    else:
	        predict(options.detect)