Isa0
/

language-detection

Model card Files Files and versions

xet

Community

Isa0 committed 15 days ago

Commit

ca5bd57

1 Parent(s): 455cb0d

feat: create main.py with training and using commands

Browse files

Files changed (1) hide show

main.py +86 -0

main.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import os
+import argparse
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LogisticRegression
+import joblib
+MODEL_FILE = 'lang_classifier.pkl'  # fitted classifier: dumped by train(), loaded by predict()
+VECTORIZER_FILE = 'lang_vectorizer.pkl'  # fitted vectorizer: dumped by train(), loaded by predict()
+def train(dataset_directory="datasets"):
+    text_samples = []
+    language_labels = []
+    # 1. Automatically scan the directory for .txt files
+    try:
+        # Get all files ending in .txt
+        files = [f for f in os.listdir(dataset_directory) if f.endswith('.txt')]
+    except FileNotFoundError:
+        print(f"Error: The directory '{data_directory}' does not exist.")
+        return
+    if len(files) < 1:
+        raise FileNotFoundError(f"Error: No language dataset found")
+    for filename in files:
+        file_path = os.path.join(dataset_directory, filename)
+        # Determine language name from the filename
+        language_name = filename.replace(".txt", "")
+        print(f"Processing: {language_name}...")
+        with open(file_path, "r", encoding="utf-8") as f:
+            for line in f:
+                clean_text = line.strip()
+                if clean_text:
+                    text_samples.append(clean_text)
+                    language_labels.append(language_name)
+    # 2. Text to Numeric Vector (Character-level analysis)
+    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 3))
+    feature_matrix = vectorizer.fit_transform(text_samples)
+    # 3. Multiclass Classifier
+    # 'classifier' or 'clf' is a standard name for the model variable
+    classifier = LogisticRegression(max_iter=1000)
+    classifier.fit(feature_matrix, language_labels)
+    # 4. Save the artifacts
+    joblib.dump(classifier, MODEL_FILE)
+    joblib.dump(vectorizer, VECTORIZER_FILE)
+    print(f"Training complete! Languages detected: {classifier.classes_}")
+def predict(text: str):
+    try:
+        classifier = joblib.load(MODEL_FILE)
+        vectorizer = joblib.load(VECTORIZER_FILE)
+        X_new = vectorizer.transform([text])
+        prediction = classifier.predict(X_new)
+        # Get probabilities to see how confident the model is
+        probs = classifier.predict_proba(X_new)
+        confidence = max(probs[0]) * 100
+        print(f"Language: {prediction[0]} ({confidence:.2f}% confidence)")
+    except FileNotFoundError:
+        print("Error: Model files not found. Please train the model first using --train")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Language Detection Tool")
+    # Add arguments
+    parser.add_argument("--train", action="store_true", help="Train the model using the datasets directory")
+    parser.add_argument("--detect", type=str, help="Detect the language of the provided string")
+    parser.add_argument("--dir", type=str, default="datasets", help="Directory for training data (default: datasets)")
+    args = parser.parse_args()
+    if args.train:
+        train(args.dir)
+    elif args.detect:
+        predict(args.detect)
+    else:
+        parser.print_help()