Isa0 committed on
Commit
ca5bd57
·
1 Parent(s): 455cb0d

feat: create main.py with training and using commands

Browse files
Files changed (1) hide show
  1. main.py +86 -0
main.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ from sklearn.feature_extraction.text import CountVectorizer
4
+ from sklearn.linear_model import LogisticRegression
5
+ import joblib
6
+
7
# Path of the serialized classifier: written by train(), read by predict().
MODEL_FILE = 'lang_classifier.pkl'
# Path of the serialized vectorizer: written by train(), read by predict().
VECTORIZER_FILE = 'lang_vectorizer.pkl'
9
+
10
def train(dataset_directory="datasets"):
    """Train the language classifier from per-language text files and save it.

    Every file in ``dataset_directory`` whose name ends in ``.txt`` is read
    line by line. Each non-blank line (after stripping surrounding
    whitespace) becomes one training sample, labelled with the file name
    minus its ``.txt`` suffix. A character n-gram ``CountVectorizer`` and a
    ``LogisticRegression`` model are fitted on those samples and written to
    ``VECTORIZER_FILE`` and ``MODEL_FILE`` with ``joblib.dump``.

    Args:
        dataset_directory: Directory holding one ``.txt`` file per language.

    Returns:
        None. If the directory does not exist, an error message is printed
        and the function returns without training.

    Raises:
        FileNotFoundError: If the directory exists but contains no ``.txt``
            files.
    """
    suffix = '.txt'
    text_samples = []
    language_labels = []

    # 1. Automatically scan the directory for .txt files
    try:
        files = [f for f in os.listdir(dataset_directory) if f.endswith(suffix)]
    except FileNotFoundError:
        # Report the directory that was actually requested by the caller.
        print(f"Error: The directory '{dataset_directory}' does not exist.")
        return

    if not files:
        raise FileNotFoundError("Error: No language dataset found")

    for filename in files:
        file_path = os.path.join(dataset_directory, filename)

        # The label is the file name without its extension. Only the trailing
        # suffix is removed so names that contain ".txt" elsewhere stay intact.
        language_name = filename[:-len(suffix)]

        print(f"Processing: {language_name}...")

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                clean_text = line.strip()
                if clean_text:
                    text_samples.append(clean_text)
                    language_labels.append(language_name)

    # 2. Text to numeric vector (character-level 1- to 3-grams)
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 3))
    feature_matrix = vectorizer.fit_transform(text_samples)

    # 3. Multiclass classifier
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(feature_matrix, language_labels)

    # 4. Save the artifacts so predict() can load them later
    joblib.dump(classifier, MODEL_FILE)
    joblib.dump(vectorizer, VECTORIZER_FILE)

    print(f"Training complete! Languages detected: {classifier.classes_}")
54
+
55
def predict(text: str):
    """Print the predicted language of ``text`` and the model's confidence.

    The classifier and vectorizer are loaded from ``MODEL_FILE`` and
    ``VECTORIZER_FILE``. The text is vectorized, classified, and the
    predicted label is printed together with the highest class probability
    expressed as a percentage. If a ``FileNotFoundError`` is raised, a
    message asking the user to train the model first is printed instead.

    Args:
        text: The string whose language should be detected.
    """
    try:
        model = joblib.load(MODEL_FILE)
        char_vectorizer = joblib.load(VECTORIZER_FILE)

        features = char_vectorizer.transform([text])
        predicted_labels = model.predict(features)

        # The largest class probability for the single input row is
        # reported as the confidence of the prediction.
        class_probabilities = model.predict_proba(features)[0]
        confidence_pct = max(class_probabilities) * 100

        print(f"Language: {predicted_labels[0]} ({confidence_pct:.2f}% confidence)")
    except FileNotFoundError:
        print("Error: Model files not found. Please train the model first using --train")
70
+
71
if __name__ == "__main__":
    # Command-line entry point: --train fits and saves the model,
    # --detect classifies a string, and no option prints the usage text.
    parser = argparse.ArgumentParser(description="Language Detection Tool")

    parser.add_argument(
        "--train",
        action="store_true",
        help="Train the model using the datasets directory",
    )
    parser.add_argument(
        "--detect",
        type=str,
        help="Detect the language of the provided string",
    )
    parser.add_argument(
        "--dir",
        type=str,
        default="datasets",
        help="Directory for training data (default: datasets)",
    )

    args = parser.parse_args()

    # --train takes precedence over --detect when both are given.
    if not args.train and not args.detect:
        parser.print_help()
    elif args.train:
        train(args.dir)
    else:
        predict(args.detect)