Spaces:
Sleeping
Sleeping
| import os | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.preprocessing import MultiLabelBinarizer | |
| from sklearn.metrics import classification_report | |
| import pandas as pd | |
| import nltk | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import StandardScaler | |
| #nltk.download('punkt_tab') | |
def load_data(base_path, max_files=1000):
    """
    Load text files from per-category directories and split them into sentences.

    Every immediate sub-directory of ``base_path`` is treated as one category
    named after the directory. Each ``.txt`` file inside it is read as UTF-8,
    split into sentences with ``nltk.sent_tokenize`` and every sentence is
    labelled with that category. Non-directory entries in ``base_path`` and
    files without a ``.txt`` suffix are ignored.

    NOTE(review): ``nltk.sent_tokenize`` is called with its default settings;
    it presumably needs NLTK's tokenizer data to be installed beforehand
    (the file carries a commented-out ``nltk.download('punkt_tab')``).

    Args:
        base_path: Directory whose sub-directories are the categories.
        max_files: Maximum number of ``.txt`` files read per category.
            ``None`` disables the limit; ``0`` or a negative value reads
            nothing.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists. ``texts[i]``
        is one sentence and ``labels[i]`` is the single-element list
        ``[category]`` (the multi-label format expected by
        ``MultiLabelBinarizer``).
    """
    texts = []
    labels = []

    # os.listdir() yields entries in arbitrary order. Sorting makes both the
    # sample order and the subset kept by ``max_files`` reproducible.
    for category in sorted(os.listdir(base_path)):
        category_path = os.path.join(base_path, category)
        if not os.path.isdir(category_path):
            continue

        txt_files = sorted(
            name for name in os.listdir(category_path) if name.endswith('.txt')
        )
        for file_count, filename in enumerate(txt_files):
            # Stop once the per-category quota is reached (None = unlimited).
            if max_files is not None and file_count >= max_files:
                break

            file_path = os.path.join(category_path, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            sentences = nltk.sent_tokenize(content)
            texts.extend(sentences)
            # Create multi-label format: one label list per sentence.
            labels.extend([category] for _ in sentences)

    return texts, labels
class IndependentMultiLabelClassifier:
    """
    Text classifier that scores every category with its own binary model.

    Texts are turned into TF-IDF features (at most 5000 terms) and one
    ``LogisticRegression`` is trained per category found in the labels.
    Because each model is trained separately, the scores returned by
    ``predict_proba`` are independent of each other and need not sum to 1.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.mlb = MultiLabelBinarizer()
        # Maps category name -> fitted binary LogisticRegression.
        self.classifiers = {}
        # Category names seen during fit(); empty until fit() is called.
        self.categories = []

    def fit(self, X, y):
        """
        Fit the vectorizer and train one binary classifier per category.

        Args:
            X: Iterable of raw text documents.
            y: Iterable of label collections, one per document (e.g.
                ``[["sport"], ["politics"], ...]``).

        Returns:
            ``self``, so calls can be chained.

        Side effects:
            Prints a ``classification_report`` for every category. The report
            is computed on the training data itself, so it measures fit, not
            generalisation.
        """
        # Discard models from any earlier fit: they were trained against the
        # previous vocabulary and label set and must not leak into predictions.
        self.classifiers = {}

        # Transform text features
        X_transformed = self.vectorizer.fit_transform(X)

        # Transform labels and get all possible categories
        y_transformed = self.mlb.fit_transform(y)
        self.categories = self.mlb.classes_

        # Train a binary classifier for each category
        for i, category in enumerate(self.categories):
            print(f"\nTraining classifier for: {category}")
            # class_weight='balanced' compensates for the one-vs-rest skew
            # (each category is positive for only part of the samples).
            clf = LogisticRegression(max_iter=1000, class_weight='balanced')
            y_binary = y_transformed[:, i]
            clf.fit(X_transformed, y_binary)
            self.classifiers[category] = clf

            # Print performance metrics (measured on the training set)
            y_pred = clf.predict(X_transformed)
            print(classification_report(y_binary, y_pred))

        return self

    def predict_proba(self, X):
        """
        Score the first document of ``X`` against every trained category.

        Args:
            X: Iterable of raw text documents. Only ``X[0]`` is scored; any
                further documents are vectorized but their scores are
                discarded.

        Returns:
            A list of ``(category, probability)`` tuples sorted by descending
            probability, where ``probability`` is the positive-class
            probability from that category's classifier.
        """
        # Transform text
        X_transformed = self.vectorizer.transform(X)

        # Get independent probabilities for each category
        predictions = []
        for category, clf in self.classifiers.items():
            # Row 0 = first document, column 1 = positive class.
            prob = clf.predict_proba(X_transformed)[0][1]
            predictions.append((category, prob))

        return sorted(predictions, key=lambda x: x[1], reverse=True)
# Demo: train on the Uzbek news corpus and score one sample headline.
if __name__ == "__main__":
    dataset_root = "/content/extracted_files/Uzbek_News_Dataset"

    # Read the corpus as (sentence, [category]) pairs.
    print("Loading data...")
    texts, labels = load_data(dataset_root)

    # Fit one binary model per category.
    print("Training independent classifiers...")
    classifier = IndependentMultiLabelClassifier()
    classifier.fit(texts, labels)

    # Score a single unseen sentence and list categories from most to
    # least likely, as percentages.
    test_text = "Amerikada zilzila sodir bo'ldi"
    ranked_scores = classifier.predict_proba([test_text])

    print(f"\nIndependent prediction scores for: {test_text}")
    for category, prob in ranked_scores:
        print(f"{category}: {prob*100:.1f}%")