Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*-
"""Proiect.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1TR1Frf0EX4PtFZkLlVdGtMTINqhoQwRw
"""
| # Importarea librariilor | |
| import numpy as np | |
| import pandas as pd # pandas pentru citirea fisierelor | |
| from sklearn import preprocessing | |
| from sklearn import svm # importarea modelului | |
| from sklearn.feature_extraction.text import TfidfVectorizer # modelarea datelor pentru a obtine valori numerice din text | |
| from sklearn.metrics import classification_report, confusion_matrix | |
# Load the data


def _read_tsv(path):
    """Read a headerless, tab-separated file and return its rows as a NumPy array."""
    return pd.read_csv(path, sep='\t', header=None, engine='python').to_numpy()


# In every file only column 1 is used as the payload (the label for the
# *_labels files, the text for the *_samples files).
train_labels = _read_tsv('train_labels.txt')[:, 1]
train_samples = _read_tsv('train_samples.txt')[:, 1]
validation_samples = _read_tsv('validation_samples.txt')[:, 1]
validation_labels = _read_tsv('validation_labels.txt')[:, 1]

# For the test set column 0 is kept as well: it is written out later as the
# `id` column of the submission file.
test_table = _read_tsv('test_samples.txt')
label = test_table[:, 0]
test_samples = test_table[:, 1]
def normalize_data(train_data, test_data, type='l2'):
    """Scale two feature matrices with a scaler fitted on the training data.

    Args:
        train_data: Feature matrix used to fit the scaler; it is also
            transformed and returned.
        test_data: Feature matrix transformed with the scaler that was
            fitted on ``train_data``.
        type: Which scaling to apply. ``'standard'`` uses
            ``preprocessing.StandardScaler``, ``'min_max'`` uses
            ``preprocessing.MinMaxScaler``, and ``'l1'`` / ``'l2'`` use
            ``preprocessing.Normalizer`` with that norm. Defaults to
            ``'l2'``. Any other value disables scaling.

    Returns:
        A ``(scaled_train_data, scaled_test_data)`` tuple. When ``type`` is
        not one of the recognised values, the two inputs are returned
        unchanged (the very same objects).
    """
    # NOTE: the parameter name `type` shadows the builtin; it is kept as-is
    # because callers may pass it by keyword.
    if type == 'standard':
        scaler = preprocessing.StandardScaler()
    elif type == 'min_max':
        scaler = preprocessing.MinMaxScaler()
    elif type in ('l1', 'l2'):
        scaler = preprocessing.Normalizer(norm=type)
    else:
        # Unrecognised scaling type: pass the data through untouched.
        return train_data, test_data

    # Fit on the training data only, then apply the same transform to both.
    scaler.fit(train_data)
    scaled_train_data = scaler.transform(train_data)
    scaled_test_data = scaler.transform(test_data)
    return scaled_train_data, scaled_test_data
# Feature extraction: the TF-IDF vocabulary and weights are learned from the
# training samples only and then applied to the validation and test samples.
vectorizer = TfidfVectorizer()
training_features = vectorizer.fit_transform(train_samples)
validation_features = vectorizer.transform(validation_samples)
testing_features = vectorizer.transform(test_samples)

# Normalisation (default 'l2').
# NOTE: TfidfVectorizer's default output is already L2-normalised per row, so
# with the defaults this step is expected to leave the features unchanged.
norm_train, norm_test = normalize_data(training_features, testing_features)
# Fit on the training features here too, so validation data is scaled exactly
# like the test data; with the row-wise 'l2' normaliser the result is the same
# as fitting on the validation set itself.
_, norm_validation = normalize_data(training_features, validation_features)

# Train the SVM classifier.
# `gamma` is not passed: it only affects the 'rbf', 'poly' and 'sigmoid'
# kernels and has no effect with kernel='linear'.
model_svm = svm.SVC(kernel='linear', C=23)
model_svm.fit(norm_train, train_labels)

# Predict once per data set and reuse the result for every metric.
test_predictions = model_svm.predict(norm_test)
validation_predictions = model_svm.predict(norm_validation)

print("Classification report: ")
print(classification_report(validation_labels, validation_predictions))
print("Confusion matrix: ")
print(confusion_matrix(validation_labels, validation_predictions))

# Export the test predictions as CSV (columns: id, label).
test_export = {'id': label, 'label': test_predictions}
data_f = pd.DataFrame(test_export)
data_f.to_csv('test_submission.csv', index=False)