# -*- coding: utf-8 -*-
"""Proiect.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1TR1Frf0EX4PtFZkLlVdGtMTINqhoQwRw
"""

# Script overview: load tab-separated text samples and labels, turn the text
# into TF-IDF features, train a linear SVM, report its quality on the
# validation split and export the test-set predictions to a CSV file.

# Library imports
import numpy as np
import pandas as pd  # pandas is used to read the input files
from sklearn import preprocessing
from sklearn import svm  # the classification model
from sklearn.feature_extraction.text import TfidfVectorizer  # turns text into numeric features
from sklearn.metrics import classification_report, confusion_matrix


def _read_tsv(path):
    """Read a header-less, tab-separated file and return its rows as a NumPy array."""
    return pd.read_csv(path, sep='\t', header=None, engine='python').to_numpy()


# Loading the data. Every file is read as two columns; only the second
# column (index 1) is kept for the train/validation files.
train_labels = _read_tsv('train_labels.txt')[:, 1]  # keep only the labels
train_samples = _read_tsv('train_samples.txt')[:, 1]  # keep only the text
validation_samples = _read_tsv('validation_samples.txt')[:, 1]  # keep only the text
validation_labels = _read_tsv('validation_labels.txt')[:, 1]  # keep only the labels

test_samples = _read_tsv('test_samples.txt')
label = test_samples[:, 0]  # first column, exported below as the 'id' column
test_samples = test_samples[:, 1]  # keep only the text


def normalize_data(train_data, test_data, type='l2'):
    """Scale ``train_data`` and ``test_data`` with a scaler fitted on ``train_data``.

    Args:
        train_data: Feature matrix used both to fit the scaler and to be scaled.
        test_data: Feature matrix scaled with the statistics of ``train_data``.
        type: Normalization to apply: ``'standard'`` (StandardScaler),
            ``'min_max'`` (MinMaxScaler), ``'l1'`` or ``'l2'`` (Normalizer with
            that norm). Defaults to ``'l2'``. The name shadows the ``type``
            builtin but is kept so existing keyword callers keep working.

    Returns:
        A ``(scaled_train_data, scaled_test_data)`` tuple. For any other value
        of ``type`` both inputs are returned unchanged.
    """
    if type == 'standard':
        scaler = preprocessing.StandardScaler()
    elif type == 'min_max':
        scaler = preprocessing.MinMaxScaler()
    elif type in ('l1', 'l2'):
        scaler = preprocessing.Normalizer(norm=type)
    else:
        # Unknown normalization type: pass the data through untouched.
        return train_data, test_data

    scaler.fit(train_data)
    return scaler.transform(train_data), scaler.transform(test_data)


# Feature extraction: the vocabulary and IDF weights are learned on the
# training text only and then reused for the validation and test text.
vectorizer = TfidfVectorizer()
training_features = vectorizer.fit_transform(train_samples)
validation_features = vectorizer.transform(validation_samples)
testing_features = vectorizer.transform(test_samples)

# Normalization. The scaler is always fitted on the training features so that
# validation and test data are scaled consistently with what the model saw.
# For the default per-row 'l2' normalization this yields the same values as
# fitting on the validation features themselves.
norm_train, norm_test = normalize_data(training_features, testing_features)
_, norm_validation = normalize_data(training_features, validation_features)

# SVM model. `gamma` is not passed because it only affects the 'rbf', 'poly'
# and 'sigmoid' kernels and has no effect on a linear kernel.
model_svm = svm.SVC(kernel='linear', C=23)  # define the model
model_svm.fit(norm_train, train_labels)  # training
test_predictions = model_svm.predict(norm_test)  # predictions on the test data

# Evaluate on the validation split; predict once and reuse for both reports.
validation_predictions = model_svm.predict(norm_validation)
print("Classification report: ")
print(classification_report(validation_labels, validation_predictions))
print("Confusion matrix: ")
print(confusion_matrix(validation_labels, validation_predictions))

# Export the test predictions in CSV format
test_export = {'id': label, 'label': test_predictions}
data_f = pd.DataFrame(test_export)
data_f.to_csv('test_submission.csv', index=False)