codewraith / data /source_files /messy /49cbaee2c42e.py
slenk's picture
Upload folder using huggingface_hub
eeef81e verified
# -*- coding: utf-8 -*-
"""Proiect.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1TR1Frf0EX4PtFZkLlVdGtMTINqhoQwRw
"""
# Importarea librariilor
import numpy as np
import pandas as pd # pandas pentru citirea fisierelor
from sklearn import preprocessing
from sklearn import svm # importarea modelului
from sklearn.feature_extraction.text import TfidfVectorizer # modelarea datelor pentru a obtine valori numerice din text
from sklearn.metrics import classification_report, confusion_matrix
# Incarcarea datelor
train_labels = pd.read_csv('train_labels.txt', sep='\t', header=None, engine='python')
train_labels = train_labels.to_numpy() # convertim data frame-ul intr-un vector
train_labels = train_labels[:,1] # pastram doar etichetele
train_samples = pd.read_csv('train_samples.txt', sep='\t', header=None, engine='python')
train_samples = train_samples.to_numpy()
train_samples = train_samples[:,1] # pastram doar cuvintele
validation_samples = pd.read_csv('validation_samples.txt', sep='\t', header=None, engine='python')
validation_samples = validation_samples.to_numpy()
validation_samples = validation_samples[:,1] # salvam cuvintele
validation_labels = pd.read_csv('validation_labels.txt', sep='\t', header=None, engine='python')
validation_labels = validation_labels.to_numpy()
validation_labels = validation_labels[:,1] # pastram doar etichetele
test_samples = pd.read_csv('test_samples.txt', sep='\t', header=None, engine='python')
test_samples = test_samples.to_numpy()
label = test_samples[:,0] # salvam etichetele
test_samples = test_samples[:,1] # salvam cuvintele
def normalize_data(train_data, test_data, type='l2'): # functia care intoarce datele normalizate
#tipul de normalizare este setat implicit la l2
scaler = None
if type == 'standard':
scaler = preprocessing.StandardScaler()
elif type == 'min_max':
scaler = preprocessing.MinMaxScaler()
elif type == 'l1' or type == 'l2':
scaler = preprocessing.Normalizer(norm = type)
if scaler is not None:
scaler.fit(train_data)
scaled_train_data = scaler.transform(train_data)
scaled_test_data = scaler.transform(test_data)
return scaled_train_data, scaled_test_data
else:
return train_data, test_data
# Modelarea datelor
vectorizer = TfidfVectorizer()
training_features = vectorizer.fit_transform(train_samples)
validation_features = vectorizer.transform(validation_samples)
testing_features = vectorizer.transform(test_samples)
# Normalizarea datelor
norm_train, norm_test = normalize_data(training_features, testing_features)
norm_validation, _ = normalize_data(validation_features, validation_features)
# Aplicam modelul SVM
model_svm = svm.SVC(kernel='linear', C=23, gamma=110) # definim modelul
model_svm.fit(norm_train, train_labels) # procesul de invatare
test_predictions = model_svm.predict(norm_test) # predictie pe datele de test
print("Classification report: ")
print(classification_report(validation_labels, model_svm.predict(norm_validation)))
print("Confusion matrix: ")
print(confusion_matrix(validation_labels, model_svm.predict(norm_validation)))
# Exportarea datelor in format CSV
test_export = {'id':label,'label':test_predictions}
data_f = pd.DataFrame(test_export)
data_f.to_csv('test_submission.csv',index=False)