# Valida-Pdf-Mx / code-modelo-mx.py
# arman77mxx's picture
# primera version de codigo, arre-MX
# 1fc6ad4 verified
# De mexico para el mundo, arman77mx@gmail.com
import PyPDF2
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import pickle
# Extract the text of a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text extracted from every page of the PDF at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text of all pages, in page order, joined with newlines.
        A page for which nothing is extracted contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join with a newline so the last word of one page does not fuse with
        # the first word of the next page into a single bogus token.
        # ``or ''`` guards against a None extraction result
        # (TODO confirm whether the installed PyPDF2 version can return None).
        return '\n'.join(page.extract_text() or '' for page in reader.pages)
# Directories of labeled PDFs, keyed by class label
pdf_dirs = {
    'si-firma': '/content/si-firma',
    'no-firma': '/content/no-firma'
}
# Extract the text of every PDF and record it together with its label
data = []
labels = []
for label, pdf_dir in pdf_dirs.items():
    # os.listdir returns entries in arbitrary order; sorting keeps the corpus
    # order (and therefore the seeded train/test split) reproducible.
    for pdf_file in sorted(os.listdir(pdf_dir)):
        # Case-insensitive match so files named e.g. "scan.PDF" are not skipped.
        if not pdf_file.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(pdf_dir, pdf_file)
        text = extract_text_from_pdf(pdf_path)
        # Basic preprocessing: collapse whitespace runs into single spaces
        text = re.sub(r'\s+', ' ', text).strip()
        data.append(text)
        labels.append(label)
# Split the data into training and test sets, then vectorize the texts.
# The raw texts are split BEFORE vectorizing so that the TF-IDF vocabulary and
# IDF weights are learned from the training documents only; fitting on the
# whole corpus would leak test-set statistics into the evaluation.
y = labels
texts_train, texts_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42
)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts_train)
# The test texts are only transformed with the already-fitted vectorizer.
X_test = vectorizer.transform(texts_test)
# Train a linear-kernel SVM on the training split
model = SVC(kernel='linear').fit(X_train, y_train)
# Evaluate the model on the test split and print the classification report
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)
# Save the vectorizer first, then the model, each to its own pickle file
for artifact, pkl_name in ((vectorizer, 'vectorizer.pkl'), (model, 'model.pkl')):
    with open(pkl_name, 'wb') as handle:
        pickle.dump(artifact, handle)