# From Mexico to the world, arman77mx@gmail.com
"""Train a text classifier on labelled PDF files.

The script reads every PDF found in the directories listed in ``pdf_dirs``,
extracts its text, turns the texts into TF-IDF features, fits a linear-kernel
SVC on an 80/20 train/test split, prints a classification report for the
held-out part, and pickles both the fitted vectorizer and the fitted model
into the current working directory (``vectorizer.pkl`` and ``model.pkl``).
"""

import os
import pickle
import re

import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string holding the text of all pages, in page order, with
        nothing inserted between pages.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string page by page; ``or ''`` keeps
        # the join safe should a page yield None instead of a string.
        return ''.join(page.extract_text() or '' for page in reader.pages)


# Directories of labelled PDFs: each key is used as the class label for every
# PDF found in the matching directory.
pdf_dirs = {
    'si-firma': '/content/si-firma',
    'no-firma': '/content/no-firma'
}

# Text extraction and labelling.
data = []
labels = []

for label, pdf_dir in pdf_dirs.items():
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sample order (and therefore the seeded split below) reproducible.
    for pdf_file in sorted(os.listdir(pdf_dir)):
        # Case-insensitive so that files named "*.PDF" are not skipped.
        if pdf_file.lower().endswith('.pdf'):
            pdf_path = os.path.join(pdf_dir, pdf_file)
            text = extract_text_from_pdf(pdf_path)
            # Basic preprocessing: collapse whitespace runs to one space.
            text = re.sub(r'\s+', ' ', text).strip()
            data.append(text)
            labels.append(label)

# Text vectorization.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)
y = labels

# Split into training and test sets (20 % held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model training.
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Model evaluation on the held-out set.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the vectorizer.
with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

# Save the model.
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)