Spaces:
Sleeping
Sleeping
| # De mexico para el mundo, arman77mx@gmail.com | |
| import PyPDF2 | |
| import os | |
| import re | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.svm import SVC | |
| from sklearn.metrics import classification_report | |
| import pickle | |
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The text extracted from each page, joined in page order with no
        separator inserted between pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Build the result with a single join instead of growing a string
        # with ``+=`` once per page. The ``or ''`` substitutes an empty
        # string when a page's extraction result is None/empty, so one such
        # page cannot abort the whole document with a TypeError.
        return ''.join(page.extract_text() or '' for page in reader.pages)
# Labelled PDF directories: class label -> folder holding that class's PDFs.
pdf_dirs = {
    'si-firma': '/content/si-firma',
    'no-firma': '/content/no-firma'
}

# Text extraction and labelling.
data = []    # one whitespace-normalised text per PDF
labels = []  # class label of the text at the same index in ``data``
for label, pdf_dir in pdf_dirs.items():
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sample order identical across runs and machines, so a later seeded
    # train/test split selects the same documents every time.
    for pdf_file in sorted(os.listdir(pdf_dir)):
        # Compare the extension case-insensitively so files named '*.PDF'
        # are not silently skipped.
        if not pdf_file.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(pdf_dir, pdf_file)
        text = extract_text_from_pdf(pdf_path)
        # Basic preprocessing: collapse every whitespace run to one space.
        text = re.sub(r'\s+', ' ', text).strip()
        data.append(text)
        labels.append(label)
# Train/test split. The raw texts are split BEFORE vectorisation so that the
# TF-IDF vocabulary and IDF weights are learned from the training documents
# only; fitting the vectorizer on the full corpus would leak information
# about the held-out documents into the evaluation below.
y = labels
texts_train, texts_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42
)

# Text vectorisation: fit on the training texts, then reuse the fitted
# vectorizer to transform the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Model training.
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Model evaluation on the held-out documents.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
# Persist the vectorizer first and then the model, each to its own pickle
# file in the current working directory.
for artifact_path, artifact in (
    ('vectorizer.pkl', vectorizer),
    ('model.pkl', model),
):
    with open(artifact_path, 'wb') as sink:
        pickle.dump(artifact, sink)