File size: 1,726 Bytes
1fc6ad4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# De mexico para el mundo, arman77mx@gmail.com
import PyPDF2
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import pickle

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        A single string made of each page's extracted text, in page order,
        with nothing inserted between pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string with += page by page.
        # A page that yields no text (None or '') contributes nothing
        # rather than raising TypeError on concatenation.
        return ''.join(page.extract_text() or '' for page in reader.pages)

# Labelled PDF directories: class label -> folder holding that class's PDFs
pdf_dirs = {
    'si-firma': '/content/si-firma',
    'no-firma': '/content/no-firma'
}

# Text extraction and labelling: `data` holds one cleaned text per PDF and
# `labels` holds the matching class label at the same index.
data = []
labels = []

for label, pdf_dir in pdf_dirs.items():
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sample order (and therefore the seeded train/test split) reproducible.
    for pdf_file in sorted(os.listdir(pdf_dir)):
        # Match the extension case-insensitively so '.PDF' files count too.
        if not pdf_file.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(pdf_dir, pdf_file)
        text = extract_text_from_pdf(pdf_path)
        # Basic preprocessing: collapse whitespace runs into single spaces
        text = re.sub(r'\s+', ' ', text).strip()
        data.append(text)
        labels.append(label)

# Train/test split on the raw texts, done BEFORE vectorizing so that the
# TF-IDF vocabulary and IDF weights are learned from training documents only
# (fitting on the full corpus leaks test-set statistics into the features).
texts_train, texts_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=42
)

# Text vectorization: fit on the training texts, then reuse that fitted
# vocabulary to transform the held-out texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Full-corpus features and labels, kept under their original names.
X = vectorizer.transform(data)
y = labels

# Model training
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Model evaluation on the held-out split
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Persist the fitted vectorizer first, then the trained model, each to its
# own pickle file in the current working directory.
for pickle_name, artifact in (('vectorizer.pkl', vectorizer), ('model.pkl', model)):
    with open(pickle_name, 'wb') as file:
        pickle.dump(artifact, file)