codificacionNLP / app.py
Rajor78's picture
Create app.py
9e46923 verified
raw
history blame
5.99 kB
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from datetime import datetime
import re
# Download the NLTK resources used by LiteralEncoder.preprocess_literal.
# NOTE: word_tokenize on NLTK >= 3.8.2 requires 'punkt_tab' in addition to
# 'punkt'; without it the first tokenization raises LookupError. On older
# NLTK versions the extra download is harmless (it just logs a warning).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
class LiteralEncoder:
    """Assigns codes to free-text "literals" by semantic similarity.

    Trained from example (literal, codes) pairs taken from an Excel sheet;
    new literals are matched against the training examples using sentence
    embeddings and cosine similarity.
    """

    def __init__(self):
        # Multilingual embedding model (covers Spanish).
        self.model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        self.stemmer = SnowballStemmer('spanish')
        self.stop_words = set(stopwords.words('spanish'))
        # literal -> {'codes': [...], 'processed': str, 'embedding': ndarray}
        self.literal_to_codes = {}
        # Kept for interface compatibility; not used internally.
        self.embeddings = {}

    def preprocess_literal(self, text):
        """Normalize a literal for comparison: lowercase, strip punctuation,
        drop Spanish stop words and stem the remaining tokens."""
        text = str(text).lower().strip()
        text = re.sub(r'[^\w\s]', ' ', text)
        tokens = word_tokenize(text)
        tokens = [self.stemmer.stem(token) for token in tokens
                  if token not in self.stop_words]
        return ' '.join(tokens)

    def train(self, training_df):
        """Index the training examples.

        Expects column 'B' (literal text) and column 'C' (codes separated
        by ';'). Rows with a missing literal or missing codes are skipped
        instead of being indexed as the string 'nan'.
        """
        for _, row in training_df.iterrows():
            # Guard against blank cells: str(NaN) would silently index 'nan'.
            if pd.isna(row['B']) or pd.isna(row['C']):
                continue
            literal = str(row['B']).strip()
            codes = [code.strip() for code in str(row['C']).strip().split(';')]
            self.literal_to_codes[literal] = {
                'codes': codes,
                'processed': self.preprocess_literal(literal),
            }
        # Encode every processed literal in one batch call (much faster
        # than one model.encode call per row).
        processed_literals = [v['processed'] for v in self.literal_to_codes.values()]
        if processed_literals:  # model.encode([]) is not safe on an empty sheet
            all_embeddings = self.model.encode(processed_literals)
            for data, embedding in zip(self.literal_to_codes.values(), all_embeddings):
                data['embedding'] = embedding

    def encode_literal(self, literal, threshold=0.7):
        """Return the best-matching training codes for *literal*.

        Returns a dict with keys 'codes', 'similarity' and
        'matched_literal'. When no training example reaches *threshold*
        (or no training data exists) 'codes' is empty, 'similarity' is 0
        and 'matched_literal' is 'NO_MATCH'.
        """
        no_match = {'codes': [], 'similarity': 0, 'matched_literal': 'NO_MATCH'}
        if not self.literal_to_codes:
            return no_match

        processed = self.preprocess_literal(literal)
        literal_embedding = self.model.encode([processed])[0]

        # One vectorized similarity computation against all training
        # embeddings instead of a Python loop of 1x1 comparisons.
        train_items = list(self.literal_to_codes.items())
        matrix = [data['embedding'] for _, data in train_items]
        sims = cosine_similarity([literal_embedding], matrix)[0]

        # max() returns the first maximum, matching the original loop's
        # strict '>' tie-breaking (earliest training row wins on ties).
        best_idx = max(range(len(sims)), key=sims.__getitem__)
        best_similarity = float(sims[best_idx])  # plain float, not a numpy scalar

        if best_similarity >= threshold:
            best_literal, best_data = train_items[best_idx]
            return {
                'codes': best_data['codes'],
                'similarity': best_similarity,
                'matched_literal': best_literal,
            }
        return no_match
def process_excel(training_file, new_file, confidence_threshold=0.7):
    """Code the literals of *new_file* using the examples in *training_file*.

    Both arguments are Excel uploads (Gradio file objects or plain paths).
    The training sheet must have column 'B' (literals) and column 'C'
    (';'-separated codes); the new sheet must have column 'B'.

    Returns the name of the generated .xlsx workbook (sheets: Resultados,
    Resumen, Datos_Training), or an "Error: ..." string on failure — the
    contract the Gradio interface expects.
    """
    try:
        # Accept Gradio file objects (which expose .name) or plain paths.
        training_df = pd.read_excel(getattr(training_file, 'name', training_file))
        new_df = pd.read_excel(getattr(new_file, 'name', new_file))

        encoder = LiteralEncoder()
        encoder.train(training_df)

        results_df = new_df.copy()
        results_df['Códigos_Asignados'] = ''
        results_df['Literal_Original'] = ''
        results_df['Score_Similitud'] = 0.0

        for idx, row in results_df.iterrows():
            result = encoder.encode_literal(str(row['B']), confidence_threshold)
            results_df.at[idx, 'Códigos_Asignados'] = (
                '; '.join(result['codes']) if result['codes']
                else 'SIN_MATCH'
            )
            results_df.at[idx, 'Literal_Original'] = result['matched_literal']
            results_df.at[idx, 'Score_Similitud'] = round(result['similarity'], 3)

        # Summary statistics. Guard the percentage against an empty file
        # (the original raised ZeroDivisionError when total == 0).
        total = len(results_df)
        matched = int((results_df['Códigos_Asignados'] != 'SIN_MATCH').sum())
        pct = f"{(matched / total * 100):.1f}%" if total else "0.0%"
        stats_df = pd.DataFrame({
            'Métrica': [
                'Total Literales',
                'Literales Codificados',
                'Sin Coincidencia',
                'Porcentaje Éxito',
            ],
            'Valor': [total, matched, total - matched, pct],
        })

        # Timestamped output name avoids clobbering previous runs.
        output_name = f"codificacion_literales_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
        with pd.ExcelWriter(output_name) as writer:
            results_df.to_excel(writer, sheet_name='Resultados', index=False)
            stats_df.to_excel(writer, sheet_name='Resumen', index=False)
            training_df.to_excel(writer, sheet_name='Datos_Training', index=False)
        return output_name
    except Exception as e:
        # Surface the error to the UI instead of crashing the app.
        return f"Error: {str(e)}"
# Gradio UI: two Excel uploads plus a confidence slider; the output is
# the generated results workbook. Label/title text repaired: the scraped
# source had the UTF-8 Spanish characters mis-decoded (mojibake).
iface = gr.Interface(
    fn=process_excel,
    inputs=[
        gr.File(label="Excel con literales de entrenamiento (B: literales, C: códigos)"),
        gr.File(label="Excel con nuevos literales a codificar"),
        gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.7,
            label="Umbral de confianza (0-1)"
        )
    ],
    outputs=gr.File(label="Excel con resultados"),
    title="Codificador Automático de Literales",
    description="Codifica automáticamente literales basándose en ejemplos previos. Los códigos múltiples deben estar separados por punto y coma (;) en la columna C."
)
# Launch the Gradio server only when this file is run as a script
# (not when imported, e.g. by the Spaces runtime).
if __name__ == "__main__":
    iface.launch()