# Hugging Face Space (scrape residue removed: "Spaces: Sleeping")
| import gradio as gr | |
| import pandas as pd | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import nltk | |
| from nltk.tokenize import word_tokenize | |
| from nltk.corpus import stopwords | |
| from nltk.stem import SnowballStemmer | |
| from datetime import datetime | |
| import re | |
# Download the NLTK resources used below (word_tokenize + Spanish stopwords).
# 'punkt_tab' is additionally required by word_tokenize on NLTK >= 3.8.2;
# quiet=True keeps the download log out of the app's startup output.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
class LiteralEncoder:
    """Assigns codes to free-text literals via multilingual sentence embeddings.

    Trained from a DataFrame whose column 'B' holds the literal text and whose
    column 'C' holds its codes separated by ';'. A new literal receives the
    codes of the most similar training literal when the cosine similarity of
    their embeddings reaches a confidence threshold.
    """

    def __init__(self):
        # Multilingual embedding model (covers Spanish).
        self.model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        self.stemmer = SnowballStemmer('spanish')
        self.stop_words = set(stopwords.words('spanish'))
        # literal -> {'codes': [...], 'processed': str, 'embedding': ndarray}
        self.literal_to_codes = {}
        self.embeddings = {}  # kept for backward compatibility; not used below

    def preprocess_literal(self, text):
        """Normalize a literal for comparison: lowercase, strip punctuation,
        drop Spanish stopwords and stem the remaining tokens."""
        text = str(text).lower().strip()
        text = re.sub(r'[^\w\s]', ' ', text)
        tokens = word_tokenize(text)
        tokens = [self.stemmer.stem(token) for token in tokens
                  if token not in self.stop_words]
        return ' '.join(tokens)

    def train(self, training_df):
        """Index every training literal (column 'B') with its ';'-separated
        codes (column 'C') and precompute one embedding per literal."""
        for _, row in training_df.iterrows():
            literal = str(row['B']).strip()
            # Skip blank/NaN rows so they don't pollute the index
            # (str(nan) == 'nan').
            if not literal or literal.lower() == 'nan':
                continue
            codes = [code.strip() for code in str(row['C']).strip().split(';')]
            self.literal_to_codes[literal] = {
                'codes': codes,
                'processed': self.preprocess_literal(literal),
            }
        if not self.literal_to_codes:
            # Empty training sheet: nothing to embed (model.encode([]) would
            # be wasted work and encode_literal already handles the empty map).
            return
        # Batch-encode all processed literals in one call; dict insertion
        # order guarantees zip pairs each entry with its own embedding.
        processed_literals = [v['processed'] for v in self.literal_to_codes.values()]
        all_embeddings = self.model.encode(processed_literals)
        for data, embedding in zip(self.literal_to_codes.values(), all_embeddings):
            data['embedding'] = embedding

    def encode_literal(self, literal, threshold=0.7):
        """Return the codes of the closest training literal.

        Returns a dict with keys 'codes', 'similarity' and 'matched_literal';
        when the best cosine similarity is below *threshold* (or no training
        data exists) the sentinel
        {'codes': [], 'similarity': 0, 'matched_literal': 'NO_MATCH'}
        is returned instead.
        """
        processed = self.preprocess_literal(literal)
        literal_embedding = self.model.encode([processed])[0]

        best_similarity = 0.0
        best_match = None
        best_codes = []
        for train_literal, data in self.literal_to_codes.items():
            # float() unwraps the numpy scalar so callers get a plain Python
            # number (safe to round/serialize).
            similarity = float(cosine_similarity(
                [literal_embedding],
                [data['embedding']]
            )[0][0])
            if similarity > best_similarity:
                best_similarity = similarity
                best_match = train_literal
                best_codes = data['codes']

        if best_similarity >= threshold:
            return {
                'codes': best_codes,
                'similarity': best_similarity,
                'matched_literal': best_match,
            }
        return {
            'codes': [],
            'similarity': 0,
            'matched_literal': 'NO_MATCH',
        }
def process_excel(training_file, new_file, confidence_threshold=0.7):
    """Code every literal in *new_file* using the examples in *training_file*.

    Parameters:
        training_file: uploaded Excel with columns 'B' (literal) and 'C'
            (codes separated by ';').
        new_file: uploaded Excel with column 'B' (literals to code).
        confidence_threshold: minimum cosine similarity to accept a match.

    Returns the path of the generated results workbook, or an "Error: ..."
    string when anything fails.
    """
    try:
        # Read both uploads (Gradio file objects expose a temp path in .name).
        training_df = pd.read_excel(training_file.name)
        new_df = pd.read_excel(new_file.name)

        # Build and train the encoder.
        encoder = LiteralEncoder()
        encoder.train(training_df)

        # Result columns appended to a copy of the input sheet.
        results_df = new_df.copy()
        results_df['Códigos_Asignados'] = ''
        results_df['Literal_Original'] = ''
        results_df['Score_Similitud'] = 0.0

        # Code each new literal.
        for idx, row in results_df.iterrows():
            result = encoder.encode_literal(str(row['B']), confidence_threshold)
            results_df.at[idx, 'Códigos_Asignados'] = (
                '; '.join(result['codes']) if result['codes']
                else 'SIN_MATCH'
            )
            results_df.at[idx, 'Literal_Original'] = result['matched_literal']
            results_df.at[idx, 'Score_Similitud'] = round(result['similarity'], 3)

        # Summary statistics; guard the percentage against an empty input
        # sheet (total == 0 would otherwise raise ZeroDivisionError).
        total = len(results_df)
        matched = int((results_df['Códigos_Asignados'] != 'SIN_MATCH').sum())
        success_rate = f"{(matched / total * 100):.1f}%" if total else "0.0%"
        stats_df = pd.DataFrame({
            'Métrica': [
                'Total Literales',
                'Literales Codificados',
                'Sin Coincidencia',
                'Porcentaje Éxito'
            ],
            'Valor': [
                total,
                matched,
                total - matched,
                success_rate
            ]
        })

        # Write results, summary and the training data to one workbook.
        output_name = f"codificacion_literales_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
        with pd.ExcelWriter(output_name) as writer:
            results_df.to_excel(writer, sheet_name='Resultados', index=False)
            stats_df.to_excel(writer, sheet_name='Resumen', index=False)
            training_df.to_excel(writer, sheet_name='Datos_Training', index=False)
        return output_name
    except Exception as e:
        # Returned as text: Gradio shows the message in place of the file.
        return f"Error: {str(e)}"
# Gradio interface: two Excel uploads plus a similarity-threshold slider.
iface = gr.Interface(
    fn=process_excel,
    inputs=[
        gr.File(label="Excel con literales de entrenamiento (B: literales, C: códigos)"),
        gr.File(label="Excel con nuevos literales a codificar"),
        gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.7,
            label="Umbral de confianza (0-1)"
        )
    ],
    outputs=gr.File(label="Excel con resultados"),
    title="Codificador Automático de Literales",
    description=(
        "Codifica automáticamente literales basándose en ejemplos previos. "
        "Los códigos múltiples deben estar separados por punto y coma (;) "
        "en la columna C."
    )
)

if __name__ == "__main__":
    iface.launch()